diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7099 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.04455895725977569, + "eval_steps": 500, + "global_step": 882, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.05203597049611e-05, + "grad_norm": 106.57671356201172, + "learning_rate": 0.0, + "loss": 2.4177, + "num_tokens": 3037.0, + "step": 1 + }, + { + "epoch": 0.0001010407194099222, + "grad_norm": 105.93892669677734, + "learning_rate": 7.407407407407407e-07, + "loss": 2.5252, + "num_tokens": 5552.0, + "step": 2 + }, + { + "epoch": 0.0001515610791148833, + "grad_norm": 128.10116577148438, + "learning_rate": 1.4814814814814815e-06, + "loss": 2.8326, + "num_tokens": 7987.0, + "step": 3 + }, + { + "epoch": 0.0002020814388198444, + "grad_norm": 102.48262786865234, + "learning_rate": 2.222222222222222e-06, + "loss": 2.4182, + "num_tokens": 11157.0, + "step": 4 + }, + { + "epoch": 0.0002526017985248055, + "grad_norm": 118.26165008544922, + "learning_rate": 2.962962962962963e-06, + "loss": 2.5241, + "num_tokens": 13744.0, + "step": 5 + }, + { + "epoch": 0.0003031221582297666, + "grad_norm": 124.30136108398438, + "learning_rate": 3.7037037037037037e-06, + "loss": 2.7998, + "num_tokens": 15722.0, + "step": 6 + }, + { + "epoch": 0.0003536425179347277, + "grad_norm": 132.9451141357422, + "learning_rate": 4.444444444444444e-06, + "loss": 2.8019, + "num_tokens": 17322.0, + "step": 7 + }, + { + "epoch": 0.0004041628776396888, + "grad_norm": 86.50129699707031, + "learning_rate": 5.185185185185185e-06, + "loss": 2.1519, + "num_tokens": 20140.0, + "step": 8 + }, + { + "epoch": 0.0004546832373446499, + "grad_norm": 94.24445343017578, + "learning_rate": 5.925925925925926e-06, + "loss": 2.2846, + "num_tokens": 22792.0, + "step": 9 + }, + { + "epoch": 0.000505203597049611, + "grad_norm": 115.1807861328125, + "learning_rate": 6.666666666666667e-06, + "loss": 2.3938, + "num_tokens": 25602.0, + "step": 10 + }, + { + "epoch": 0.0005557239567545721, + "grad_norm": 99.71337127685547, + "learning_rate": 7.4074074074074075e-06, + "loss": 2.4838, + "num_tokens": 27875.0, + "step": 11 + }, + { + "epoch": 0.0006062443164595332, + "grad_norm": 101.07054138183594, + "learning_rate": 8.148148148148148e-06, + "loss": 2.1364, + "num_tokens": 30333.0, + "step": 12 + }, + { + "epoch": 0.0006567646761644943, + "grad_norm": 98.68280029296875, + "learning_rate": 8.888888888888888e-06, + "loss": 1.8957, + "num_tokens": 33154.0, + "step": 13 + }, + { + "epoch": 0.0007072850358694554, + "grad_norm": 86.39786529541016, + "learning_rate": 9.62962962962963e-06, + "loss": 2.1925, + "num_tokens": 35379.0, + "step": 14 + }, + { + "epoch": 0.0007578053955744165, + "grad_norm": 90.54401397705078, + "learning_rate": 1.037037037037037e-05, + "loss": 1.9464, + "num_tokens": 37698.0, + "step": 15 + }, + { + "epoch": 0.0008083257552793776, + "grad_norm": 86.97708129882812, + "learning_rate": 1.1111111111111113e-05, + "loss": 2.1574, + "num_tokens": 39390.0, + "step": 16 + }, + { + "epoch": 0.0008588461149843387, + "grad_norm": 65.81087493896484, + "learning_rate": 1.1851851851851852e-05, + "loss": 1.7317, + "num_tokens": 42146.0, + "step": 17 + }, + { + "epoch": 0.0009093664746892998, + "grad_norm": 65.27281188964844, + "learning_rate": 1.2592592592592593e-05, + "loss": 1.6127, + "num_tokens": 44536.0, + "step": 18 + }, + { + "epoch": 0.0009598868343942609, + "grad_norm": 44.22801208496094, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4297, + "num_tokens": 47573.0, + "step": 19 + }, + { + "epoch": 0.001010407194099222, + "grad_norm": 37.38533401489258, + "learning_rate": 1.4074074074074075e-05, + "loss": 1.7198, + "num_tokens": 50367.0, + "step": 20 + }, + { + "epoch": 0.001060927553804183, + "grad_norm": 41.71638870239258, + "learning_rate": 1.4814814814814815e-05, + "loss": 1.6402, + "num_tokens": 53037.0, + "step": 21 + }, + { + "epoch": 0.0011114479135091442, + "grad_norm": 27.242555618286133, + "learning_rate": 1.555555555555556e-05, + "loss": 1.4327, + "num_tokens": 56040.0, + "step": 22 + }, + { + "epoch": 0.0011619682732141052, + "grad_norm": 40.86289596557617, + "learning_rate": 1.6296296296296297e-05, + "loss": 1.652, + "num_tokens": 58073.0, + "step": 23 + }, + { + "epoch": 0.0012124886329190664, + "grad_norm": 38.95692825317383, + "learning_rate": 1.7037037037037038e-05, + "loss": 1.4338, + "num_tokens": 60020.0, + "step": 24 + }, + { + "epoch": 0.0012630089926240274, + "grad_norm": 25.330848693847656, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.3578, + "num_tokens": 62486.0, + "step": 25 + }, + { + "epoch": 0.0013135293523289886, + "grad_norm": 20.6157283782959, + "learning_rate": 1.851851851851852e-05, + "loss": 1.434, + "num_tokens": 64858.0, + "step": 26 + }, + { + "epoch": 0.0013640497120339496, + "grad_norm": 18.20408058166504, + "learning_rate": 1.925925925925926e-05, + "loss": 1.3757, + "num_tokens": 67729.0, + "step": 27 + }, + { + "epoch": 0.0014145700717389108, + "grad_norm": 20.01255989074707, + "learning_rate": 2e-05, + "loss": 1.2695, + "num_tokens": 70572.0, + "step": 28 + }, + { + "epoch": 0.0014650904314438718, + "grad_norm": 18.932086944580078, + "learning_rate": 1.9976608187134504e-05, + "loss": 1.3769, + "num_tokens": 73581.0, + "step": 29 + }, + { + "epoch": 0.001515610791148833, + "grad_norm": 15.81911563873291, + "learning_rate": 1.9953216374269007e-05, + "loss": 1.2035, + "num_tokens": 77028.0, + "step": 30 + }, + { + "epoch": 0.001566131150853794, + "grad_norm": 15.926407814025879, + "learning_rate": 1.992982456140351e-05, + "loss": 1.056, + "num_tokens": 79946.0, + "step": 31 + }, + { + "epoch": 0.0016166515105587552, + "grad_norm": 16.422443389892578, + "learning_rate": 1.9906432748538015e-05, + "loss": 1.2585, + "num_tokens": 82642.0, + "step": 32 + }, + { + "epoch": 0.0016671718702637162, + "grad_norm": 18.60963249206543, + "learning_rate": 1.9883040935672515e-05, + "loss": 1.4249, + "num_tokens": 84826.0, + "step": 33 + }, + { + "epoch": 0.0017176922299686774, + "grad_norm": 15.074679374694824, + "learning_rate": 1.9859649122807017e-05, + "loss": 1.2998, + "num_tokens": 87645.0, + "step": 34 + }, + { + "epoch": 0.0017682125896736384, + "grad_norm": 16.477188110351562, + "learning_rate": 1.9836257309941523e-05, + "loss": 1.0935, + "num_tokens": 90082.0, + "step": 35 + }, + { + "epoch": 0.0018187329493785996, + "grad_norm": 16.847627639770508, + "learning_rate": 1.9812865497076026e-05, + "loss": 1.1099, + "num_tokens": 92173.0, + "step": 36 + }, + { + "epoch": 0.0018692533090835606, + "grad_norm": 13.956262588500977, + "learning_rate": 1.9789473684210528e-05, + "loss": 1.2913, + "num_tokens": 95185.0, + "step": 37 + }, + { + "epoch": 0.0019197736687885218, + "grad_norm": 14.99960708618164, + "learning_rate": 1.976608187134503e-05, + "loss": 1.2391, + "num_tokens": 97845.0, + "step": 38 + }, + { + "epoch": 0.001970294028493483, + "grad_norm": 14.234345436096191, + "learning_rate": 1.9742690058479533e-05, + "loss": 1.3938, + "num_tokens": 100523.0, + "step": 39 + }, + { + "epoch": 0.002020814388198444, + "grad_norm": 12.731657028198242, + "learning_rate": 1.9719298245614036e-05, + "loss": 1.0717, + "num_tokens": 103335.0, + "step": 40 + }, + { + "epoch": 0.002071334747903405, + "grad_norm": 14.430706977844238, + "learning_rate": 1.969590643274854e-05, + "loss": 1.4927, + "num_tokens": 105519.0, + "step": 41 + }, + { + "epoch": 0.002121855107608366, + "grad_norm": 13.398330688476562, + "learning_rate": 1.9672514619883044e-05, + "loss": 1.2372, + "num_tokens": 107711.0, + "step": 42 + }, + { + "epoch": 0.002172375467313327, + "grad_norm": 15.616673469543457, + "learning_rate": 1.9649122807017544e-05, + "loss": 1.1269, + "num_tokens": 110813.0, + "step": 43 + }, + { + "epoch": 0.0022228958270182884, + "grad_norm": 13.693496704101562, + "learning_rate": 1.962573099415205e-05, + "loss": 1.2838, + "num_tokens": 113775.0, + "step": 44 + }, + { + "epoch": 0.0022734161867232496, + "grad_norm": 11.60438060760498, + "learning_rate": 1.9602339181286552e-05, + "loss": 1.2795, + "num_tokens": 115987.0, + "step": 45 + }, + { + "epoch": 0.0023239365464282104, + "grad_norm": 10.703219413757324, + "learning_rate": 1.9578947368421055e-05, + "loss": 1.1719, + "num_tokens": 118763.0, + "step": 46 + }, + { + "epoch": 0.0023744569061331716, + "grad_norm": 13.174905776977539, + "learning_rate": 1.9555555555555557e-05, + "loss": 1.0223, + "num_tokens": 120885.0, + "step": 47 + }, + { + "epoch": 0.002424977265838133, + "grad_norm": 13.412853240966797, + "learning_rate": 1.953216374269006e-05, + "loss": 1.1261, + "num_tokens": 123312.0, + "step": 48 + }, + { + "epoch": 0.002475497625543094, + "grad_norm": 16.830482482910156, + "learning_rate": 1.9508771929824562e-05, + "loss": 1.4913, + "num_tokens": 126507.0, + "step": 49 + }, + { + "epoch": 0.0025260179852480548, + "grad_norm": 14.804474830627441, + "learning_rate": 1.9485380116959065e-05, + "loss": 1.3638, + "num_tokens": 128806.0, + "step": 50 + }, + { + "epoch": 0.002576538344953016, + "grad_norm": 13.586077690124512, + "learning_rate": 1.9461988304093568e-05, + "loss": 1.0438, + "num_tokens": 130432.0, + "step": 51 + }, + { + "epoch": 0.002627058704657977, + "grad_norm": 10.382457733154297, + "learning_rate": 1.9438596491228074e-05, + "loss": 0.9718, + "num_tokens": 132983.0, + "step": 52 + }, + { + "epoch": 0.0026775790643629384, + "grad_norm": 11.234637260437012, + "learning_rate": 1.9415204678362573e-05, + "loss": 1.0573, + "num_tokens": 135268.0, + "step": 53 + }, + { + "epoch": 0.002728099424067899, + "grad_norm": 19.396862030029297, + "learning_rate": 1.939181286549708e-05, + "loss": 1.1807, + "num_tokens": 136719.0, + "step": 54 + }, + { + "epoch": 0.0027786197837728604, + "grad_norm": 10.552140235900879, + "learning_rate": 1.936842105263158e-05, + "loss": 1.1022, + "num_tokens": 139241.0, + "step": 55 + }, + { + "epoch": 0.0028291401434778216, + "grad_norm": 11.582058906555176, + "learning_rate": 1.9345029239766084e-05, + "loss": 1.1922, + "num_tokens": 141850.0, + "step": 56 + }, + { + "epoch": 0.002879660503182783, + "grad_norm": 15.60283088684082, + "learning_rate": 1.9321637426900586e-05, + "loss": 1.2008, + "num_tokens": 144239.0, + "step": 57 + }, + { + "epoch": 0.0029301808628877436, + "grad_norm": 13.299168586730957, + "learning_rate": 1.929824561403509e-05, + "loss": 1.0611, + "num_tokens": 146556.0, + "step": 58 + }, + { + "epoch": 0.0029807012225927048, + "grad_norm": 13.60411262512207, + "learning_rate": 1.927485380116959e-05, + "loss": 1.2405, + "num_tokens": 148783.0, + "step": 59 + }, + { + "epoch": 0.003031221582297666, + "grad_norm": 11.759748458862305, + "learning_rate": 1.9251461988304094e-05, + "loss": 1.1227, + "num_tokens": 151731.0, + "step": 60 + }, + { + "epoch": 0.003081741942002627, + "grad_norm": 13.562891960144043, + "learning_rate": 1.9228070175438597e-05, + "loss": 1.0028, + "num_tokens": 154529.0, + "step": 61 + }, + { + "epoch": 0.003132262301707588, + "grad_norm": 10.708846092224121, + "learning_rate": 1.9204678362573103e-05, + "loss": 1.1167, + "num_tokens": 157464.0, + "step": 62 + }, + { + "epoch": 0.003182782661412549, + "grad_norm": 11.097956657409668, + "learning_rate": 1.9181286549707602e-05, + "loss": 0.8722, + "num_tokens": 159833.0, + "step": 63 + }, + { + "epoch": 0.0032333030211175104, + "grad_norm": 12.363516807556152, + "learning_rate": 1.9157894736842108e-05, + "loss": 0.9283, + "num_tokens": 162016.0, + "step": 64 + }, + { + "epoch": 0.0032838233808224716, + "grad_norm": 10.18110179901123, + "learning_rate": 1.913450292397661e-05, + "loss": 1.0014, + "num_tokens": 164977.0, + "step": 65 + }, + { + "epoch": 0.0033343437405274324, + "grad_norm": 11.416281700134277, + "learning_rate": 1.9111111111111113e-05, + "loss": 1.1037, + "num_tokens": 167783.0, + "step": 66 + }, + { + "epoch": 0.0033848641002323936, + "grad_norm": 9.130837440490723, + "learning_rate": 1.9087719298245616e-05, + "loss": 1.1121, + "num_tokens": 170690.0, + "step": 67 + }, + { + "epoch": 0.003435384459937355, + "grad_norm": 14.66043472290039, + "learning_rate": 1.9064327485380118e-05, + "loss": 1.3696, + "num_tokens": 172597.0, + "step": 68 + }, + { + "epoch": 0.003485904819642316, + "grad_norm": 12.316787719726562, + "learning_rate": 1.904093567251462e-05, + "loss": 1.2659, + "num_tokens": 174762.0, + "step": 69 + }, + { + "epoch": 0.0035364251793472768, + "grad_norm": 12.782660484313965, + "learning_rate": 1.9017543859649123e-05, + "loss": 1.1124, + "num_tokens": 176485.0, + "step": 70 + }, + { + "epoch": 0.003586945539052238, + "grad_norm": 15.802315711975098, + "learning_rate": 1.8994152046783626e-05, + "loss": 1.256, + "num_tokens": 179565.0, + "step": 71 + }, + { + "epoch": 0.003637465898757199, + "grad_norm": 12.626399040222168, + "learning_rate": 1.8970760233918132e-05, + "loss": 1.2267, + "num_tokens": 181450.0, + "step": 72 + }, + { + "epoch": 0.0036879862584621604, + "grad_norm": 10.530603408813477, + "learning_rate": 1.894736842105263e-05, + "loss": 0.9479, + "num_tokens": 183719.0, + "step": 73 + }, + { + "epoch": 0.003738506618167121, + "grad_norm": 10.003649711608887, + "learning_rate": 1.8923976608187137e-05, + "loss": 1.2392, + "num_tokens": 186397.0, + "step": 74 + }, + { + "epoch": 0.0037890269778720824, + "grad_norm": 9.576560020446777, + "learning_rate": 1.890058479532164e-05, + "loss": 1.0477, + "num_tokens": 189195.0, + "step": 75 + }, + { + "epoch": 0.0038395473375770436, + "grad_norm": 11.996009826660156, + "learning_rate": 1.8877192982456142e-05, + "loss": 0.9894, + "num_tokens": 191277.0, + "step": 76 + }, + { + "epoch": 0.003890067697282005, + "grad_norm": 8.752811431884766, + "learning_rate": 1.8853801169590645e-05, + "loss": 1.3358, + "num_tokens": 194504.0, + "step": 77 + }, + { + "epoch": 0.003940588056986966, + "grad_norm": 10.455835342407227, + "learning_rate": 1.8830409356725147e-05, + "loss": 1.2135, + "num_tokens": 197030.0, + "step": 78 + }, + { + "epoch": 0.003991108416691927, + "grad_norm": 10.856278419494629, + "learning_rate": 1.880701754385965e-05, + "loss": 1.2045, + "num_tokens": 199000.0, + "step": 79 + }, + { + "epoch": 0.004041628776396888, + "grad_norm": 14.011846542358398, + "learning_rate": 1.8783625730994152e-05, + "loss": 1.136, + "num_tokens": 200481.0, + "step": 80 + }, + { + "epoch": 0.004092149136101849, + "grad_norm": 9.616527557373047, + "learning_rate": 1.8760233918128655e-05, + "loss": 1.2114, + "num_tokens": 203106.0, + "step": 81 + }, + { + "epoch": 0.00414266949580681, + "grad_norm": 11.71728801727295, + "learning_rate": 1.873684210526316e-05, + "loss": 1.048, + "num_tokens": 204942.0, + "step": 82 + }, + { + "epoch": 0.004193189855511772, + "grad_norm": 11.136825561523438, + "learning_rate": 1.871345029239766e-05, + "loss": 0.9881, + "num_tokens": 207081.0, + "step": 83 + }, + { + "epoch": 0.004243710215216732, + "grad_norm": 8.428516387939453, + "learning_rate": 1.8690058479532166e-05, + "loss": 1.2671, + "num_tokens": 210213.0, + "step": 84 + }, + { + "epoch": 0.004294230574921693, + "grad_norm": 11.529748916625977, + "learning_rate": 1.866666666666667e-05, + "loss": 1.0167, + "num_tokens": 212325.0, + "step": 85 + }, + { + "epoch": 0.004344750934626654, + "grad_norm": 10.744977951049805, + "learning_rate": 1.864327485380117e-05, + "loss": 0.9581, + "num_tokens": 214808.0, + "step": 86 + }, + { + "epoch": 0.004395271294331616, + "grad_norm": 11.164356231689453, + "learning_rate": 1.8619883040935674e-05, + "loss": 1.1796, + "num_tokens": 216929.0, + "step": 87 + }, + { + "epoch": 0.004445791654036577, + "grad_norm": 8.91334056854248, + "learning_rate": 1.8596491228070176e-05, + "loss": 1.076, + "num_tokens": 219889.0, + "step": 88 + }, + { + "epoch": 0.004496312013741538, + "grad_norm": 10.778691291809082, + "learning_rate": 1.857309941520468e-05, + "loss": 1.0683, + "num_tokens": 221914.0, + "step": 89 + }, + { + "epoch": 0.004546832373446499, + "grad_norm": 9.788928031921387, + "learning_rate": 1.854970760233918e-05, + "loss": 1.2023, + "num_tokens": 224619.0, + "step": 90 + }, + { + "epoch": 0.00459735273315146, + "grad_norm": 8.505138397216797, + "learning_rate": 1.8526315789473684e-05, + "loss": 0.8132, + "num_tokens": 227289.0, + "step": 91 + }, + { + "epoch": 0.004647873092856421, + "grad_norm": 9.891079902648926, + "learning_rate": 1.850292397660819e-05, + "loss": 1.2351, + "num_tokens": 230137.0, + "step": 92 + }, + { + "epoch": 0.004698393452561382, + "grad_norm": 12.763185501098633, + "learning_rate": 1.847953216374269e-05, + "loss": 1.192, + "num_tokens": 232669.0, + "step": 93 + }, + { + "epoch": 0.004748913812266343, + "grad_norm": 9.009286880493164, + "learning_rate": 1.8456140350877195e-05, + "loss": 1.2523, + "num_tokens": 235961.0, + "step": 94 + }, + { + "epoch": 0.004799434171971304, + "grad_norm": 13.312901496887207, + "learning_rate": 1.8432748538011698e-05, + "loss": 0.9824, + "num_tokens": 237509.0, + "step": 95 + }, + { + "epoch": 0.004849954531676266, + "grad_norm": 8.884242057800293, + "learning_rate": 1.84093567251462e-05, + "loss": 1.1793, + "num_tokens": 240206.0, + "step": 96 + }, + { + "epoch": 0.004900474891381227, + "grad_norm": 10.614914894104004, + "learning_rate": 1.8385964912280703e-05, + "loss": 1.2508, + "num_tokens": 242750.0, + "step": 97 + }, + { + "epoch": 0.004950995251086188, + "grad_norm": 8.750896453857422, + "learning_rate": 1.8362573099415205e-05, + "loss": 1.2678, + "num_tokens": 245948.0, + "step": 98 + }, + { + "epoch": 0.005001515610791149, + "grad_norm": 8.790016174316406, + "learning_rate": 1.833918128654971e-05, + "loss": 0.989, + "num_tokens": 248792.0, + "step": 99 + }, + { + "epoch": 0.0050520359704961096, + "grad_norm": 11.009282112121582, + "learning_rate": 1.831578947368421e-05, + "loss": 1.2416, + "num_tokens": 250955.0, + "step": 100 + }, + { + "epoch": 0.005102556330201071, + "grad_norm": 9.102219581604004, + "learning_rate": 1.8292397660818713e-05, + "loss": 0.9346, + "num_tokens": 253294.0, + "step": 101 + }, + { + "epoch": 0.005153076689906032, + "grad_norm": 10.528311729431152, + "learning_rate": 1.826900584795322e-05, + "loss": 1.0751, + "num_tokens": 255690.0, + "step": 102 + }, + { + "epoch": 0.005203597049610993, + "grad_norm": 10.888592720031738, + "learning_rate": 1.824561403508772e-05, + "loss": 0.9822, + "num_tokens": 257543.0, + "step": 103 + }, + { + "epoch": 0.005254117409315954, + "grad_norm": 8.510196685791016, + "learning_rate": 1.8222222222222224e-05, + "loss": 0.9857, + "num_tokens": 260459.0, + "step": 104 + }, + { + "epoch": 0.005304637769020916, + "grad_norm": 8.882211685180664, + "learning_rate": 1.8198830409356727e-05, + "loss": 1.0541, + "num_tokens": 263170.0, + "step": 105 + }, + { + "epoch": 0.005355158128725877, + "grad_norm": 9.070907592773438, + "learning_rate": 1.817543859649123e-05, + "loss": 1.1895, + "num_tokens": 265818.0, + "step": 106 + }, + { + "epoch": 0.005405678488430838, + "grad_norm": 8.84577751159668, + "learning_rate": 1.8152046783625732e-05, + "loss": 1.0264, + "num_tokens": 269220.0, + "step": 107 + }, + { + "epoch": 0.005456198848135798, + "grad_norm": 11.427336692810059, + "learning_rate": 1.8128654970760235e-05, + "loss": 1.0469, + "num_tokens": 271446.0, + "step": 108 + }, + { + "epoch": 0.0055067192078407596, + "grad_norm": 11.6751708984375, + "learning_rate": 1.810526315789474e-05, + "loss": 1.1459, + "num_tokens": 273575.0, + "step": 109 + }, + { + "epoch": 0.005557239567545721, + "grad_norm": 11.761689186096191, + "learning_rate": 1.808187134502924e-05, + "loss": 1.1233, + "num_tokens": 275401.0, + "step": 110 + }, + { + "epoch": 0.005607759927250682, + "grad_norm": 9.728570938110352, + "learning_rate": 1.8058479532163746e-05, + "loss": 1.1521, + "num_tokens": 277649.0, + "step": 111 + }, + { + "epoch": 0.005658280286955643, + "grad_norm": 11.157766342163086, + "learning_rate": 1.8035087719298248e-05, + "loss": 1.3542, + "num_tokens": 280532.0, + "step": 112 + }, + { + "epoch": 0.005708800646660604, + "grad_norm": 12.459864616394043, + "learning_rate": 1.8011695906432747e-05, + "loss": 1.0957, + "num_tokens": 282329.0, + "step": 113 + }, + { + "epoch": 0.005759321006365566, + "grad_norm": 12.06000804901123, + "learning_rate": 1.7988304093567253e-05, + "loss": 0.962, + "num_tokens": 285003.0, + "step": 114 + }, + { + "epoch": 0.005809841366070527, + "grad_norm": 10.345196723937988, + "learning_rate": 1.7964912280701756e-05, + "loss": 1.1397, + "num_tokens": 288576.0, + "step": 115 + }, + { + "epoch": 0.005860361725775487, + "grad_norm": 9.01580810546875, + "learning_rate": 1.794152046783626e-05, + "loss": 1.2344, + "num_tokens": 291564.0, + "step": 116 + }, + { + "epoch": 0.005910882085480448, + "grad_norm": 8.928837776184082, + "learning_rate": 1.791812865497076e-05, + "loss": 1.1266, + "num_tokens": 294429.0, + "step": 117 + }, + { + "epoch": 0.0059614024451854096, + "grad_norm": 11.21674633026123, + "learning_rate": 1.7894736842105264e-05, + "loss": 1.1374, + "num_tokens": 296652.0, + "step": 118 + }, + { + "epoch": 0.006011922804890371, + "grad_norm": 11.951678276062012, + "learning_rate": 1.787134502923977e-05, + "loss": 1.1432, + "num_tokens": 298882.0, + "step": 119 + }, + { + "epoch": 0.006062443164595332, + "grad_norm": 9.76797866821289, + "learning_rate": 1.784795321637427e-05, + "loss": 1.1307, + "num_tokens": 301611.0, + "step": 120 + }, + { + "epoch": 0.006112963524300293, + "grad_norm": 14.044525146484375, + "learning_rate": 1.7824561403508775e-05, + "loss": 1.2233, + "num_tokens": 303300.0, + "step": 121 + }, + { + "epoch": 0.006163483884005254, + "grad_norm": 8.35912799835205, + "learning_rate": 1.7801169590643277e-05, + "loss": 1.1593, + "num_tokens": 306448.0, + "step": 122 + }, + { + "epoch": 0.006214004243710216, + "grad_norm": 9.726041793823242, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.9313, + "num_tokens": 308815.0, + "step": 123 + }, + { + "epoch": 0.006264524603415176, + "grad_norm": 10.566062927246094, + "learning_rate": 1.7754385964912283e-05, + "loss": 1.1177, + "num_tokens": 311561.0, + "step": 124 + }, + { + "epoch": 0.006315044963120137, + "grad_norm": 9.374105453491211, + "learning_rate": 1.7730994152046785e-05, + "loss": 1.0791, + "num_tokens": 314524.0, + "step": 125 + }, + { + "epoch": 0.006365565322825098, + "grad_norm": 8.617390632629395, + "learning_rate": 1.7707602339181288e-05, + "loss": 0.9974, + "num_tokens": 317477.0, + "step": 126 + }, + { + "epoch": 0.00641608568253006, + "grad_norm": 9.578972816467285, + "learning_rate": 1.768421052631579e-05, + "loss": 1.0462, + "num_tokens": 320329.0, + "step": 127 + }, + { + "epoch": 0.006466606042235021, + "grad_norm": 9.119145393371582, + "learning_rate": 1.7660818713450293e-05, + "loss": 0.9974, + "num_tokens": 323272.0, + "step": 128 + }, + { + "epoch": 0.006517126401939982, + "grad_norm": 9.659331321716309, + "learning_rate": 1.76374269005848e-05, + "loss": 1.0658, + "num_tokens": 326046.0, + "step": 129 + }, + { + "epoch": 0.006567646761644943, + "grad_norm": 11.023038864135742, + "learning_rate": 1.7614035087719298e-05, + "loss": 1.2348, + "num_tokens": 328203.0, + "step": 130 + }, + { + "epoch": 0.006618167121349904, + "grad_norm": 10.130457878112793, + "learning_rate": 1.7590643274853804e-05, + "loss": 1.2609, + "num_tokens": 330592.0, + "step": 131 + }, + { + "epoch": 0.006668687481054865, + "grad_norm": 8.70687198638916, + "learning_rate": 1.7567251461988307e-05, + "loss": 1.0515, + "num_tokens": 333422.0, + "step": 132 + }, + { + "epoch": 0.006719207840759826, + "grad_norm": 8.856954574584961, + "learning_rate": 1.754385964912281e-05, + "loss": 1.3311, + "num_tokens": 336140.0, + "step": 133 + }, + { + "epoch": 0.006769728200464787, + "grad_norm": 8.185256004333496, + "learning_rate": 1.752046783625731e-05, + "loss": 0.9183, + "num_tokens": 338645.0, + "step": 134 + }, + { + "epoch": 0.006820248560169748, + "grad_norm": 9.065742492675781, + "learning_rate": 1.7497076023391814e-05, + "loss": 0.9429, + "num_tokens": 340717.0, + "step": 135 + }, + { + "epoch": 0.00687076891987471, + "grad_norm": 14.22849178314209, + "learning_rate": 1.7473684210526317e-05, + "loss": 1.0964, + "num_tokens": 341946.0, + "step": 136 + }, + { + "epoch": 0.006921289279579671, + "grad_norm": 8.727455139160156, + "learning_rate": 1.745029239766082e-05, + "loss": 1.0153, + "num_tokens": 344409.0, + "step": 137 + }, + { + "epoch": 0.006971809639284632, + "grad_norm": 10.213702201843262, + "learning_rate": 1.7426900584795322e-05, + "loss": 1.0678, + "num_tokens": 346995.0, + "step": 138 + }, + { + "epoch": 0.007022329998989593, + "grad_norm": 9.326253890991211, + "learning_rate": 1.7403508771929828e-05, + "loss": 0.9906, + "num_tokens": 349103.0, + "step": 139 + }, + { + "epoch": 0.0070728503586945535, + "grad_norm": 9.160941123962402, + "learning_rate": 1.7380116959064327e-05, + "loss": 1.1067, + "num_tokens": 351555.0, + "step": 140 + }, + { + "epoch": 0.007123370718399515, + "grad_norm": 8.147638320922852, + "learning_rate": 1.7356725146198833e-05, + "loss": 1.0307, + "num_tokens": 354979.0, + "step": 141 + }, + { + "epoch": 0.007173891078104476, + "grad_norm": 10.61776065826416, + "learning_rate": 1.7333333333333336e-05, + "loss": 0.9498, + "num_tokens": 357868.0, + "step": 142 + }, + { + "epoch": 0.007224411437809437, + "grad_norm": 8.159193992614746, + "learning_rate": 1.7309941520467838e-05, + "loss": 1.3346, + "num_tokens": 361202.0, + "step": 143 + }, + { + "epoch": 0.007274931797514398, + "grad_norm": 9.549636840820312, + "learning_rate": 1.728654970760234e-05, + "loss": 0.9609, + "num_tokens": 363561.0, + "step": 144 + }, + { + "epoch": 0.00732545215721936, + "grad_norm": 10.52137279510498, + "learning_rate": 1.7263157894736843e-05, + "loss": 1.1242, + "num_tokens": 365779.0, + "step": 145 + }, + { + "epoch": 0.007375972516924321, + "grad_norm": 10.084936141967773, + "learning_rate": 1.7239766081871346e-05, + "loss": 1.1155, + "num_tokens": 367866.0, + "step": 146 + }, + { + "epoch": 0.007426492876629282, + "grad_norm": 8.489282608032227, + "learning_rate": 1.721637426900585e-05, + "loss": 1.0286, + "num_tokens": 370652.0, + "step": 147 + }, + { + "epoch": 0.007477013236334242, + "grad_norm": 10.306445121765137, + "learning_rate": 1.719298245614035e-05, + "loss": 0.8878, + "num_tokens": 372474.0, + "step": 148 + }, + { + "epoch": 0.0075275335960392036, + "grad_norm": 8.740140914916992, + "learning_rate": 1.7169590643274857e-05, + "loss": 1.0746, + "num_tokens": 375152.0, + "step": 149 + }, + { + "epoch": 0.007578053955744165, + "grad_norm": 9.163982391357422, + "learning_rate": 1.7146198830409356e-05, + "loss": 1.0951, + "num_tokens": 377802.0, + "step": 150 + }, + { + "epoch": 0.007628574315449126, + "grad_norm": 9.95666217803955, + "learning_rate": 1.7122807017543862e-05, + "loss": 1.306, + "num_tokens": 380314.0, + "step": 151 + }, + { + "epoch": 0.007679094675154087, + "grad_norm": 9.463590621948242, + "learning_rate": 1.7099415204678365e-05, + "loss": 1.1088, + "num_tokens": 383187.0, + "step": 152 + }, + { + "epoch": 0.007729615034859048, + "grad_norm": 9.050843238830566, + "learning_rate": 1.7076023391812867e-05, + "loss": 0.9069, + "num_tokens": 385684.0, + "step": 153 + }, + { + "epoch": 0.00778013539456401, + "grad_norm": 10.796246528625488, + "learning_rate": 1.705263157894737e-05, + "loss": 1.3432, + "num_tokens": 387853.0, + "step": 154 + }, + { + "epoch": 0.00783065575426897, + "grad_norm": 8.347738265991211, + "learning_rate": 1.7029239766081872e-05, + "loss": 1.1035, + "num_tokens": 390601.0, + "step": 155 + }, + { + "epoch": 0.007881176113973931, + "grad_norm": 9.587059020996094, + "learning_rate": 1.7005847953216375e-05, + "loss": 0.9998, + "num_tokens": 393268.0, + "step": 156 + }, + { + "epoch": 0.007931696473678893, + "grad_norm": 8.672136306762695, + "learning_rate": 1.6982456140350878e-05, + "loss": 1.0505, + "num_tokens": 395851.0, + "step": 157 + }, + { + "epoch": 0.007982216833383854, + "grad_norm": 7.66909122467041, + "learning_rate": 1.695906432748538e-05, + "loss": 1.1595, + "num_tokens": 399221.0, + "step": 158 + }, + { + "epoch": 0.008032737193088816, + "grad_norm": 9.186577796936035, + "learning_rate": 1.6935672514619886e-05, + "loss": 1.0684, + "num_tokens": 401481.0, + "step": 159 + }, + { + "epoch": 0.008083257552793776, + "grad_norm": 7.981843948364258, + "learning_rate": 1.6912280701754385e-05, + "loss": 1.0981, + "num_tokens": 403943.0, + "step": 160 + }, + { + "epoch": 0.008133777912498736, + "grad_norm": 8.94477367401123, + "learning_rate": 1.688888888888889e-05, + "loss": 1.1426, + "num_tokens": 406396.0, + "step": 161 + }, + { + "epoch": 0.008184298272203698, + "grad_norm": 8.0164213180542, + "learning_rate": 1.6865497076023394e-05, + "loss": 1.1282, + "num_tokens": 409305.0, + "step": 162 + }, + { + "epoch": 0.008234818631908659, + "grad_norm": 8.932347297668457, + "learning_rate": 1.6842105263157896e-05, + "loss": 1.2709, + "num_tokens": 411682.0, + "step": 163 + }, + { + "epoch": 0.00828533899161362, + "grad_norm": 9.681609153747559, + "learning_rate": 1.68187134502924e-05, + "loss": 1.0333, + "num_tokens": 413424.0, + "step": 164 + }, + { + "epoch": 0.008335859351318581, + "grad_norm": 9.514274597167969, + "learning_rate": 1.67953216374269e-05, + "loss": 1.0157, + "num_tokens": 415144.0, + "step": 165 + }, + { + "epoch": 0.008386379711023543, + "grad_norm": 10.601012229919434, + "learning_rate": 1.6771929824561408e-05, + "loss": 1.0232, + "num_tokens": 417042.0, + "step": 166 + }, + { + "epoch": 0.008436900070728504, + "grad_norm": 8.665529251098633, + "learning_rate": 1.6748538011695907e-05, + "loss": 1.137, + "num_tokens": 419911.0, + "step": 167 + }, + { + "epoch": 0.008487420430433464, + "grad_norm": 11.028315544128418, + "learning_rate": 1.672514619883041e-05, + "loss": 1.456, + "num_tokens": 422289.0, + "step": 168 + }, + { + "epoch": 0.008537940790138426, + "grad_norm": 8.87790584564209, + "learning_rate": 1.6701754385964915e-05, + "loss": 1.2796, + "num_tokens": 426103.0, + "step": 169 + }, + { + "epoch": 0.008588461149843386, + "grad_norm": 9.805023193359375, + "learning_rate": 1.6678362573099414e-05, + "loss": 1.2709, + "num_tokens": 429098.0, + "step": 170 + }, + { + "epoch": 0.008638981509548348, + "grad_norm": 8.186580657958984, + "learning_rate": 1.665497076023392e-05, + "loss": 0.9664, + "num_tokens": 431545.0, + "step": 171 + }, + { + "epoch": 0.008689501869253309, + "grad_norm": 7.773228168487549, + "learning_rate": 1.6631578947368423e-05, + "loss": 1.1346, + "num_tokens": 434453.0, + "step": 172 + }, + { + "epoch": 0.00874002222895827, + "grad_norm": 9.658727645874023, + "learning_rate": 1.6608187134502926e-05, + "loss": 0.9211, + "num_tokens": 436460.0, + "step": 173 + }, + { + "epoch": 0.008790542588663231, + "grad_norm": 11.120407104492188, + "learning_rate": 1.6584795321637428e-05, + "loss": 1.6329, + "num_tokens": 439010.0, + "step": 174 + }, + { + "epoch": 0.008841062948368193, + "grad_norm": 8.684976577758789, + "learning_rate": 1.656140350877193e-05, + "loss": 1.0009, + "num_tokens": 441723.0, + "step": 175 + }, + { + "epoch": 0.008891583308073154, + "grad_norm": 10.02524185180664, + "learning_rate": 1.6538011695906437e-05, + "loss": 1.0103, + "num_tokens": 443759.0, + "step": 176 + }, + { + "epoch": 0.008942103667778114, + "grad_norm": 8.690000534057617, + "learning_rate": 1.6514619883040936e-05, + "loss": 1.0249, + "num_tokens": 446476.0, + "step": 177 + }, + { + "epoch": 0.008992624027483076, + "grad_norm": 11.559849739074707, + "learning_rate": 1.649122807017544e-05, + "loss": 1.1606, + "num_tokens": 448803.0, + "step": 178 + }, + { + "epoch": 0.009043144387188036, + "grad_norm": 8.70150089263916, + "learning_rate": 1.6467836257309944e-05, + "loss": 0.8612, + "num_tokens": 451290.0, + "step": 179 + }, + { + "epoch": 0.009093664746892998, + "grad_norm": 10.985761642456055, + "learning_rate": 1.6444444444444444e-05, + "loss": 0.9861, + "num_tokens": 453243.0, + "step": 180 + }, + { + "epoch": 0.009144185106597959, + "grad_norm": 9.171394348144531, + "learning_rate": 1.642105263157895e-05, + "loss": 1.0685, + "num_tokens": 455983.0, + "step": 181 + }, + { + "epoch": 0.00919470546630292, + "grad_norm": 8.427783012390137, + "learning_rate": 1.6397660818713452e-05, + "loss": 0.9651, + "num_tokens": 458462.0, + "step": 182 + }, + { + "epoch": 0.009245225826007881, + "grad_norm": 9.658039093017578, + "learning_rate": 1.6374269005847955e-05, + "loss": 1.1239, + "num_tokens": 460277.0, + "step": 183 + }, + { + "epoch": 0.009295746185712842, + "grad_norm": 9.490887641906738, + "learning_rate": 1.6350877192982457e-05, + "loss": 1.335, + "num_tokens": 463048.0, + "step": 184 + }, + { + "epoch": 0.009346266545417804, + "grad_norm": 9.867181777954102, + "learning_rate": 1.632748538011696e-05, + "loss": 0.9359, + "num_tokens": 465246.0, + "step": 185 + }, + { + "epoch": 0.009396786905122764, + "grad_norm": 8.184590339660645, + "learning_rate": 1.6304093567251466e-05, + "loss": 1.1818, + "num_tokens": 468440.0, + "step": 186 + }, + { + "epoch": 0.009447307264827726, + "grad_norm": 9.460564613342285, + "learning_rate": 1.6280701754385965e-05, + "loss": 1.1308, + "num_tokens": 470753.0, + "step": 187 + }, + { + "epoch": 0.009497827624532686, + "grad_norm": 8.28761100769043, + "learning_rate": 1.625730994152047e-05, + "loss": 1.0784, + "num_tokens": 473456.0, + "step": 188 + }, + { + "epoch": 0.009548347984237648, + "grad_norm": 9.042991638183594, + "learning_rate": 1.6233918128654974e-05, + "loss": 1.1404, + "num_tokens": 475815.0, + "step": 189 + }, + { + "epoch": 0.009598868343942609, + "grad_norm": 8.574507713317871, + "learning_rate": 1.6210526315789473e-05, + "loss": 1.1259, + "num_tokens": 479052.0, + "step": 190 + }, + { + "epoch": 0.00964938870364757, + "grad_norm": 9.014772415161133, + "learning_rate": 1.618713450292398e-05, + "loss": 1.0067, + "num_tokens": 481445.0, + "step": 191 + }, + { + "epoch": 0.009699909063352531, + "grad_norm": 7.718846797943115, + "learning_rate": 1.616374269005848e-05, + "loss": 0.8926, + "num_tokens": 483822.0, + "step": 192 + }, + { + "epoch": 0.009750429423057492, + "grad_norm": 9.32669448852539, + "learning_rate": 1.6140350877192984e-05, + "loss": 0.91, + "num_tokens": 486041.0, + "step": 193 + }, + { + "epoch": 0.009800949782762454, + "grad_norm": 8.745234489440918, + "learning_rate": 1.6116959064327486e-05, + "loss": 1.1527, + "num_tokens": 489036.0, + "step": 194 + }, + { + "epoch": 0.009851470142467414, + "grad_norm": 7.342167377471924, + "learning_rate": 1.609356725146199e-05, + "loss": 1.144, + "num_tokens": 492148.0, + "step": 195 + }, + { + "epoch": 0.009901990502172376, + "grad_norm": 14.181269645690918, + "learning_rate": 1.6070175438596495e-05, + "loss": 1.3328, + "num_tokens": 494297.0, + "step": 196 + }, + { + "epoch": 0.009952510861877336, + "grad_norm": 11.076637268066406, + "learning_rate": 1.6046783625730994e-05, + "loss": 1.3129, + "num_tokens": 496514.0, + "step": 197 + }, + { + "epoch": 0.010003031221582298, + "grad_norm": 7.233493328094482, + "learning_rate": 1.60233918128655e-05, + "loss": 1.0351, + "num_tokens": 500106.0, + "step": 198 + }, + { + "epoch": 0.010053551581287259, + "grad_norm": 7.64019250869751, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.973, + "num_tokens": 502787.0, + "step": 199 + }, + { + "epoch": 0.010104071940992219, + "grad_norm": 10.309501647949219, + "learning_rate": 1.5976608187134505e-05, + "loss": 1.0403, + "num_tokens": 505761.0, + "step": 200 + }, + { + "epoch": 0.010154592300697181, + "grad_norm": 9.313553810119629, + "learning_rate": 1.5953216374269008e-05, + "loss": 0.9903, + "num_tokens": 507627.0, + "step": 201 + }, + { + "epoch": 0.010205112660402142, + "grad_norm": 8.003402709960938, + "learning_rate": 1.592982456140351e-05, + "loss": 1.0583, + "num_tokens": 510427.0, + "step": 202 + }, + { + "epoch": 0.010255633020107104, + "grad_norm": 8.720063209533691, + "learning_rate": 1.5906432748538013e-05, + "loss": 1.0858, + "num_tokens": 512767.0, + "step": 203 + }, + { + "epoch": 0.010306153379812064, + "grad_norm": 7.976707458496094, + "learning_rate": 1.5883040935672516e-05, + "loss": 1.0577, + "num_tokens": 515374.0, + "step": 204 + }, + { + "epoch": 0.010356673739517026, + "grad_norm": 8.53968620300293, + "learning_rate": 1.5859649122807018e-05, + "loss": 0.872, + "num_tokens": 517583.0, + "step": 205 + }, + { + "epoch": 0.010407194099221986, + "grad_norm": 13.066980361938477, + "learning_rate": 1.583625730994152e-05, + "loss": 1.2191, + "num_tokens": 519009.0, + "step": 206 + }, + { + "epoch": 0.010457714458926948, + "grad_norm": 9.141525268554688, + "learning_rate": 1.5812865497076023e-05, + "loss": 1.1477, + "num_tokens": 521284.0, + "step": 207 + }, + { + "epoch": 0.010508234818631909, + "grad_norm": 12.273428916931152, + "learning_rate": 1.578947368421053e-05, + "loss": 1.0736, + "num_tokens": 524052.0, + "step": 208 + }, + { + "epoch": 0.010558755178336869, + "grad_norm": 7.734642505645752, + "learning_rate": 1.5766081871345032e-05, + "loss": 1.0034, + "num_tokens": 526698.0, + "step": 209 + }, + { + "epoch": 0.010609275538041831, + "grad_norm": 8.959403991699219, + "learning_rate": 1.5742690058479534e-05, + "loss": 1.3169, + "num_tokens": 529570.0, + "step": 210 + }, + { + "epoch": 0.010659795897746792, + "grad_norm": 9.456208229064941, + "learning_rate": 1.5719298245614037e-05, + "loss": 1.238, + "num_tokens": 532215.0, + "step": 211 + }, + { + "epoch": 0.010710316257451754, + "grad_norm": 8.091153144836426, + "learning_rate": 1.569590643274854e-05, + "loss": 1.0445, + "num_tokens": 534781.0, + "step": 212 + }, + { + "epoch": 0.010760836617156714, + "grad_norm": 13.84492015838623, + "learning_rate": 1.5672514619883042e-05, + "loss": 1.1039, + "num_tokens": 536386.0, + "step": 213 + }, + { + "epoch": 0.010811356976861676, + "grad_norm": 9.705891609191895, + "learning_rate": 1.5649122807017545e-05, + "loss": 1.0201, + "num_tokens": 538529.0, + "step": 214 + }, + { + "epoch": 0.010861877336566636, + "grad_norm": 10.426817893981934, + "learning_rate": 1.5625730994152047e-05, + "loss": 0.9575, + "num_tokens": 540459.0, + "step": 215 + }, + { + "epoch": 0.010912397696271597, + "grad_norm": 9.1675386428833, + "learning_rate": 1.560233918128655e-05, + "loss": 1.0637, + "num_tokens": 542828.0, + "step": 216 + }, + { + "epoch": 0.010962918055976559, + "grad_norm": 8.99853801727295, + "learning_rate": 1.5578947368421052e-05, + "loss": 1.2354, + "num_tokens": 544953.0, + "step": 217 + }, + { + "epoch": 0.011013438415681519, + "grad_norm": 10.00879955291748, + "learning_rate": 1.555555555555556e-05, + "loss": 1.0261, + "num_tokens": 546711.0, + "step": 218 + }, + { + "epoch": 0.011063958775386481, + "grad_norm": 8.375962257385254, + "learning_rate": 1.553216374269006e-05, + "loss": 1.0244, + "num_tokens": 549316.0, + "step": 219 + }, + { + "epoch": 0.011114479135091442, + "grad_norm": 9.637651443481445, + "learning_rate": 1.5508771929824563e-05, + "loss": 1.2065, + "num_tokens": 551637.0, + "step": 220 + }, + { + "epoch": 0.011164999494796404, + "grad_norm": 10.263127326965332, + "learning_rate": 1.5485380116959066e-05, + "loss": 1.3222, + "num_tokens": 554221.0, + "step": 221 + }, + { + "epoch": 0.011215519854501364, + "grad_norm": 9.176608085632324, + "learning_rate": 1.546198830409357e-05, + "loss": 0.8675, + "num_tokens": 557228.0, + "step": 222 + }, + { + "epoch": 0.011266040214206324, + "grad_norm": 9.548181533813477, + "learning_rate": 1.543859649122807e-05, + "loss": 1.0622, + "num_tokens": 559452.0, + "step": 223 + }, + { + "epoch": 0.011316560573911286, + "grad_norm": 8.64331340789795, + "learning_rate": 1.5415204678362574e-05, + "loss": 0.8962, + "num_tokens": 561714.0, + "step": 224 + }, + { + "epoch": 0.011367080933616247, + "grad_norm": 7.029260158538818, + "learning_rate": 1.5391812865497076e-05, + "loss": 0.8875, + "num_tokens": 564759.0, + "step": 225 + }, + { + "epoch": 0.011417601293321209, + "grad_norm": 10.958613395690918, + "learning_rate": 1.536842105263158e-05, + "loss": 1.167, + "num_tokens": 566574.0, + "step": 226 + }, + { + "epoch": 0.011468121653026169, + "grad_norm": 9.12553596496582, + "learning_rate": 1.534502923976608e-05, + "loss": 1.0156, + "num_tokens": 568734.0, + "step": 227 + }, + { + "epoch": 0.011518642012731131, + "grad_norm": 10.394488334655762, + "learning_rate": 1.5321637426900587e-05, + "loss": 1.0796, + "num_tokens": 570546.0, + "step": 228 + }, + { + "epoch": 0.011569162372436092, + "grad_norm": 8.067846298217773, + "learning_rate": 1.529824561403509e-05, + "loss": 0.969, + "num_tokens": 572991.0, + "step": 229 + }, + { + "epoch": 0.011619682732141054, + "grad_norm": 9.05314826965332, + "learning_rate": 1.5274853801169593e-05, + "loss": 1.0741, + "num_tokens": 575292.0, + "step": 230 + }, + { + "epoch": 0.011670203091846014, + "grad_norm": 7.073040008544922, + "learning_rate": 1.5251461988304095e-05, + "loss": 0.8952, + "num_tokens": 578054.0, + "step": 231 + }, + { + "epoch": 0.011720723451550974, + "grad_norm": 8.003592491149902, + "learning_rate": 1.5228070175438598e-05, + "loss": 0.9012, + "num_tokens": 580765.0, + "step": 232 + }, + { + "epoch": 0.011771243811255936, + "grad_norm": 8.793253898620605, + "learning_rate": 1.52046783625731e-05, + "loss": 1.0092, + "num_tokens": 583075.0, + "step": 233 + }, + { + "epoch": 0.011821764170960897, + "grad_norm": 10.239913940429688, + "learning_rate": 1.5181286549707603e-05, + "loss": 0.9599, + "num_tokens": 585033.0, + "step": 234 + }, + { + "epoch": 0.011872284530665859, + "grad_norm": 7.711139678955078, + "learning_rate": 1.5157894736842107e-05, + "loss": 1.0732, + "num_tokens": 587761.0, + "step": 235 + }, + { + "epoch": 0.011922804890370819, + "grad_norm": 8.047666549682617, + "learning_rate": 1.5134502923976608e-05, + "loss": 1.2469, + "num_tokens": 590689.0, + "step": 236 + }, + { + "epoch": 0.011973325250075781, + "grad_norm": 7.255981922149658, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.0444, + "num_tokens": 593630.0, + "step": 237 + }, + { + "epoch": 0.012023845609780742, + "grad_norm": 8.820837020874023, + "learning_rate": 1.5087719298245615e-05, + "loss": 0.9073, + "num_tokens": 595542.0, + "step": 238 + }, + { + "epoch": 0.012074365969485702, + "grad_norm": 7.713109970092773, + "learning_rate": 1.5064327485380119e-05, + "loss": 1.0993, + "num_tokens": 598149.0, + "step": 239 + }, + { + "epoch": 0.012124886329190664, + "grad_norm": 6.606431484222412, + "learning_rate": 1.504093567251462e-05, + "loss": 0.9194, + "num_tokens": 601555.0, + "step": 240 + }, + { + "epoch": 0.012175406688895624, + "grad_norm": 7.347223281860352, + "learning_rate": 1.5017543859649124e-05, + "loss": 1.0577, + "num_tokens": 604330.0, + "step": 241 + }, + { + "epoch": 0.012225927048600586, + "grad_norm": 9.134928703308105, + "learning_rate": 1.4994152046783627e-05, + "loss": 1.059, + "num_tokens": 607624.0, + "step": 242 + }, + { + "epoch": 0.012276447408305547, + "grad_norm": 7.343535423278809, + "learning_rate": 1.497076023391813e-05, + "loss": 0.9968, + "num_tokens": 610457.0, + "step": 243 + }, + { + "epoch": 0.012326967768010509, + "grad_norm": 7.686361789703369, + "learning_rate": 1.4947368421052632e-05, + "loss": 1.0537, + "num_tokens": 613349.0, + "step": 244 + }, + { + "epoch": 0.01237748812771547, + "grad_norm": 10.147302627563477, + "learning_rate": 1.4923976608187136e-05, + "loss": 1.1463, + "num_tokens": 615486.0, + "step": 245 + }, + { + "epoch": 0.012428008487420431, + "grad_norm": 7.770886421203613, + "learning_rate": 1.4900584795321637e-05, + "loss": 0.8683, + "num_tokens": 618065.0, + "step": 246 + }, + { + "epoch": 0.012478528847125392, + "grad_norm": 10.838753700256348, + "learning_rate": 1.4877192982456141e-05, + "loss": 1.3516, + "num_tokens": 620391.0, + "step": 247 + }, + { + "epoch": 0.012529049206830352, + "grad_norm": 8.523736000061035, + "learning_rate": 1.4853801169590644e-05, + "loss": 1.0872, + "num_tokens": 622601.0, + "step": 248 + }, + { + "epoch": 0.012579569566535314, + "grad_norm": 9.177868843078613, + "learning_rate": 1.4830409356725148e-05, + "loss": 1.2674, + "num_tokens": 624943.0, + "step": 249 + }, + { + "epoch": 0.012630089926240274, + "grad_norm": 8.880724906921387, + "learning_rate": 1.4807017543859649e-05, + "loss": 1.125, + "num_tokens": 627247.0, + "step": 250 + }, + { + "epoch": 0.012680610285945236, + "grad_norm": 10.24360466003418, + "learning_rate": 1.4783625730994153e-05, + "loss": 1.2849, + "num_tokens": 629628.0, + "step": 251 + }, + { + "epoch": 0.012731130645650197, + "grad_norm": 7.977736473083496, + "learning_rate": 1.4760233918128658e-05, + "loss": 0.9452, + "num_tokens": 632057.0, + "step": 252 + }, + { + "epoch": 0.012781651005355159, + "grad_norm": 9.096660614013672, + "learning_rate": 1.4736842105263159e-05, + "loss": 1.1513, + "num_tokens": 634436.0, + "step": 253 + }, + { + "epoch": 0.01283217136506012, + "grad_norm": 8.368144989013672, + "learning_rate": 1.4713450292397661e-05, + "loss": 1.0474, + "num_tokens": 637750.0, + "step": 254 + }, + { + "epoch": 0.01288269172476508, + "grad_norm": 8.667540550231934, + "learning_rate": 1.4690058479532165e-05, + "loss": 1.0991, + "num_tokens": 640104.0, + "step": 255 + }, + { + "epoch": 0.012933212084470042, + "grad_norm": 10.37819766998291, + "learning_rate": 1.4666666666666666e-05, + "loss": 0.9154, + "num_tokens": 641983.0, + "step": 256 + }, + { + "epoch": 0.012983732444175002, + "grad_norm": 9.203189849853516, + "learning_rate": 1.464327485380117e-05, + "loss": 1.0024, + "num_tokens": 644071.0, + "step": 257 + }, + { + "epoch": 0.013034252803879964, + "grad_norm": 8.303580284118652, + "learning_rate": 1.4619883040935675e-05, + "loss": 0.99, + "num_tokens": 646661.0, + "step": 258 + }, + { + "epoch": 0.013084773163584924, + "grad_norm": 7.4197235107421875, + "learning_rate": 1.4596491228070177e-05, + "loss": 1.1536, + "num_tokens": 650009.0, + "step": 259 + }, + { + "epoch": 0.013135293523289886, + "grad_norm": 7.7513556480407715, + "learning_rate": 1.4573099415204678e-05, + "loss": 1.0743, + "num_tokens": 652516.0, + "step": 260 + }, + { + "epoch": 0.013185813882994847, + "grad_norm": 7.888401508331299, + "learning_rate": 1.4549707602339183e-05, + "loss": 0.9673, + "num_tokens": 654979.0, + "step": 261 + }, + { + "epoch": 0.013236334242699809, + "grad_norm": 8.836228370666504, + "learning_rate": 1.4526315789473687e-05, + "loss": 1.1075, + "num_tokens": 657258.0, + "step": 262 + }, + { + "epoch": 0.01328685460240477, + "grad_norm": 10.447460174560547, + "learning_rate": 1.4502923976608188e-05, + "loss": 0.9551, + "num_tokens": 659663.0, + "step": 263 + }, + { + "epoch": 0.01333737496210973, + "grad_norm": 9.966935157775879, + "learning_rate": 1.447953216374269e-05, + "loss": 1.1701, + "num_tokens": 661763.0, + "step": 264 + }, + { + "epoch": 0.013387895321814692, + "grad_norm": 8.052924156188965, + "learning_rate": 1.4456140350877195e-05, + "loss": 0.9388, + "num_tokens": 664268.0, + "step": 265 + }, + { + "epoch": 0.013438415681519652, + "grad_norm": 9.245466232299805, + "learning_rate": 1.4432748538011695e-05, + "loss": 1.2282, + "num_tokens": 667199.0, + "step": 266 + }, + { + "epoch": 0.013488936041224614, + "grad_norm": 13.537809371948242, + "learning_rate": 1.44093567251462e-05, + "loss": 1.034, + "num_tokens": 668810.0, + "step": 267 + }, + { + "epoch": 0.013539456400929574, + "grad_norm": 9.831520080566406, + "learning_rate": 1.4385964912280704e-05, + "loss": 1.2609, + "num_tokens": 670827.0, + "step": 268 + }, + { + "epoch": 0.013589976760634536, + "grad_norm": 8.890970230102539, + "learning_rate": 1.4362573099415207e-05, + "loss": 1.0724, + "num_tokens": 673027.0, + "step": 269 + }, + { + "epoch": 0.013640497120339497, + "grad_norm": 8.956450462341309, + "learning_rate": 1.4339181286549707e-05, + "loss": 1.2304, + "num_tokens": 675128.0, + "step": 270 + }, + { + "epoch": 0.013691017480044457, + "grad_norm": 8.992768287658691, + "learning_rate": 1.4315789473684212e-05, + "loss": 1.0655, + "num_tokens": 677088.0, + "step": 271 + }, + { + "epoch": 0.01374153783974942, + "grad_norm": 9.972984313964844, + "learning_rate": 1.4292397660818716e-05, + "loss": 1.1598, + "num_tokens": 679548.0, + "step": 272 + }, + { + "epoch": 0.01379205819945438, + "grad_norm": 8.878399848937988, + "learning_rate": 1.4269005847953217e-05, + "loss": 1.0564, + "num_tokens": 682451.0, + "step": 273 + }, + { + "epoch": 0.013842578559159342, + "grad_norm": 9.177355766296387, + "learning_rate": 1.4245614035087721e-05, + "loss": 1.2846, + "num_tokens": 685123.0, + "step": 274 + }, + { + "epoch": 0.013893098918864302, + "grad_norm": 7.473504066467285, + "learning_rate": 1.4222222222222224e-05, + "loss": 1.1739, + "num_tokens": 688301.0, + "step": 275 + }, + { + "epoch": 0.013943619278569264, + "grad_norm": 9.052894592285156, + "learning_rate": 1.4198830409356725e-05, + "loss": 1.3004, + "num_tokens": 690916.0, + "step": 276 + }, + { + "epoch": 0.013994139638274224, + "grad_norm": 8.998008728027344, + "learning_rate": 1.4175438596491229e-05, + "loss": 1.3545, + "num_tokens": 693611.0, + "step": 277 + }, + { + "epoch": 0.014044659997979186, + "grad_norm": 11.342598915100098, + "learning_rate": 1.4152046783625733e-05, + "loss": 1.0592, + "num_tokens": 696026.0, + "step": 278 + }, + { + "epoch": 0.014095180357684147, + "grad_norm": 10.178295135498047, + "learning_rate": 1.4128654970760236e-05, + "loss": 1.2543, + "num_tokens": 698453.0, + "step": 279 + }, + { + "epoch": 0.014145700717389107, + "grad_norm": 6.9755778312683105, + "learning_rate": 1.4105263157894738e-05, + "loss": 0.9905, + "num_tokens": 701783.0, + "step": 280 + }, + { + "epoch": 0.01419622107709407, + "grad_norm": 7.98940372467041, + "learning_rate": 1.408187134502924e-05, + "loss": 1.0734, + "num_tokens": 704141.0, + "step": 281 + }, + { + "epoch": 0.01424674143679903, + "grad_norm": 8.097122192382812, + "learning_rate": 1.4058479532163745e-05, + "loss": 1.0673, + "num_tokens": 706525.0, + "step": 282 + }, + { + "epoch": 0.014297261796503992, + "grad_norm": 7.434200763702393, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.1848, + "num_tokens": 709511.0, + "step": 283 + }, + { + "epoch": 0.014347782156208952, + "grad_norm": 8.499911308288574, + "learning_rate": 1.401169590643275e-05, + "loss": 1.1633, + "num_tokens": 712209.0, + "step": 284 + }, + { + "epoch": 0.014398302515913914, + "grad_norm": 7.746021747589111, + "learning_rate": 1.3988304093567253e-05, + "loss": 1.2175, + "num_tokens": 716155.0, + "step": 285 + }, + { + "epoch": 0.014448822875618874, + "grad_norm": 8.352596282958984, + "learning_rate": 1.3964912280701755e-05, + "loss": 1.0825, + "num_tokens": 718392.0, + "step": 286 + }, + { + "epoch": 0.014499343235323835, + "grad_norm": 7.94016695022583, + "learning_rate": 1.3941520467836258e-05, + "loss": 0.9364, + "num_tokens": 720768.0, + "step": 287 + }, + { + "epoch": 0.014549863595028797, + "grad_norm": 9.229229927062988, + "learning_rate": 1.3918128654970762e-05, + "loss": 1.015, + "num_tokens": 723419.0, + "step": 288 + }, + { + "epoch": 0.014600383954733757, + "grad_norm": 9.36227035522461, + "learning_rate": 1.3894736842105265e-05, + "loss": 0.9417, + "num_tokens": 725711.0, + "step": 289 + }, + { + "epoch": 0.01465090431443872, + "grad_norm": 7.728912353515625, + "learning_rate": 1.3871345029239767e-05, + "loss": 0.8151, + "num_tokens": 728204.0, + "step": 290 + }, + { + "epoch": 0.01470142467414368, + "grad_norm": 8.304814338684082, + "learning_rate": 1.384795321637427e-05, + "loss": 0.9078, + "num_tokens": 730697.0, + "step": 291 + }, + { + "epoch": 0.014751945033848642, + "grad_norm": 8.742960929870605, + "learning_rate": 1.3824561403508774e-05, + "loss": 1.2279, + "num_tokens": 733385.0, + "step": 292 + }, + { + "epoch": 0.014802465393553602, + "grad_norm": 9.254317283630371, + "learning_rate": 1.3801169590643275e-05, + "loss": 1.3945, + "num_tokens": 736434.0, + "step": 293 + }, + { + "epoch": 0.014852985753258564, + "grad_norm": 8.595590591430664, + "learning_rate": 1.377777777777778e-05, + "loss": 1.2158, + "num_tokens": 739252.0, + "step": 294 + }, + { + "epoch": 0.014903506112963524, + "grad_norm": 7.337155342102051, + "learning_rate": 1.3754385964912282e-05, + "loss": 1.1091, + "num_tokens": 742271.0, + "step": 295 + }, + { + "epoch": 0.014954026472668485, + "grad_norm": 8.072590827941895, + "learning_rate": 1.3730994152046784e-05, + "loss": 1.0311, + "num_tokens": 744775.0, + "step": 296 + }, + { + "epoch": 0.015004546832373447, + "grad_norm": 8.47529125213623, + "learning_rate": 1.3707602339181287e-05, + "loss": 1.0447, + "num_tokens": 747023.0, + "step": 297 + }, + { + "epoch": 0.015055067192078407, + "grad_norm": 8.814570426940918, + "learning_rate": 1.3684210526315791e-05, + "loss": 1.0041, + "num_tokens": 749212.0, + "step": 298 + }, + { + "epoch": 0.01510558755178337, + "grad_norm": 8.880142211914062, + "learning_rate": 1.3660818713450294e-05, + "loss": 1.1009, + "num_tokens": 751367.0, + "step": 299 + }, + { + "epoch": 0.01515610791148833, + "grad_norm": 7.787696361541748, + "learning_rate": 1.3637426900584796e-05, + "loss": 1.2977, + "num_tokens": 754077.0, + "step": 300 + }, + { + "epoch": 0.015206628271193292, + "grad_norm": 6.868487358093262, + "learning_rate": 1.3614035087719299e-05, + "loss": 0.8695, + "num_tokens": 757044.0, + "step": 301 + }, + { + "epoch": 0.015257148630898252, + "grad_norm": 9.410465240478516, + "learning_rate": 1.3590643274853803e-05, + "loss": 1.3469, + "num_tokens": 759737.0, + "step": 302 + }, + { + "epoch": 0.015307668990603212, + "grad_norm": 8.500602722167969, + "learning_rate": 1.3567251461988304e-05, + "loss": 0.9904, + "num_tokens": 762279.0, + "step": 303 + }, + { + "epoch": 0.015358189350308174, + "grad_norm": 8.725210189819336, + "learning_rate": 1.3543859649122808e-05, + "loss": 1.0427, + "num_tokens": 764268.0, + "step": 304 + }, + { + "epoch": 0.015408709710013135, + "grad_norm": 10.401493072509766, + "learning_rate": 1.3520467836257311e-05, + "loss": 1.4522, + "num_tokens": 766608.0, + "step": 305 + }, + { + "epoch": 0.015459230069718097, + "grad_norm": 7.81357479095459, + "learning_rate": 1.3497076023391814e-05, + "loss": 1.1569, + "num_tokens": 769758.0, + "step": 306 + }, + { + "epoch": 0.015509750429423057, + "grad_norm": 8.852273941040039, + "learning_rate": 1.3473684210526316e-05, + "loss": 1.4822, + "num_tokens": 772480.0, + "step": 307 + }, + { + "epoch": 0.01556027078912802, + "grad_norm": 7.420098304748535, + "learning_rate": 1.345029239766082e-05, + "loss": 1.0324, + "num_tokens": 775211.0, + "step": 308 + }, + { + "epoch": 0.01561079114883298, + "grad_norm": 10.004176139831543, + "learning_rate": 1.3426900584795323e-05, + "loss": 1.1256, + "num_tokens": 777099.0, + "step": 309 + }, + { + "epoch": 0.01566131150853794, + "grad_norm": 11.179290771484375, + "learning_rate": 1.3403508771929826e-05, + "loss": 1.3139, + "num_tokens": 779368.0, + "step": 310 + }, + { + "epoch": 0.0157118318682429, + "grad_norm": 9.197726249694824, + "learning_rate": 1.3380116959064328e-05, + "loss": 1.1256, + "num_tokens": 781475.0, + "step": 311 + }, + { + "epoch": 0.015762352227947862, + "grad_norm": 9.446526527404785, + "learning_rate": 1.3356725146198832e-05, + "loss": 1.1283, + "num_tokens": 783468.0, + "step": 312 + }, + { + "epoch": 0.015812872587652824, + "grad_norm": 9.767399787902832, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.3126, + "num_tokens": 785849.0, + "step": 313 + }, + { + "epoch": 0.015863392947357786, + "grad_norm": 6.978866100311279, + "learning_rate": 1.3309941520467838e-05, + "loss": 1.1601, + "num_tokens": 788707.0, + "step": 314 + }, + { + "epoch": 0.015913913307062745, + "grad_norm": 8.648423194885254, + "learning_rate": 1.328654970760234e-05, + "loss": 1.0145, + "num_tokens": 790713.0, + "step": 315 + }, + { + "epoch": 0.015964433666767707, + "grad_norm": 8.532211303710938, + "learning_rate": 1.3263157894736843e-05, + "loss": 1.1338, + "num_tokens": 792999.0, + "step": 316 + }, + { + "epoch": 0.01601495402647267, + "grad_norm": 8.301265716552734, + "learning_rate": 1.3239766081871345e-05, + "loss": 1.0289, + "num_tokens": 795309.0, + "step": 317 + }, + { + "epoch": 0.01606547438617763, + "grad_norm": 7.21840763092041, + "learning_rate": 1.321637426900585e-05, + "loss": 1.002, + "num_tokens": 797991.0, + "step": 318 + }, + { + "epoch": 0.01611599474588259, + "grad_norm": 8.049905776977539, + "learning_rate": 1.3192982456140354e-05, + "loss": 1.1429, + "num_tokens": 800413.0, + "step": 319 + }, + { + "epoch": 0.016166515105587552, + "grad_norm": 9.546985626220703, + "learning_rate": 1.3169590643274855e-05, + "loss": 1.2939, + "num_tokens": 802160.0, + "step": 320 + }, + { + "epoch": 0.016217035465292514, + "grad_norm": 8.670045852661133, + "learning_rate": 1.3146198830409357e-05, + "loss": 1.0304, + "num_tokens": 803945.0, + "step": 321 + }, + { + "epoch": 0.016267555824997473, + "grad_norm": 8.127334594726562, + "learning_rate": 1.3122807017543862e-05, + "loss": 0.9769, + "num_tokens": 806400.0, + "step": 322 + }, + { + "epoch": 0.016318076184702435, + "grad_norm": 8.263005256652832, + "learning_rate": 1.3099415204678362e-05, + "loss": 1.019, + "num_tokens": 808573.0, + "step": 323 + }, + { + "epoch": 0.016368596544407397, + "grad_norm": 10.196557998657227, + "learning_rate": 1.3076023391812867e-05, + "loss": 1.3352, + "num_tokens": 810631.0, + "step": 324 + }, + { + "epoch": 0.01641911690411236, + "grad_norm": 8.776551246643066, + "learning_rate": 1.305263157894737e-05, + "loss": 1.0292, + "num_tokens": 812547.0, + "step": 325 + }, + { + "epoch": 0.016469637263817317, + "grad_norm": 7.490231037139893, + "learning_rate": 1.3029239766081872e-05, + "loss": 1.0104, + "num_tokens": 815079.0, + "step": 326 + }, + { + "epoch": 0.01652015762352228, + "grad_norm": 9.417389869689941, + "learning_rate": 1.3005847953216374e-05, + "loss": 1.0875, + "num_tokens": 817074.0, + "step": 327 + }, + { + "epoch": 0.01657067798322724, + "grad_norm": 7.856191158294678, + "learning_rate": 1.2982456140350879e-05, + "loss": 1.1308, + "num_tokens": 819686.0, + "step": 328 + }, + { + "epoch": 0.0166211983429322, + "grad_norm": 7.3317155838012695, + "learning_rate": 1.2959064327485383e-05, + "loss": 1.0251, + "num_tokens": 822279.0, + "step": 329 + }, + { + "epoch": 0.016671718702637162, + "grad_norm": 8.909266471862793, + "learning_rate": 1.2935672514619884e-05, + "loss": 1.0848, + "num_tokens": 824715.0, + "step": 330 + }, + { + "epoch": 0.016722239062342124, + "grad_norm": 9.398529052734375, + "learning_rate": 1.2912280701754386e-05, + "loss": 0.9414, + "num_tokens": 826719.0, + "step": 331 + }, + { + "epoch": 0.016772759422047086, + "grad_norm": 7.581336498260498, + "learning_rate": 1.288888888888889e-05, + "loss": 1.0375, + "num_tokens": 829501.0, + "step": 332 + }, + { + "epoch": 0.016823279781752045, + "grad_norm": 8.365718841552734, + "learning_rate": 1.2865497076023392e-05, + "loss": 0.9673, + "num_tokens": 831639.0, + "step": 333 + }, + { + "epoch": 0.016873800141457007, + "grad_norm": 8.144573211669922, + "learning_rate": 1.2842105263157896e-05, + "loss": 0.9441, + "num_tokens": 833957.0, + "step": 334 + }, + { + "epoch": 0.01692432050116197, + "grad_norm": 8.76260757446289, + "learning_rate": 1.28187134502924e-05, + "loss": 1.1034, + "num_tokens": 836125.0, + "step": 335 + }, + { + "epoch": 0.016974840860866928, + "grad_norm": 7.845893859863281, + "learning_rate": 1.2795321637426901e-05, + "loss": 1.0519, + "num_tokens": 839060.0, + "step": 336 + }, + { + "epoch": 0.01702536122057189, + "grad_norm": 7.806420803070068, + "learning_rate": 1.2771929824561404e-05, + "loss": 0.9888, + "num_tokens": 841832.0, + "step": 337 + }, + { + "epoch": 0.017075881580276852, + "grad_norm": 8.474066734313965, + "learning_rate": 1.2748538011695908e-05, + "loss": 1.188, + "num_tokens": 844354.0, + "step": 338 + }, + { + "epoch": 0.017126401939981814, + "grad_norm": 7.281275749206543, + "learning_rate": 1.2725146198830412e-05, + "loss": 1.1884, + "num_tokens": 847878.0, + "step": 339 + }, + { + "epoch": 0.017176922299686773, + "grad_norm": 8.520792961120605, + "learning_rate": 1.2701754385964913e-05, + "loss": 1.2011, + "num_tokens": 850615.0, + "step": 340 + }, + { + "epoch": 0.017227442659391735, + "grad_norm": 7.7677459716796875, + "learning_rate": 1.2678362573099417e-05, + "loss": 1.1719, + "num_tokens": 853460.0, + "step": 341 + }, + { + "epoch": 0.017277963019096697, + "grad_norm": 10.753673553466797, + "learning_rate": 1.265497076023392e-05, + "loss": 1.3672, + "num_tokens": 855443.0, + "step": 342 + }, + { + "epoch": 0.017328483378801655, + "grad_norm": 7.999899864196777, + "learning_rate": 1.263157894736842e-05, + "loss": 1.0013, + "num_tokens": 857902.0, + "step": 343 + }, + { + "epoch": 0.017379003738506617, + "grad_norm": 7.563441753387451, + "learning_rate": 1.2608187134502925e-05, + "loss": 0.9678, + "num_tokens": 860456.0, + "step": 344 + }, + { + "epoch": 0.01742952409821158, + "grad_norm": 6.544392108917236, + "learning_rate": 1.258479532163743e-05, + "loss": 1.0179, + "num_tokens": 863658.0, + "step": 345 + }, + { + "epoch": 0.01748004445791654, + "grad_norm": 7.35209846496582, + "learning_rate": 1.256140350877193e-05, + "loss": 1.015, + "num_tokens": 866356.0, + "step": 346 + }, + { + "epoch": 0.0175305648176215, + "grad_norm": 9.063884735107422, + "learning_rate": 1.2538011695906434e-05, + "loss": 1.2172, + "num_tokens": 868721.0, + "step": 347 + }, + { + "epoch": 0.017581085177326462, + "grad_norm": 7.650192737579346, + "learning_rate": 1.2514619883040937e-05, + "loss": 1.0134, + "num_tokens": 871201.0, + "step": 348 + }, + { + "epoch": 0.017631605537031424, + "grad_norm": 7.4320573806762695, + "learning_rate": 1.2491228070175441e-05, + "loss": 1.3226, + "num_tokens": 874534.0, + "step": 349 + }, + { + "epoch": 0.017682125896736386, + "grad_norm": 8.409152030944824, + "learning_rate": 1.2467836257309942e-05, + "loss": 1.2582, + "num_tokens": 877082.0, + "step": 350 + }, + { + "epoch": 0.017732646256441345, + "grad_norm": 7.047739505767822, + "learning_rate": 1.2444444444444446e-05, + "loss": 1.1153, + "num_tokens": 880424.0, + "step": 351 + }, + { + "epoch": 0.017783166616146307, + "grad_norm": 8.717232704162598, + "learning_rate": 1.2421052631578949e-05, + "loss": 1.2539, + "num_tokens": 882616.0, + "step": 352 + }, + { + "epoch": 0.01783368697585127, + "grad_norm": 7.533171653747559, + "learning_rate": 1.239766081871345e-05, + "loss": 1.128, + "num_tokens": 885254.0, + "step": 353 + }, + { + "epoch": 0.017884207335556228, + "grad_norm": 8.926271438598633, + "learning_rate": 1.2374269005847954e-05, + "loss": 1.2362, + "num_tokens": 887316.0, + "step": 354 + }, + { + "epoch": 0.01793472769526119, + "grad_norm": 8.123795509338379, + "learning_rate": 1.2350877192982458e-05, + "loss": 1.2223, + "num_tokens": 890430.0, + "step": 355 + }, + { + "epoch": 0.017985248054966152, + "grad_norm": 8.804718971252441, + "learning_rate": 1.232748538011696e-05, + "loss": 1.3381, + "num_tokens": 892954.0, + "step": 356 + }, + { + "epoch": 0.018035768414671114, + "grad_norm": 7.522372722625732, + "learning_rate": 1.2304093567251463e-05, + "loss": 1.1414, + "num_tokens": 895507.0, + "step": 357 + }, + { + "epoch": 0.018086288774376073, + "grad_norm": 6.742160320281982, + "learning_rate": 1.2280701754385966e-05, + "loss": 1.0115, + "num_tokens": 898702.0, + "step": 358 + }, + { + "epoch": 0.018136809134081035, + "grad_norm": 6.936591148376465, + "learning_rate": 1.225730994152047e-05, + "loss": 1.1419, + "num_tokens": 902003.0, + "step": 359 + }, + { + "epoch": 0.018187329493785997, + "grad_norm": 6.606788635253906, + "learning_rate": 1.2233918128654971e-05, + "loss": 1.0636, + "num_tokens": 905330.0, + "step": 360 + }, + { + "epoch": 0.018237849853490955, + "grad_norm": 7.769503116607666, + "learning_rate": 1.2210526315789475e-05, + "loss": 1.2042, + "num_tokens": 908077.0, + "step": 361 + }, + { + "epoch": 0.018288370213195917, + "grad_norm": 7.975001335144043, + "learning_rate": 1.2187134502923978e-05, + "loss": 1.3051, + "num_tokens": 911302.0, + "step": 362 + }, + { + "epoch": 0.01833889057290088, + "grad_norm": 7.281492233276367, + "learning_rate": 1.216374269005848e-05, + "loss": 1.0398, + "num_tokens": 914640.0, + "step": 363 + }, + { + "epoch": 0.01838941093260584, + "grad_norm": 9.83873462677002, + "learning_rate": 1.2140350877192983e-05, + "loss": 1.2896, + "num_tokens": 916941.0, + "step": 364 + }, + { + "epoch": 0.0184399312923108, + "grad_norm": 9.743363380432129, + "learning_rate": 1.2116959064327487e-05, + "loss": 1.0493, + "num_tokens": 918873.0, + "step": 365 + }, + { + "epoch": 0.018490451652015762, + "grad_norm": 9.33188247680664, + "learning_rate": 1.2093567251461988e-05, + "loss": 1.1965, + "num_tokens": 920823.0, + "step": 366 + }, + { + "epoch": 0.018540972011720724, + "grad_norm": 8.53341293334961, + "learning_rate": 1.2070175438596493e-05, + "loss": 0.963, + "num_tokens": 923328.0, + "step": 367 + }, + { + "epoch": 0.018591492371425683, + "grad_norm": 8.234481811523438, + "learning_rate": 1.2046783625730995e-05, + "loss": 1.1179, + "num_tokens": 925677.0, + "step": 368 + }, + { + "epoch": 0.018642012731130645, + "grad_norm": 6.606385707855225, + "learning_rate": 1.20233918128655e-05, + "loss": 1.1202, + "num_tokens": 928589.0, + "step": 369 + }, + { + "epoch": 0.018692533090835607, + "grad_norm": 7.004721164703369, + "learning_rate": 1.2e-05, + "loss": 0.96, + "num_tokens": 931465.0, + "step": 370 + }, + { + "epoch": 0.01874305345054057, + "grad_norm": 7.241075038909912, + "learning_rate": 1.1976608187134505e-05, + "loss": 0.9406, + "num_tokens": 933853.0, + "step": 371 + }, + { + "epoch": 0.018793573810245528, + "grad_norm": 7.583524227142334, + "learning_rate": 1.1953216374269007e-05, + "loss": 1.0568, + "num_tokens": 936723.0, + "step": 372 + }, + { + "epoch": 0.01884409416995049, + "grad_norm": 7.991731643676758, + "learning_rate": 1.192982456140351e-05, + "loss": 1.2063, + "num_tokens": 939635.0, + "step": 373 + }, + { + "epoch": 0.018894614529655452, + "grad_norm": 8.607834815979004, + "learning_rate": 1.1906432748538012e-05, + "loss": 1.2751, + "num_tokens": 942473.0, + "step": 374 + }, + { + "epoch": 0.01894513488936041, + "grad_norm": 10.59875202178955, + "learning_rate": 1.1883040935672517e-05, + "loss": 0.9887, + "num_tokens": 944084.0, + "step": 375 + }, + { + "epoch": 0.018995655249065373, + "grad_norm": 7.635941028594971, + "learning_rate": 1.1859649122807017e-05, + "loss": 0.9936, + "num_tokens": 946515.0, + "step": 376 + }, + { + "epoch": 0.019046175608770335, + "grad_norm": 7.352439880371094, + "learning_rate": 1.1836257309941522e-05, + "loss": 1.1586, + "num_tokens": 949799.0, + "step": 377 + }, + { + "epoch": 0.019096695968475297, + "grad_norm": 7.889686584472656, + "learning_rate": 1.1812865497076024e-05, + "loss": 0.9715, + "num_tokens": 952200.0, + "step": 378 + }, + { + "epoch": 0.019147216328180255, + "grad_norm": 11.504788398742676, + "learning_rate": 1.1789473684210527e-05, + "loss": 1.3998, + "num_tokens": 953920.0, + "step": 379 + }, + { + "epoch": 0.019197736687885218, + "grad_norm": 9.799585342407227, + "learning_rate": 1.176608187134503e-05, + "loss": 1.2399, + "num_tokens": 955705.0, + "step": 380 + }, + { + "epoch": 0.01924825704759018, + "grad_norm": 9.085797309875488, + "learning_rate": 1.1742690058479534e-05, + "loss": 1.1873, + "num_tokens": 957966.0, + "step": 381 + }, + { + "epoch": 0.01929877740729514, + "grad_norm": 7.8672637939453125, + "learning_rate": 1.1719298245614036e-05, + "loss": 1.0582, + "num_tokens": 960973.0, + "step": 382 + }, + { + "epoch": 0.0193492977670001, + "grad_norm": 8.567300796508789, + "learning_rate": 1.1695906432748539e-05, + "loss": 1.017, + "num_tokens": 963151.0, + "step": 383 + }, + { + "epoch": 0.019399818126705062, + "grad_norm": 8.648608207702637, + "learning_rate": 1.1672514619883041e-05, + "loss": 1.2714, + "num_tokens": 965440.0, + "step": 384 + }, + { + "epoch": 0.019450338486410024, + "grad_norm": 8.253742218017578, + "learning_rate": 1.1649122807017546e-05, + "loss": 0.9781, + "num_tokens": 967745.0, + "step": 385 + }, + { + "epoch": 0.019500858846114983, + "grad_norm": 8.201629638671875, + "learning_rate": 1.1625730994152047e-05, + "loss": 0.9929, + "num_tokens": 970029.0, + "step": 386 + }, + { + "epoch": 0.019551379205819945, + "grad_norm": 7.4650678634643555, + "learning_rate": 1.160233918128655e-05, + "loss": 0.9734, + "num_tokens": 972719.0, + "step": 387 + }, + { + "epoch": 0.019601899565524907, + "grad_norm": 8.536724090576172, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.9804, + "num_tokens": 974717.0, + "step": 388 + }, + { + "epoch": 0.01965241992522987, + "grad_norm": 9.41419506072998, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.2392, + "num_tokens": 976677.0, + "step": 389 + }, + { + "epoch": 0.019702940284934828, + "grad_norm": 7.688607215881348, + "learning_rate": 1.1532163742690059e-05, + "loss": 1.1475, + "num_tokens": 979754.0, + "step": 390 + }, + { + "epoch": 0.01975346064463979, + "grad_norm": 10.458429336547852, + "learning_rate": 1.1508771929824563e-05, + "loss": 1.4729, + "num_tokens": 981608.0, + "step": 391 + }, + { + "epoch": 0.019803981004344752, + "grad_norm": 8.165546417236328, + "learning_rate": 1.1485380116959065e-05, + "loss": 1.2997, + "num_tokens": 983995.0, + "step": 392 + }, + { + "epoch": 0.01985450136404971, + "grad_norm": 10.701775550842285, + "learning_rate": 1.1461988304093568e-05, + "loss": 1.2167, + "num_tokens": 985529.0, + "step": 393 + }, + { + "epoch": 0.019905021723754673, + "grad_norm": 8.193254470825195, + "learning_rate": 1.143859649122807e-05, + "loss": 1.0912, + "num_tokens": 987703.0, + "step": 394 + }, + { + "epoch": 0.019955542083459635, + "grad_norm": 7.695500373840332, + "learning_rate": 1.1415204678362575e-05, + "loss": 1.1996, + "num_tokens": 990391.0, + "step": 395 + }, + { + "epoch": 0.020006062443164597, + "grad_norm": 8.260912895202637, + "learning_rate": 1.1391812865497076e-05, + "loss": 1.2251, + "num_tokens": 992640.0, + "step": 396 + }, + { + "epoch": 0.020056582802869555, + "grad_norm": 8.934926986694336, + "learning_rate": 1.136842105263158e-05, + "loss": 1.2343, + "num_tokens": 994867.0, + "step": 397 + }, + { + "epoch": 0.020107103162574518, + "grad_norm": 7.018784999847412, + "learning_rate": 1.1345029239766083e-05, + "loss": 0.9593, + "num_tokens": 997380.0, + "step": 398 + }, + { + "epoch": 0.02015762352227948, + "grad_norm": 8.981276512145996, + "learning_rate": 1.1321637426900585e-05, + "loss": 1.3894, + "num_tokens": 999790.0, + "step": 399 + }, + { + "epoch": 0.020208143881984438, + "grad_norm": 8.494843482971191, + "learning_rate": 1.1298245614035088e-05, + "loss": 0.9347, + "num_tokens": 1001607.0, + "step": 400 + }, + { + "epoch": 0.0202586642416894, + "grad_norm": 7.798030376434326, + "learning_rate": 1.1274853801169592e-05, + "loss": 1.0908, + "num_tokens": 1004137.0, + "step": 401 + }, + { + "epoch": 0.020309184601394362, + "grad_norm": 8.13697338104248, + "learning_rate": 1.1251461988304096e-05, + "loss": 1.0598, + "num_tokens": 1006575.0, + "step": 402 + }, + { + "epoch": 0.020359704961099324, + "grad_norm": 8.29926872253418, + "learning_rate": 1.1228070175438597e-05, + "loss": 0.9779, + "num_tokens": 1009103.0, + "step": 403 + }, + { + "epoch": 0.020410225320804283, + "grad_norm": 8.01266860961914, + "learning_rate": 1.12046783625731e-05, + "loss": 1.1942, + "num_tokens": 1011814.0, + "step": 404 + }, + { + "epoch": 0.020460745680509245, + "grad_norm": 6.485651969909668, + "learning_rate": 1.1181286549707604e-05, + "loss": 0.9993, + "num_tokens": 1014836.0, + "step": 405 + }, + { + "epoch": 0.020511266040214207, + "grad_norm": 7.6060919761657715, + "learning_rate": 1.1157894736842105e-05, + "loss": 1.019, + "num_tokens": 1017449.0, + "step": 406 + }, + { + "epoch": 0.020561786399919166, + "grad_norm": 8.32107925415039, + "learning_rate": 1.1134502923976609e-05, + "loss": 0.9683, + "num_tokens": 1020367.0, + "step": 407 + }, + { + "epoch": 0.020612306759624128, + "grad_norm": 7.491749286651611, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.0098, + "num_tokens": 1023165.0, + "step": 408 + }, + { + "epoch": 0.02066282711932909, + "grad_norm": 8.37585163116455, + "learning_rate": 1.1087719298245614e-05, + "loss": 1.2241, + "num_tokens": 1025467.0, + "step": 409 + }, + { + "epoch": 0.020713347479034052, + "grad_norm": 7.744221210479736, + "learning_rate": 1.1064327485380117e-05, + "loss": 1.1959, + "num_tokens": 1028578.0, + "step": 410 + }, + { + "epoch": 0.02076386783873901, + "grad_norm": 9.227510452270508, + "learning_rate": 1.1040935672514621e-05, + "loss": 1.0328, + "num_tokens": 1030871.0, + "step": 411 + }, + { + "epoch": 0.020814388198443973, + "grad_norm": 9.742758750915527, + "learning_rate": 1.1017543859649125e-05, + "loss": 1.0525, + "num_tokens": 1032646.0, + "step": 412 + }, + { + "epoch": 0.020864908558148935, + "grad_norm": 7.9829840660095215, + "learning_rate": 1.0994152046783626e-05, + "loss": 1.0963, + "num_tokens": 1035374.0, + "step": 413 + }, + { + "epoch": 0.020915428917853897, + "grad_norm": 8.80949878692627, + "learning_rate": 1.0970760233918129e-05, + "loss": 1.0312, + "num_tokens": 1037922.0, + "step": 414 + }, + { + "epoch": 0.020965949277558855, + "grad_norm": 7.700923442840576, + "learning_rate": 1.0947368421052633e-05, + "loss": 0.8095, + "num_tokens": 1040004.0, + "step": 415 + }, + { + "epoch": 0.021016469637263818, + "grad_norm": 9.332331657409668, + "learning_rate": 1.0923976608187134e-05, + "loss": 1.4346, + "num_tokens": 1042146.0, + "step": 416 + }, + { + "epoch": 0.02106698999696878, + "grad_norm": 11.888642311096191, + "learning_rate": 1.0900584795321638e-05, + "loss": 1.5693, + "num_tokens": 1043957.0, + "step": 417 + }, + { + "epoch": 0.021117510356673738, + "grad_norm": 6.9829864501953125, + "learning_rate": 1.0877192982456142e-05, + "loss": 1.139, + "num_tokens": 1047167.0, + "step": 418 + }, + { + "epoch": 0.0211680307163787, + "grad_norm": 7.229538440704346, + "learning_rate": 1.0853801169590643e-05, + "loss": 1.0326, + "num_tokens": 1050236.0, + "step": 419 + }, + { + "epoch": 0.021218551076083662, + "grad_norm": 7.659509181976318, + "learning_rate": 1.0830409356725146e-05, + "loss": 0.9678, + "num_tokens": 1053164.0, + "step": 420 + }, + { + "epoch": 0.021269071435788624, + "grad_norm": 7.526303768157959, + "learning_rate": 1.080701754385965e-05, + "loss": 1.2983, + "num_tokens": 1056041.0, + "step": 421 + }, + { + "epoch": 0.021319591795493583, + "grad_norm": 7.517195701599121, + "learning_rate": 1.0783625730994154e-05, + "loss": 0.953, + "num_tokens": 1058381.0, + "step": 422 + }, + { + "epoch": 0.021370112155198545, + "grad_norm": 7.999603748321533, + "learning_rate": 1.0760233918128655e-05, + "loss": 1.1828, + "num_tokens": 1060829.0, + "step": 423 + }, + { + "epoch": 0.021420632514903507, + "grad_norm": 7.091024398803711, + "learning_rate": 1.073684210526316e-05, + "loss": 1.0284, + "num_tokens": 1063438.0, + "step": 424 + }, + { + "epoch": 0.021471152874608466, + "grad_norm": 8.865612983703613, + "learning_rate": 1.0713450292397662e-05, + "loss": 1.1087, + "num_tokens": 1065643.0, + "step": 425 + }, + { + "epoch": 0.021521673234313428, + "grad_norm": 8.065427780151367, + "learning_rate": 1.0690058479532163e-05, + "loss": 1.1077, + "num_tokens": 1068061.0, + "step": 426 + }, + { + "epoch": 0.02157219359401839, + "grad_norm": 7.308866024017334, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.9851, + "num_tokens": 1070596.0, + "step": 427 + }, + { + "epoch": 0.021622713953723352, + "grad_norm": 9.514137268066406, + "learning_rate": 1.0643274853801172e-05, + "loss": 1.4733, + "num_tokens": 1073045.0, + "step": 428 + }, + { + "epoch": 0.02167323431342831, + "grad_norm": 8.035856246948242, + "learning_rate": 1.0619883040935672e-05, + "loss": 1.0206, + "num_tokens": 1075544.0, + "step": 429 + }, + { + "epoch": 0.021723754673133273, + "grad_norm": 7.081408977508545, + "learning_rate": 1.0596491228070177e-05, + "loss": 0.9567, + "num_tokens": 1078209.0, + "step": 430 + }, + { + "epoch": 0.021774275032838235, + "grad_norm": 8.047063827514648, + "learning_rate": 1.057309941520468e-05, + "loss": 1.2457, + "num_tokens": 1080905.0, + "step": 431 + }, + { + "epoch": 0.021824795392543193, + "grad_norm": 9.152252197265625, + "learning_rate": 1.0549707602339184e-05, + "loss": 0.9964, + "num_tokens": 1082821.0, + "step": 432 + }, + { + "epoch": 0.021875315752248155, + "grad_norm": 7.406858921051025, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.1027, + "num_tokens": 1085297.0, + "step": 433 + }, + { + "epoch": 0.021925836111953118, + "grad_norm": 7.638632297515869, + "learning_rate": 1.0502923976608189e-05, + "loss": 0.9874, + "num_tokens": 1087575.0, + "step": 434 + }, + { + "epoch": 0.02197635647165808, + "grad_norm": 8.230912208557129, + "learning_rate": 1.0479532163742691e-05, + "loss": 0.9357, + "num_tokens": 1089977.0, + "step": 435 + }, + { + "epoch": 0.022026876831363038, + "grad_norm": 8.287261962890625, + "learning_rate": 1.0456140350877194e-05, + "loss": 1.0334, + "num_tokens": 1092045.0, + "step": 436 + }, + { + "epoch": 0.022077397191068, + "grad_norm": 7.4078240394592285, + "learning_rate": 1.0432748538011696e-05, + "loss": 1.1433, + "num_tokens": 1094989.0, + "step": 437 + }, + { + "epoch": 0.022127917550772962, + "grad_norm": 9.254385948181152, + "learning_rate": 1.04093567251462e-05, + "loss": 1.1471, + "num_tokens": 1097145.0, + "step": 438 + }, + { + "epoch": 0.02217843791047792, + "grad_norm": 8.367359161376953, + "learning_rate": 1.0385964912280702e-05, + "loss": 1.1309, + "num_tokens": 1099656.0, + "step": 439 + }, + { + "epoch": 0.022228958270182883, + "grad_norm": 7.356039047241211, + "learning_rate": 1.0362573099415206e-05, + "loss": 1.0829, + "num_tokens": 1102322.0, + "step": 440 + }, + { + "epoch": 0.022279478629887845, + "grad_norm": 8.84168815612793, + "learning_rate": 1.0339181286549708e-05, + "loss": 1.4603, + "num_tokens": 1105265.0, + "step": 441 + }, + { + "epoch": 0.022329998989592807, + "grad_norm": 8.177007675170898, + "learning_rate": 1.0315789473684213e-05, + "loss": 1.117, + "num_tokens": 1108225.0, + "step": 442 + }, + { + "epoch": 0.022380519349297766, + "grad_norm": 9.334571838378906, + "learning_rate": 1.0292397660818714e-05, + "loss": 1.0477, + "num_tokens": 1110616.0, + "step": 443 + }, + { + "epoch": 0.022431039709002728, + "grad_norm": 9.319694519042969, + "learning_rate": 1.0269005847953218e-05, + "loss": 1.1547, + "num_tokens": 1112610.0, + "step": 444 + }, + { + "epoch": 0.02248156006870769, + "grad_norm": 7.869724273681641, + "learning_rate": 1.024561403508772e-05, + "loss": 1.1421, + "num_tokens": 1115185.0, + "step": 445 + }, + { + "epoch": 0.02253208042841265, + "grad_norm": 8.06291675567627, + "learning_rate": 1.0222222222222223e-05, + "loss": 0.9336, + "num_tokens": 1117691.0, + "step": 446 + }, + { + "epoch": 0.02258260078811761, + "grad_norm": 7.627496242523193, + "learning_rate": 1.0198830409356726e-05, + "loss": 1.1265, + "num_tokens": 1120130.0, + "step": 447 + }, + { + "epoch": 0.022633121147822573, + "grad_norm": 9.72512149810791, + "learning_rate": 1.017543859649123e-05, + "loss": 0.961, + "num_tokens": 1122262.0, + "step": 448 + }, + { + "epoch": 0.022683641507527535, + "grad_norm": 9.196995735168457, + "learning_rate": 1.015204678362573e-05, + "loss": 1.0798, + "num_tokens": 1124218.0, + "step": 449 + }, + { + "epoch": 0.022734161867232493, + "grad_norm": 7.037284851074219, + "learning_rate": 1.0128654970760235e-05, + "loss": 0.9626, + "num_tokens": 1127634.0, + "step": 450 + }, + { + "epoch": 0.022784682226937455, + "grad_norm": 7.932760715484619, + "learning_rate": 1.0105263157894738e-05, + "loss": 0.9147, + "num_tokens": 1129893.0, + "step": 451 + }, + { + "epoch": 0.022835202586642418, + "grad_norm": 7.850600242614746, + "learning_rate": 1.0081871345029242e-05, + "loss": 1.0856, + "num_tokens": 1133074.0, + "step": 452 + }, + { + "epoch": 0.02288572294634738, + "grad_norm": 10.015412330627441, + "learning_rate": 1.0058479532163743e-05, + "loss": 1.0976, + "num_tokens": 1135152.0, + "step": 453 + }, + { + "epoch": 0.022936243306052338, + "grad_norm": 8.20608139038086, + "learning_rate": 1.0035087719298247e-05, + "loss": 0.9868, + "num_tokens": 1137122.0, + "step": 454 + }, + { + "epoch": 0.0229867636657573, + "grad_norm": 8.830216407775879, + "learning_rate": 1.001169590643275e-05, + "loss": 1.0508, + "num_tokens": 1139194.0, + "step": 455 + }, + { + "epoch": 0.023037284025462262, + "grad_norm": 9.603080749511719, + "learning_rate": 9.988304093567252e-06, + "loss": 1.2128, + "num_tokens": 1141742.0, + "step": 456 + }, + { + "epoch": 0.02308780438516722, + "grad_norm": 7.51000452041626, + "learning_rate": 9.964912280701755e-06, + "loss": 1.0694, + "num_tokens": 1145200.0, + "step": 457 + }, + { + "epoch": 0.023138324744872183, + "grad_norm": 7.955310821533203, + "learning_rate": 9.941520467836257e-06, + "loss": 1.2182, + "num_tokens": 1148031.0, + "step": 458 + }, + { + "epoch": 0.023188845104577145, + "grad_norm": 6.96146297454834, + "learning_rate": 9.918128654970762e-06, + "loss": 1.0793, + "num_tokens": 1151331.0, + "step": 459 + }, + { + "epoch": 0.023239365464282107, + "grad_norm": 8.107987403869629, + "learning_rate": 9.894736842105264e-06, + "loss": 1.1551, + "num_tokens": 1153802.0, + "step": 460 + }, + { + "epoch": 0.023289885823987066, + "grad_norm": 8.180355072021484, + "learning_rate": 9.871345029239767e-06, + "loss": 1.1155, + "num_tokens": 1156889.0, + "step": 461 + }, + { + "epoch": 0.023340406183692028, + "grad_norm": 8.641302108764648, + "learning_rate": 9.84795321637427e-06, + "loss": 1.2854, + "num_tokens": 1160129.0, + "step": 462 + }, + { + "epoch": 0.02339092654339699, + "grad_norm": 7.053234577178955, + "learning_rate": 9.824561403508772e-06, + "loss": 0.986, + "num_tokens": 1162714.0, + "step": 463 + }, + { + "epoch": 0.02344144690310195, + "grad_norm": 8.116785049438477, + "learning_rate": 9.801169590643276e-06, + "loss": 1.1684, + "num_tokens": 1165233.0, + "step": 464 + }, + { + "epoch": 0.02349196726280691, + "grad_norm": 8.327916145324707, + "learning_rate": 9.777777777777779e-06, + "loss": 1.2715, + "num_tokens": 1167937.0, + "step": 465 + }, + { + "epoch": 0.023542487622511873, + "grad_norm": 7.312363147735596, + "learning_rate": 9.754385964912281e-06, + "loss": 0.9305, + "num_tokens": 1170382.0, + "step": 466 + }, + { + "epoch": 0.023593007982216835, + "grad_norm": 7.678922176361084, + "learning_rate": 9.730994152046784e-06, + "loss": 0.9546, + "num_tokens": 1172700.0, + "step": 467 + }, + { + "epoch": 0.023643528341921793, + "grad_norm": 12.352724075317383, + "learning_rate": 9.707602339181286e-06, + "loss": 1.0142, + "num_tokens": 1173997.0, + "step": 468 + }, + { + "epoch": 0.023694048701626755, + "grad_norm": 7.131642818450928, + "learning_rate": 9.68421052631579e-06, + "loss": 1.0355, + "num_tokens": 1176818.0, + "step": 469 + }, + { + "epoch": 0.023744569061331718, + "grad_norm": 7.444904327392578, + "learning_rate": 9.660818713450293e-06, + "loss": 1.0349, + "num_tokens": 1179211.0, + "step": 470 + }, + { + "epoch": 0.023795089421036676, + "grad_norm": 8.650837898254395, + "learning_rate": 9.637426900584796e-06, + "loss": 1.011, + "num_tokens": 1181334.0, + "step": 471 + }, + { + "epoch": 0.023845609780741638, + "grad_norm": 8.991205215454102, + "learning_rate": 9.614035087719298e-06, + "loss": 1.4397, + "num_tokens": 1183852.0, + "step": 472 + }, + { + "epoch": 0.0238961301404466, + "grad_norm": 7.23414945602417, + "learning_rate": 9.590643274853801e-06, + "loss": 1.1899, + "num_tokens": 1186745.0, + "step": 473 + }, + { + "epoch": 0.023946650500151562, + "grad_norm": 8.069503784179688, + "learning_rate": 9.567251461988305e-06, + "loss": 0.9644, + "num_tokens": 1189053.0, + "step": 474 + }, + { + "epoch": 0.02399717085985652, + "grad_norm": 9.354156494140625, + "learning_rate": 9.543859649122808e-06, + "loss": 1.0726, + "num_tokens": 1191011.0, + "step": 475 + }, + { + "epoch": 0.024047691219561483, + "grad_norm": 7.026230812072754, + "learning_rate": 9.52046783625731e-06, + "loss": 0.9736, + "num_tokens": 1194139.0, + "step": 476 + }, + { + "epoch": 0.024098211579266445, + "grad_norm": 9.671537399291992, + "learning_rate": 9.497076023391813e-06, + "loss": 1.0656, + "num_tokens": 1196209.0, + "step": 477 + }, + { + "epoch": 0.024148731938971404, + "grad_norm": 8.742350578308105, + "learning_rate": 9.473684210526315e-06, + "loss": 0.9295, + "num_tokens": 1198415.0, + "step": 478 + }, + { + "epoch": 0.024199252298676366, + "grad_norm": 9.21944522857666, + "learning_rate": 9.45029239766082e-06, + "loss": 1.4303, + "num_tokens": 1200642.0, + "step": 479 + }, + { + "epoch": 0.024249772658381328, + "grad_norm": 8.941781997680664, + "learning_rate": 9.426900584795322e-06, + "loss": 1.0785, + "num_tokens": 1202815.0, + "step": 480 + }, + { + "epoch": 0.02430029301808629, + "grad_norm": 7.975043773651123, + "learning_rate": 9.403508771929825e-06, + "loss": 1.2159, + "num_tokens": 1205206.0, + "step": 481 + }, + { + "epoch": 0.02435081337779125, + "grad_norm": 7.967435836791992, + "learning_rate": 9.380116959064327e-06, + "loss": 1.0974, + "num_tokens": 1207574.0, + "step": 482 + }, + { + "epoch": 0.02440133373749621, + "grad_norm": 9.471907615661621, + "learning_rate": 9.35672514619883e-06, + "loss": 1.158, + "num_tokens": 1209341.0, + "step": 483 + }, + { + "epoch": 0.024451854097201173, + "grad_norm": 9.525802612304688, + "learning_rate": 9.333333333333334e-06, + "loss": 1.4753, + "num_tokens": 1212207.0, + "step": 484 + }, + { + "epoch": 0.024502374456906135, + "grad_norm": 6.678181171417236, + "learning_rate": 9.309941520467837e-06, + "loss": 0.8692, + "num_tokens": 1215050.0, + "step": 485 + }, + { + "epoch": 0.024552894816611093, + "grad_norm": 6.672004222869873, + "learning_rate": 9.28654970760234e-06, + "loss": 0.9202, + "num_tokens": 1218270.0, + "step": 486 + }, + { + "epoch": 0.024603415176316056, + "grad_norm": 7.645212173461914, + "learning_rate": 9.263157894736842e-06, + "loss": 1.1638, + "num_tokens": 1220972.0, + "step": 487 + }, + { + "epoch": 0.024653935536021018, + "grad_norm": 7.0845561027526855, + "learning_rate": 9.239766081871345e-06, + "loss": 1.1746, + "num_tokens": 1224246.0, + "step": 488 + }, + { + "epoch": 0.024704455895725976, + "grad_norm": 8.58920955657959, + "learning_rate": 9.216374269005849e-06, + "loss": 0.9336, + "num_tokens": 1226879.0, + "step": 489 + }, + { + "epoch": 0.02475497625543094, + "grad_norm": 7.797658443450928, + "learning_rate": 9.192982456140351e-06, + "loss": 0.9908, + "num_tokens": 1229301.0, + "step": 490 + }, + { + "epoch": 0.0248054966151359, + "grad_norm": 10.265057563781738, + "learning_rate": 9.169590643274856e-06, + "loss": 0.9196, + "num_tokens": 1231335.0, + "step": 491 + }, + { + "epoch": 0.024856016974840862, + "grad_norm": 7.84185266494751, + "learning_rate": 9.146198830409357e-06, + "loss": 1.024, + "num_tokens": 1233896.0, + "step": 492 + }, + { + "epoch": 0.02490653733454582, + "grad_norm": 6.79918909072876, + "learning_rate": 9.12280701754386e-06, + "loss": 0.9778, + "num_tokens": 1236615.0, + "step": 493 + }, + { + "epoch": 0.024957057694250783, + "grad_norm": 10.588382720947266, + "learning_rate": 9.099415204678363e-06, + "loss": 1.0641, + "num_tokens": 1238841.0, + "step": 494 + }, + { + "epoch": 0.025007578053955745, + "grad_norm": 9.146163940429688, + "learning_rate": 9.076023391812866e-06, + "loss": 1.3176, + "num_tokens": 1241307.0, + "step": 495 + }, + { + "epoch": 0.025058098413660704, + "grad_norm": 7.881531238555908, + "learning_rate": 9.05263157894737e-06, + "loss": 1.1623, + "num_tokens": 1244175.0, + "step": 496 + }, + { + "epoch": 0.025108618773365666, + "grad_norm": 7.058382034301758, + "learning_rate": 9.029239766081873e-06, + "loss": 1.1692, + "num_tokens": 1247120.0, + "step": 497 + }, + { + "epoch": 0.025159139133070628, + "grad_norm": 10.12267017364502, + "learning_rate": 9.005847953216374e-06, + "loss": 0.9301, + "num_tokens": 1248915.0, + "step": 498 + }, + { + "epoch": 0.02520965949277559, + "grad_norm": 9.34394645690918, + "learning_rate": 8.982456140350878e-06, + "loss": 1.0336, + "num_tokens": 1250994.0, + "step": 499 + }, + { + "epoch": 0.02526017985248055, + "grad_norm": 6.16126823425293, + "learning_rate": 8.95906432748538e-06, + "loss": 0.9581, + "num_tokens": 1254359.0, + "step": 500 + }, + { + "epoch": 0.02531070021218551, + "grad_norm": 8.266057968139648, + "learning_rate": 8.935672514619885e-06, + "loss": 1.3318, + "num_tokens": 1256968.0, + "step": 501 + }, + { + "epoch": 0.025361220571890473, + "grad_norm": 7.282562732696533, + "learning_rate": 8.912280701754387e-06, + "loss": 1.0629, + "num_tokens": 1259808.0, + "step": 502 + }, + { + "epoch": 0.02541174093159543, + "grad_norm": 9.284351348876953, + "learning_rate": 8.888888888888888e-06, + "loss": 0.9811, + "num_tokens": 1261681.0, + "step": 503 + }, + { + "epoch": 0.025462261291300393, + "grad_norm": 8.650157928466797, + "learning_rate": 8.865497076023393e-06, + "loss": 1.1638, + "num_tokens": 1264192.0, + "step": 504 + }, + { + "epoch": 0.025512781651005356, + "grad_norm": 7.917022705078125, + "learning_rate": 8.842105263157895e-06, + "loss": 1.2178, + "num_tokens": 1266837.0, + "step": 505 + }, + { + "epoch": 0.025563302010710318, + "grad_norm": 7.831037521362305, + "learning_rate": 8.8187134502924e-06, + "loss": 0.9943, + "num_tokens": 1270040.0, + "step": 506 + }, + { + "epoch": 0.025613822370415276, + "grad_norm": 8.059146881103516, + "learning_rate": 8.795321637426902e-06, + "loss": 0.8595, + "num_tokens": 1272059.0, + "step": 507 + }, + { + "epoch": 0.02566434273012024, + "grad_norm": 8.514704704284668, + "learning_rate": 8.771929824561405e-06, + "loss": 1.1956, + "num_tokens": 1274476.0, + "step": 508 + }, + { + "epoch": 0.0257148630898252, + "grad_norm": 8.267817497253418, + "learning_rate": 8.748538011695907e-06, + "loss": 1.1855, + "num_tokens": 1277498.0, + "step": 509 + }, + { + "epoch": 0.02576538344953016, + "grad_norm": 7.298532009124756, + "learning_rate": 8.72514619883041e-06, + "loss": 0.9891, + "num_tokens": 1279783.0, + "step": 510 + }, + { + "epoch": 0.02581590380923512, + "grad_norm": 6.8726654052734375, + "learning_rate": 8.701754385964914e-06, + "loss": 1.0062, + "num_tokens": 1282392.0, + "step": 511 + }, + { + "epoch": 0.025866424168940083, + "grad_norm": 7.091618061065674, + "learning_rate": 8.678362573099417e-06, + "loss": 0.9148, + "num_tokens": 1285223.0, + "step": 512 + }, + { + "epoch": 0.025916944528645045, + "grad_norm": 7.5556206703186035, + "learning_rate": 8.654970760233919e-06, + "loss": 1.2347, + "num_tokens": 1288440.0, + "step": 513 + }, + { + "epoch": 0.025967464888350004, + "grad_norm": 7.303925514221191, + "learning_rate": 8.631578947368422e-06, + "loss": 1.1366, + "num_tokens": 1291265.0, + "step": 514 + }, + { + "epoch": 0.026017985248054966, + "grad_norm": 8.106623649597168, + "learning_rate": 8.608187134502924e-06, + "loss": 1.2598, + "num_tokens": 1293640.0, + "step": 515 + }, + { + "epoch": 0.026068505607759928, + "grad_norm": 7.573641777038574, + "learning_rate": 8.584795321637429e-06, + "loss": 1.1553, + "num_tokens": 1296642.0, + "step": 516 + }, + { + "epoch": 0.02611902596746489, + "grad_norm": 7.0134077072143555, + "learning_rate": 8.561403508771931e-06, + "loss": 1.06, + "num_tokens": 1299726.0, + "step": 517 + }, + { + "epoch": 0.02616954632716985, + "grad_norm": 8.203680038452148, + "learning_rate": 8.538011695906434e-06, + "loss": 1.24, + "num_tokens": 1302323.0, + "step": 518 + }, + { + "epoch": 0.02622006668687481, + "grad_norm": 7.4958977699279785, + "learning_rate": 8.514619883040936e-06, + "loss": 0.9127, + "num_tokens": 1305027.0, + "step": 519 + }, + { + "epoch": 0.026270587046579773, + "grad_norm": 8.409562110900879, + "learning_rate": 8.491228070175439e-06, + "loss": 1.063, + "num_tokens": 1307170.0, + "step": 520 + }, + { + "epoch": 0.02632110740628473, + "grad_norm": 7.781862258911133, + "learning_rate": 8.467836257309943e-06, + "loss": 1.0937, + "num_tokens": 1309677.0, + "step": 521 + }, + { + "epoch": 0.026371627765989693, + "grad_norm": 6.958422660827637, + "learning_rate": 8.444444444444446e-06, + "loss": 1.2275, + "num_tokens": 1312808.0, + "step": 522 + }, + { + "epoch": 0.026422148125694656, + "grad_norm": 7.915128231048584, + "learning_rate": 8.421052631578948e-06, + "loss": 0.9942, + "num_tokens": 1315614.0, + "step": 523 + }, + { + "epoch": 0.026472668485399618, + "grad_norm": 9.52972412109375, + "learning_rate": 8.39766081871345e-06, + "loss": 1.2968, + "num_tokens": 1317934.0, + "step": 524 + }, + { + "epoch": 0.026523188845104576, + "grad_norm": 7.880041599273682, + "learning_rate": 8.374269005847953e-06, + "loss": 0.9073, + "num_tokens": 1320108.0, + "step": 525 + }, + { + "epoch": 0.02657370920480954, + "grad_norm": 7.260169982910156, + "learning_rate": 8.350877192982458e-06, + "loss": 0.9521, + "num_tokens": 1322721.0, + "step": 526 + }, + { + "epoch": 0.0266242295645145, + "grad_norm": 7.779957294464111, + "learning_rate": 8.32748538011696e-06, + "loss": 1.0417, + "num_tokens": 1325348.0, + "step": 527 + }, + { + "epoch": 0.02667474992421946, + "grad_norm": 6.894069671630859, + "learning_rate": 8.304093567251463e-06, + "loss": 1.0385, + "num_tokens": 1328086.0, + "step": 528 + }, + { + "epoch": 0.02672527028392442, + "grad_norm": 7.881601810455322, + "learning_rate": 8.280701754385965e-06, + "loss": 1.0507, + "num_tokens": 1330195.0, + "step": 529 + }, + { + "epoch": 0.026775790643629383, + "grad_norm": 8.462027549743652, + "learning_rate": 8.257309941520468e-06, + "loss": 1.1905, + "num_tokens": 1332861.0, + "step": 530 + }, + { + "epoch": 0.026826311003334345, + "grad_norm": 8.52175235748291, + "learning_rate": 8.233918128654972e-06, + "loss": 1.0854, + "num_tokens": 1334848.0, + "step": 531 + }, + { + "epoch": 0.026876831363039304, + "grad_norm": 9.371026992797852, + "learning_rate": 8.210526315789475e-06, + "loss": 1.2329, + "num_tokens": 1336751.0, + "step": 532 + }, + { + "epoch": 0.026927351722744266, + "grad_norm": 8.265593528747559, + "learning_rate": 8.187134502923977e-06, + "loss": 1.0362, + "num_tokens": 1338862.0, + "step": 533 + }, + { + "epoch": 0.026977872082449228, + "grad_norm": 8.216224670410156, + "learning_rate": 8.16374269005848e-06, + "loss": 0.8583, + "num_tokens": 1341104.0, + "step": 534 + }, + { + "epoch": 0.027028392442154187, + "grad_norm": 7.810232162475586, + "learning_rate": 8.140350877192983e-06, + "loss": 1.3831, + "num_tokens": 1344054.0, + "step": 535 + }, + { + "epoch": 0.02707891280185915, + "grad_norm": 9.114062309265137, + "learning_rate": 8.116959064327487e-06, + "loss": 1.0858, + "num_tokens": 1346778.0, + "step": 536 + }, + { + "epoch": 0.02712943316156411, + "grad_norm": 10.635411262512207, + "learning_rate": 8.09356725146199e-06, + "loss": 1.5285, + "num_tokens": 1348913.0, + "step": 537 + }, + { + "epoch": 0.027179953521269073, + "grad_norm": 7.35087251663208, + "learning_rate": 8.070175438596492e-06, + "loss": 1.0425, + "num_tokens": 1351533.0, + "step": 538 + }, + { + "epoch": 0.02723047388097403, + "grad_norm": 6.784363269805908, + "learning_rate": 8.046783625730994e-06, + "loss": 0.9131, + "num_tokens": 1354598.0, + "step": 539 + }, + { + "epoch": 0.027280994240678993, + "grad_norm": 8.264861106872559, + "learning_rate": 8.023391812865497e-06, + "loss": 1.145, + "num_tokens": 1356928.0, + "step": 540 + }, + { + "epoch": 0.027331514600383956, + "grad_norm": 7.399439334869385, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3013, + "num_tokens": 1360029.0, + "step": 541 + }, + { + "epoch": 0.027382034960088914, + "grad_norm": 8.00053882598877, + "learning_rate": 7.976608187134504e-06, + "loss": 1.0174, + "num_tokens": 1362423.0, + "step": 542 + }, + { + "epoch": 0.027432555319793876, + "grad_norm": 7.717677593231201, + "learning_rate": 7.953216374269006e-06, + "loss": 1.0194, + "num_tokens": 1364819.0, + "step": 543 + }, + { + "epoch": 0.02748307567949884, + "grad_norm": 8.83278751373291, + "learning_rate": 7.929824561403509e-06, + "loss": 1.2334, + "num_tokens": 1367565.0, + "step": 544 + }, + { + "epoch": 0.0275335960392038, + "grad_norm": 9.800333976745605, + "learning_rate": 7.906432748538012e-06, + "loss": 0.9401, + "num_tokens": 1369303.0, + "step": 545 + }, + { + "epoch": 0.02758411639890876, + "grad_norm": 10.568621635437012, + "learning_rate": 7.883040935672516e-06, + "loss": 0.8519, + "num_tokens": 1370761.0, + "step": 546 + }, + { + "epoch": 0.02763463675861372, + "grad_norm": 9.293244361877441, + "learning_rate": 7.859649122807018e-06, + "loss": 0.9919, + "num_tokens": 1372992.0, + "step": 547 + }, + { + "epoch": 0.027685157118318683, + "grad_norm": 7.680420398712158, + "learning_rate": 7.836257309941521e-06, + "loss": 1.1079, + "num_tokens": 1375622.0, + "step": 548 + }, + { + "epoch": 0.027735677478023645, + "grad_norm": 7.9923858642578125, + "learning_rate": 7.812865497076024e-06, + "loss": 1.2297, + "num_tokens": 1378659.0, + "step": 549 + }, + { + "epoch": 0.027786197837728604, + "grad_norm": 8.857748985290527, + "learning_rate": 7.789473684210526e-06, + "loss": 1.039, + "num_tokens": 1380596.0, + "step": 550 + }, + { + "epoch": 0.027836718197433566, + "grad_norm": 6.95242166519165, + "learning_rate": 7.76608187134503e-06, + "loss": 1.0728, + "num_tokens": 1383696.0, + "step": 551 + }, + { + "epoch": 0.027887238557138528, + "grad_norm": 8.34567928314209, + "learning_rate": 7.742690058479533e-06, + "loss": 1.2795, + "num_tokens": 1386068.0, + "step": 552 + }, + { + "epoch": 0.027937758916843487, + "grad_norm": 8.763282775878906, + "learning_rate": 7.719298245614036e-06, + "loss": 1.3538, + "num_tokens": 1388635.0, + "step": 553 + }, + { + "epoch": 0.02798827927654845, + "grad_norm": 8.023956298828125, + "learning_rate": 7.695906432748538e-06, + "loss": 0.978, + "num_tokens": 1390793.0, + "step": 554 + }, + { + "epoch": 0.02803879963625341, + "grad_norm": 8.52585220336914, + "learning_rate": 7.67251461988304e-06, + "loss": 1.4234, + "num_tokens": 1393492.0, + "step": 555 + }, + { + "epoch": 0.028089319995958373, + "grad_norm": 8.335930824279785, + "learning_rate": 7.649122807017545e-06, + "loss": 1.3499, + "num_tokens": 1396227.0, + "step": 556 + }, + { + "epoch": 0.02813984035566333, + "grad_norm": 7.912451267242432, + "learning_rate": 7.625730994152048e-06, + "loss": 1.1538, + "num_tokens": 1398701.0, + "step": 557 + }, + { + "epoch": 0.028190360715368293, + "grad_norm": 8.871837615966797, + "learning_rate": 7.60233918128655e-06, + "loss": 0.9169, + "num_tokens": 1400534.0, + "step": 558 + }, + { + "epoch": 0.028240881075073256, + "grad_norm": 7.395883083343506, + "learning_rate": 7.578947368421054e-06, + "loss": 1.0306, + "num_tokens": 1403013.0, + "step": 559 + }, + { + "epoch": 0.028291401434778214, + "grad_norm": 7.920651912689209, + "learning_rate": 7.555555555555556e-06, + "loss": 1.2203, + "num_tokens": 1405513.0, + "step": 560 + }, + { + "epoch": 0.028341921794483176, + "grad_norm": 7.742629528045654, + "learning_rate": 7.5321637426900596e-06, + "loss": 1.0388, + "num_tokens": 1408008.0, + "step": 561 + }, + { + "epoch": 0.02839244215418814, + "grad_norm": 6.321586608886719, + "learning_rate": 7.508771929824562e-06, + "loss": 1.1409, + "num_tokens": 1411678.0, + "step": 562 + }, + { + "epoch": 0.0284429625138931, + "grad_norm": 8.639960289001465, + "learning_rate": 7.485380116959065e-06, + "loss": 1.1977, + "num_tokens": 1414073.0, + "step": 563 + }, + { + "epoch": 0.02849348287359806, + "grad_norm": 9.910258293151855, + "learning_rate": 7.461988304093568e-06, + "loss": 1.3535, + "num_tokens": 1416054.0, + "step": 564 + }, + { + "epoch": 0.02854400323330302, + "grad_norm": 10.015944480895996, + "learning_rate": 7.438596491228071e-06, + "loss": 1.1978, + "num_tokens": 1417775.0, + "step": 565 + }, + { + "epoch": 0.028594523593007983, + "grad_norm": 8.60866928100586, + "learning_rate": 7.415204678362574e-06, + "loss": 1.3574, + "num_tokens": 1420162.0, + "step": 566 + }, + { + "epoch": 0.028645043952712942, + "grad_norm": 8.378902435302734, + "learning_rate": 7.391812865497077e-06, + "loss": 1.0039, + "num_tokens": 1422261.0, + "step": 567 + }, + { + "epoch": 0.028695564312417904, + "grad_norm": 7.113250732421875, + "learning_rate": 7.368421052631579e-06, + "loss": 0.9878, + "num_tokens": 1425094.0, + "step": 568 + }, + { + "epoch": 0.028746084672122866, + "grad_norm": 7.27842903137207, + "learning_rate": 7.345029239766083e-06, + "loss": 1.1017, + "num_tokens": 1427716.0, + "step": 569 + }, + { + "epoch": 0.028796605031827828, + "grad_norm": 9.736827850341797, + "learning_rate": 7.321637426900585e-06, + "loss": 1.15, + "num_tokens": 1429904.0, + "step": 570 + }, + { + "epoch": 0.028847125391532787, + "grad_norm": 7.852704048156738, + "learning_rate": 7.298245614035089e-06, + "loss": 0.9064, + "num_tokens": 1432003.0, + "step": 571 + }, + { + "epoch": 0.02889764575123775, + "grad_norm": 8.28087043762207, + "learning_rate": 7.274853801169591e-06, + "loss": 1.2396, + "num_tokens": 1434556.0, + "step": 572 + }, + { + "epoch": 0.02894816611094271, + "grad_norm": 10.027870178222656, + "learning_rate": 7.251461988304094e-06, + "loss": 1.5227, + "num_tokens": 1436788.0, + "step": 573 + }, + { + "epoch": 0.02899868647064767, + "grad_norm": 9.149057388305664, + "learning_rate": 7.228070175438597e-06, + "loss": 0.8294, + "num_tokens": 1438414.0, + "step": 574 + }, + { + "epoch": 0.02904920683035263, + "grad_norm": 7.821249961853027, + "learning_rate": 7.2046783625731e-06, + "loss": 1.1003, + "num_tokens": 1441056.0, + "step": 575 + }, + { + "epoch": 0.029099727190057594, + "grad_norm": 7.0643086433410645, + "learning_rate": 7.181286549707603e-06, + "loss": 0.9263, + "num_tokens": 1443502.0, + "step": 576 + }, + { + "epoch": 0.029150247549762556, + "grad_norm": 8.15132999420166, + "learning_rate": 7.157894736842106e-06, + "loss": 1.0762, + "num_tokens": 1445719.0, + "step": 577 + }, + { + "epoch": 0.029200767909467514, + "grad_norm": 8.837310791015625, + "learning_rate": 7.134502923976608e-06, + "loss": 1.3033, + "num_tokens": 1448040.0, + "step": 578 + }, + { + "epoch": 0.029251288269172476, + "grad_norm": 8.682538032531738, + "learning_rate": 7.111111111111112e-06, + "loss": 1.3062, + "num_tokens": 1450216.0, + "step": 579 + }, + { + "epoch": 0.02930180862887744, + "grad_norm": 7.323039531707764, + "learning_rate": 7.087719298245614e-06, + "loss": 1.3407, + "num_tokens": 1453323.0, + "step": 580 + }, + { + "epoch": 0.0293523289885824, + "grad_norm": 7.50269889831543, + "learning_rate": 7.064327485380118e-06, + "loss": 1.0059, + "num_tokens": 1456116.0, + "step": 581 + }, + { + "epoch": 0.02940284934828736, + "grad_norm": 7.518065452575684, + "learning_rate": 7.04093567251462e-06, + "loss": 1.0704, + "num_tokens": 1458629.0, + "step": 582 + }, + { + "epoch": 0.02945336970799232, + "grad_norm": 6.795121669769287, + "learning_rate": 7.017543859649123e-06, + "loss": 0.9808, + "num_tokens": 1461263.0, + "step": 583 + }, + { + "epoch": 0.029503890067697283, + "grad_norm": 8.670248985290527, + "learning_rate": 6.994152046783626e-06, + "loss": 1.1003, + "num_tokens": 1463645.0, + "step": 584 + }, + { + "epoch": 0.029554410427402242, + "grad_norm": 7.029455184936523, + "learning_rate": 6.970760233918129e-06, + "loss": 0.9082, + "num_tokens": 1466112.0, + "step": 585 + }, + { + "epoch": 0.029604930787107204, + "grad_norm": 7.781805038452148, + "learning_rate": 6.947368421052632e-06, + "loss": 0.9572, + "num_tokens": 1468282.0, + "step": 586 + }, + { + "epoch": 0.029655451146812166, + "grad_norm": 9.165765762329102, + "learning_rate": 6.923976608187135e-06, + "loss": 1.2318, + "num_tokens": 1470055.0, + "step": 587 + }, + { + "epoch": 0.029705971506517128, + "grad_norm": 7.369879722595215, + "learning_rate": 6.9005847953216375e-06, + "loss": 1.0069, + "num_tokens": 1472520.0, + "step": 588 + }, + { + "epoch": 0.029756491866222087, + "grad_norm": 7.5016937255859375, + "learning_rate": 6.877192982456141e-06, + "loss": 0.9762, + "num_tokens": 1475016.0, + "step": 589 + }, + { + "epoch": 0.02980701222592705, + "grad_norm": 6.893375873565674, + "learning_rate": 6.8538011695906435e-06, + "loss": 0.9666, + "num_tokens": 1477679.0, + "step": 590 + }, + { + "epoch": 0.02985753258563201, + "grad_norm": 7.325122833251953, + "learning_rate": 6.830409356725147e-06, + "loss": 1.0898, + "num_tokens": 1480528.0, + "step": 591 + }, + { + "epoch": 0.02990805294533697, + "grad_norm": 10.349858283996582, + "learning_rate": 6.8070175438596495e-06, + "loss": 1.3431, + "num_tokens": 1482604.0, + "step": 592 + }, + { + "epoch": 0.02995857330504193, + "grad_norm": 7.8944196701049805, + "learning_rate": 6.783625730994152e-06, + "loss": 1.1076, + "num_tokens": 1485118.0, + "step": 593 + }, + { + "epoch": 0.030009093664746894, + "grad_norm": 8.23731517791748, + "learning_rate": 6.7602339181286555e-06, + "loss": 0.9904, + "num_tokens": 1487196.0, + "step": 594 + }, + { + "epoch": 0.030059614024451856, + "grad_norm": 9.08659839630127, + "learning_rate": 6.736842105263158e-06, + "loss": 1.015, + "num_tokens": 1488924.0, + "step": 595 + }, + { + "epoch": 0.030110134384156814, + "grad_norm": 6.095712661743164, + "learning_rate": 6.7134502923976615e-06, + "loss": 0.9861, + "num_tokens": 1492340.0, + "step": 596 + }, + { + "epoch": 0.030160654743861776, + "grad_norm": 8.20645523071289, + "learning_rate": 6.690058479532164e-06, + "loss": 1.0299, + "num_tokens": 1494784.0, + "step": 597 + }, + { + "epoch": 0.03021117510356674, + "grad_norm": 7.462918758392334, + "learning_rate": 6.666666666666667e-06, + "loss": 1.0741, + "num_tokens": 1497300.0, + "step": 598 + }, + { + "epoch": 0.030261695463271697, + "grad_norm": 7.794787883758545, + "learning_rate": 6.64327485380117e-06, + "loss": 1.0206, + "num_tokens": 1499627.0, + "step": 599 + }, + { + "epoch": 0.03031221582297666, + "grad_norm": 8.762218475341797, + "learning_rate": 6.619883040935673e-06, + "loss": 0.9476, + "num_tokens": 1501426.0, + "step": 600 + }, + { + "epoch": 0.03036273618268162, + "grad_norm": 7.999383449554443, + "learning_rate": 6.596491228070177e-06, + "loss": 1.0201, + "num_tokens": 1503840.0, + "step": 601 + }, + { + "epoch": 0.030413256542386583, + "grad_norm": 9.501809120178223, + "learning_rate": 6.573099415204679e-06, + "loss": 1.1281, + "num_tokens": 1505919.0, + "step": 602 + }, + { + "epoch": 0.030463776902091542, + "grad_norm": 7.047045707702637, + "learning_rate": 6.549707602339181e-06, + "loss": 0.9979, + "num_tokens": 1508757.0, + "step": 603 + }, + { + "epoch": 0.030514297261796504, + "grad_norm": 8.55207347869873, + "learning_rate": 6.526315789473685e-06, + "loss": 1.3233, + "num_tokens": 1511068.0, + "step": 604 + }, + { + "epoch": 0.030564817621501466, + "grad_norm": 8.288250923156738, + "learning_rate": 6.502923976608187e-06, + "loss": 1.2302, + "num_tokens": 1513825.0, + "step": 605 + }, + { + "epoch": 0.030615337981206425, + "grad_norm": 7.409893989562988, + "learning_rate": 6.4795321637426915e-06, + "loss": 0.8886, + "num_tokens": 1516208.0, + "step": 606 + }, + { + "epoch": 0.030665858340911387, + "grad_norm": 8.533541679382324, + "learning_rate": 6.456140350877193e-06, + "loss": 1.1728, + "num_tokens": 1518670.0, + "step": 607 + }, + { + "epoch": 0.03071637870061635, + "grad_norm": 7.29590368270874, + "learning_rate": 6.432748538011696e-06, + "loss": 0.974, + "num_tokens": 1521559.0, + "step": 608 + }, + { + "epoch": 0.03076689906032131, + "grad_norm": 7.554265022277832, + "learning_rate": 6.4093567251462e-06, + "loss": 1.0386, + "num_tokens": 1523994.0, + "step": 609 + }, + { + "epoch": 0.03081741942002627, + "grad_norm": 8.648849487304688, + "learning_rate": 6.385964912280702e-06, + "loss": 1.0692, + "num_tokens": 1525950.0, + "step": 610 + }, + { + "epoch": 0.03086793977973123, + "grad_norm": 7.120666980743408, + "learning_rate": 6.362573099415206e-06, + "loss": 0.8511, + "num_tokens": 1528763.0, + "step": 611 + }, + { + "epoch": 0.030918460139436194, + "grad_norm": 9.236486434936523, + "learning_rate": 6.339181286549709e-06, + "loss": 0.966, + "num_tokens": 1530784.0, + "step": 612 + }, + { + "epoch": 0.030968980499141156, + "grad_norm": 7.317360877990723, + "learning_rate": 6.31578947368421e-06, + "loss": 1.1884, + "num_tokens": 1533509.0, + "step": 613 + }, + { + "epoch": 0.031019500858846114, + "grad_norm": 7.476950645446777, + "learning_rate": 6.292397660818715e-06, + "loss": 1.2452, + "num_tokens": 1536265.0, + "step": 614 + }, + { + "epoch": 0.031070021218551076, + "grad_norm": 8.157122611999512, + "learning_rate": 6.269005847953217e-06, + "loss": 0.9855, + "num_tokens": 1538873.0, + "step": 615 + }, + { + "epoch": 0.03112054157825604, + "grad_norm": 6.286567211151123, + "learning_rate": 6.245614035087721e-06, + "loss": 0.9616, + "num_tokens": 1542118.0, + "step": 616 + }, + { + "epoch": 0.031171061937960997, + "grad_norm": 9.18763256072998, + "learning_rate": 6.222222222222223e-06, + "loss": 1.2499, + "num_tokens": 1544087.0, + "step": 617 + }, + { + "epoch": 0.03122158229766596, + "grad_norm": 8.84661865234375, + "learning_rate": 6.198830409356725e-06, + "loss": 1.2279, + "num_tokens": 1546264.0, + "step": 618 + }, + { + "epoch": 0.03127210265737092, + "grad_norm": 7.39112663269043, + "learning_rate": 6.175438596491229e-06, + "loss": 1.0612, + "num_tokens": 1549267.0, + "step": 619 + }, + { + "epoch": 0.03132262301707588, + "grad_norm": 8.905179023742676, + "learning_rate": 6.152046783625732e-06, + "loss": 0.9632, + "num_tokens": 1551243.0, + "step": 620 + }, + { + "epoch": 0.03137314337678084, + "grad_norm": 8.069854736328125, + "learning_rate": 6.128654970760235e-06, + "loss": 1.3285, + "num_tokens": 1553634.0, + "step": 621 + }, + { + "epoch": 0.0314236637364858, + "grad_norm": 8.615447044372559, + "learning_rate": 6.105263157894738e-06, + "loss": 1.3139, + "num_tokens": 1556195.0, + "step": 622 + }, + { + "epoch": 0.031474184096190766, + "grad_norm": 7.024984359741211, + "learning_rate": 6.08187134502924e-06, + "loss": 1.1004, + "num_tokens": 1559037.0, + "step": 623 + }, + { + "epoch": 0.031524704455895725, + "grad_norm": 8.354039192199707, + "learning_rate": 6.058479532163744e-06, + "loss": 1.1124, + "num_tokens": 1560978.0, + "step": 624 + }, + { + "epoch": 0.03157522481560069, + "grad_norm": 7.72731351852417, + "learning_rate": 6.035087719298246e-06, + "loss": 1.1461, + "num_tokens": 1564117.0, + "step": 625 + }, + { + "epoch": 0.03162574517530565, + "grad_norm": 10.652926445007324, + "learning_rate": 6.01169590643275e-06, + "loss": 1.2396, + "num_tokens": 1565861.0, + "step": 626 + }, + { + "epoch": 0.03167626553501061, + "grad_norm": 9.616717338562012, + "learning_rate": 5.988304093567252e-06, + "loss": 1.6321, + "num_tokens": 1568240.0, + "step": 627 + }, + { + "epoch": 0.03172678589471557, + "grad_norm": 7.272972106933594, + "learning_rate": 5.964912280701755e-06, + "loss": 1.1238, + "num_tokens": 1570763.0, + "step": 628 + }, + { + "epoch": 0.03177730625442053, + "grad_norm": 7.224928379058838, + "learning_rate": 5.941520467836258e-06, + "loss": 1.0055, + "num_tokens": 1573117.0, + "step": 629 + }, + { + "epoch": 0.03182782661412549, + "grad_norm": 7.230025768280029, + "learning_rate": 5.918128654970761e-06, + "loss": 1.0699, + "num_tokens": 1575810.0, + "step": 630 + }, + { + "epoch": 0.031878346973830456, + "grad_norm": 6.789632320404053, + "learning_rate": 5.8947368421052634e-06, + "loss": 1.0002, + "num_tokens": 1578501.0, + "step": 631 + }, + { + "epoch": 0.031928867333535414, + "grad_norm": 7.361072063446045, + "learning_rate": 5.871345029239767e-06, + "loss": 1.1577, + "num_tokens": 1581005.0, + "step": 632 + }, + { + "epoch": 0.03197938769324037, + "grad_norm": 8.942316055297852, + "learning_rate": 5.847953216374269e-06, + "loss": 1.0419, + "num_tokens": 1583448.0, + "step": 633 + }, + { + "epoch": 0.03202990805294534, + "grad_norm": 7.986004829406738, + "learning_rate": 5.824561403508773e-06, + "loss": 0.9211, + "num_tokens": 1585708.0, + "step": 634 + }, + { + "epoch": 0.0320804284126503, + "grad_norm": 6.813812255859375, + "learning_rate": 5.801169590643275e-06, + "loss": 1.1733, + "num_tokens": 1588760.0, + "step": 635 + }, + { + "epoch": 0.03213094877235526, + "grad_norm": 9.140789985656738, + "learning_rate": 5.777777777777778e-06, + "loss": 1.5445, + "num_tokens": 1591523.0, + "step": 636 + }, + { + "epoch": 0.03218146913206022, + "grad_norm": 7.130503177642822, + "learning_rate": 5.754385964912281e-06, + "loss": 1.258, + "num_tokens": 1594223.0, + "step": 637 + }, + { + "epoch": 0.03223198949176518, + "grad_norm": 7.160690784454346, + "learning_rate": 5.730994152046784e-06, + "loss": 0.8701, + "num_tokens": 1596616.0, + "step": 638 + }, + { + "epoch": 0.032282509851470145, + "grad_norm": 6.597822666168213, + "learning_rate": 5.707602339181287e-06, + "loss": 1.0348, + "num_tokens": 1599915.0, + "step": 639 + }, + { + "epoch": 0.032333030211175104, + "grad_norm": 7.018404006958008, + "learning_rate": 5.68421052631579e-06, + "loss": 1.1075, + "num_tokens": 1602833.0, + "step": 640 + }, + { + "epoch": 0.03238355057088006, + "grad_norm": 7.119444847106934, + "learning_rate": 5.6608187134502925e-06, + "loss": 1.073, + "num_tokens": 1605685.0, + "step": 641 + }, + { + "epoch": 0.03243407093058503, + "grad_norm": 8.49666976928711, + "learning_rate": 5.637426900584796e-06, + "loss": 0.9828, + "num_tokens": 1607920.0, + "step": 642 + }, + { + "epoch": 0.03248459129028999, + "grad_norm": 6.9808454513549805, + "learning_rate": 5.6140350877192985e-06, + "loss": 0.9553, + "num_tokens": 1610580.0, + "step": 643 + }, + { + "epoch": 0.032535111649994945, + "grad_norm": 7.724054336547852, + "learning_rate": 5.590643274853802e-06, + "loss": 1.194, + "num_tokens": 1613000.0, + "step": 644 + }, + { + "epoch": 0.03258563200969991, + "grad_norm": 7.294096946716309, + "learning_rate": 5.5672514619883045e-06, + "loss": 1.2243, + "num_tokens": 1616124.0, + "step": 645 + }, + { + "epoch": 0.03263615236940487, + "grad_norm": 9.598305702209473, + "learning_rate": 5.543859649122807e-06, + "loss": 1.3787, + "num_tokens": 1618928.0, + "step": 646 + }, + { + "epoch": 0.03268667272910983, + "grad_norm": 8.297586441040039, + "learning_rate": 5.5204678362573105e-06, + "loss": 1.1144, + "num_tokens": 1621379.0, + "step": 647 + }, + { + "epoch": 0.032737193088814794, + "grad_norm": 6.843406677246094, + "learning_rate": 5.497076023391813e-06, + "loss": 0.9793, + "num_tokens": 1624191.0, + "step": 648 + }, + { + "epoch": 0.03278771344851975, + "grad_norm": 6.79688024520874, + "learning_rate": 5.4736842105263165e-06, + "loss": 1.1188, + "num_tokens": 1627147.0, + "step": 649 + }, + { + "epoch": 0.03283823380822472, + "grad_norm": 7.678542137145996, + "learning_rate": 5.450292397660819e-06, + "loss": 0.8958, + "num_tokens": 1629688.0, + "step": 650 + }, + { + "epoch": 0.032888754167929676, + "grad_norm": 7.972912788391113, + "learning_rate": 5.426900584795322e-06, + "loss": 0.9635, + "num_tokens": 1631942.0, + "step": 651 + }, + { + "epoch": 0.032939274527634635, + "grad_norm": 6.9689154624938965, + "learning_rate": 5.403508771929825e-06, + "loss": 0.8471, + "num_tokens": 1634495.0, + "step": 652 + }, + { + "epoch": 0.0329897948873396, + "grad_norm": 7.233901500701904, + "learning_rate": 5.380116959064328e-06, + "loss": 1.1242, + "num_tokens": 1637409.0, + "step": 653 + }, + { + "epoch": 0.03304031524704456, + "grad_norm": 7.155329704284668, + "learning_rate": 5.356725146198831e-06, + "loss": 0.9016, + "num_tokens": 1639493.0, + "step": 654 + }, + { + "epoch": 0.03309083560674952, + "grad_norm": 9.539928436279297, + "learning_rate": 5.333333333333334e-06, + "loss": 0.9581, + "num_tokens": 1641213.0, + "step": 655 + }, + { + "epoch": 0.03314135596645448, + "grad_norm": 8.467994689941406, + "learning_rate": 5.309941520467836e-06, + "loss": 0.9871, + "num_tokens": 1643115.0, + "step": 656 + }, + { + "epoch": 0.03319187632615944, + "grad_norm": 7.5885419845581055, + "learning_rate": 5.28654970760234e-06, + "loss": 0.7525, + "num_tokens": 1645388.0, + "step": 657 + }, + { + "epoch": 0.0332423966858644, + "grad_norm": 8.647848129272461, + "learning_rate": 5.263157894736842e-06, + "loss": 1.1596, + "num_tokens": 1647801.0, + "step": 658 + }, + { + "epoch": 0.033292917045569366, + "grad_norm": 7.351362705230713, + "learning_rate": 5.239766081871346e-06, + "loss": 1.1743, + "num_tokens": 1650788.0, + "step": 659 + }, + { + "epoch": 0.033343437405274325, + "grad_norm": 7.265820503234863, + "learning_rate": 5.216374269005848e-06, + "loss": 1.2265, + "num_tokens": 1654214.0, + "step": 660 + }, + { + "epoch": 0.03339395776497928, + "grad_norm": 8.275772094726562, + "learning_rate": 5.192982456140351e-06, + "loss": 1.0414, + "num_tokens": 1656773.0, + "step": 661 + }, + { + "epoch": 0.03344447812468425, + "grad_norm": 10.60268497467041, + "learning_rate": 5.169590643274854e-06, + "loss": 1.0783, + "num_tokens": 1658250.0, + "step": 662 + }, + { + "epoch": 0.03349499848438921, + "grad_norm": 9.565180778503418, + "learning_rate": 5.146198830409357e-06, + "loss": 1.2738, + "num_tokens": 1660952.0, + "step": 663 + }, + { + "epoch": 0.03354551884409417, + "grad_norm": 7.559276580810547, + "learning_rate": 5.12280701754386e-06, + "loss": 1.1929, + "num_tokens": 1663702.0, + "step": 664 + }, + { + "epoch": 0.03359603920379913, + "grad_norm": 7.664490699768066, + "learning_rate": 5.099415204678363e-06, + "loss": 1.1947, + "num_tokens": 1666447.0, + "step": 665 + }, + { + "epoch": 0.03364655956350409, + "grad_norm": 8.05148696899414, + "learning_rate": 5.076023391812865e-06, + "loss": 1.2373, + "num_tokens": 1669052.0, + "step": 666 + }, + { + "epoch": 0.033697079923209056, + "grad_norm": 7.887591361999512, + "learning_rate": 5.052631578947369e-06, + "loss": 1.0344, + "num_tokens": 1672113.0, + "step": 667 + }, + { + "epoch": 0.033747600282914014, + "grad_norm": 6.831094264984131, + "learning_rate": 5.029239766081871e-06, + "loss": 0.9692, + "num_tokens": 1674906.0, + "step": 668 + }, + { + "epoch": 0.03379812064261897, + "grad_norm": 8.84568977355957, + "learning_rate": 5.005847953216375e-06, + "loss": 1.0344, + "num_tokens": 1677147.0, + "step": 669 + }, + { + "epoch": 0.03384864100232394, + "grad_norm": 7.756647109985352, + "learning_rate": 4.982456140350877e-06, + "loss": 1.1549, + "num_tokens": 1679671.0, + "step": 670 + }, + { + "epoch": 0.0338991613620289, + "grad_norm": 8.351460456848145, + "learning_rate": 4.959064327485381e-06, + "loss": 1.3362, + "num_tokens": 1681974.0, + "step": 671 + }, + { + "epoch": 0.033949681721733856, + "grad_norm": 7.18549919128418, + "learning_rate": 4.935672514619883e-06, + "loss": 1.0311, + "num_tokens": 1684635.0, + "step": 672 + }, + { + "epoch": 0.03400020208143882, + "grad_norm": 9.003915786743164, + "learning_rate": 4.912280701754386e-06, + "loss": 1.1502, + "num_tokens": 1686803.0, + "step": 673 + }, + { + "epoch": 0.03405072244114378, + "grad_norm": 7.852712631225586, + "learning_rate": 4.888888888888889e-06, + "loss": 1.0303, + "num_tokens": 1689081.0, + "step": 674 + }, + { + "epoch": 0.034101242800848745, + "grad_norm": 11.381545066833496, + "learning_rate": 4.865497076023392e-06, + "loss": 1.252, + "num_tokens": 1690767.0, + "step": 675 + }, + { + "epoch": 0.034151763160553704, + "grad_norm": 6.729422569274902, + "learning_rate": 4.842105263157895e-06, + "loss": 1.0162, + "num_tokens": 1693768.0, + "step": 676 + }, + { + "epoch": 0.03420228352025866, + "grad_norm": 10.078665733337402, + "learning_rate": 4.818713450292398e-06, + "loss": 0.9454, + "num_tokens": 1695307.0, + "step": 677 + }, + { + "epoch": 0.03425280387996363, + "grad_norm": 7.403512954711914, + "learning_rate": 4.7953216374269005e-06, + "loss": 1.0681, + "num_tokens": 1697957.0, + "step": 678 + }, + { + "epoch": 0.03430332423966859, + "grad_norm": 9.447266578674316, + "learning_rate": 4.771929824561404e-06, + "loss": 1.3238, + "num_tokens": 1700160.0, + "step": 679 + }, + { + "epoch": 0.034353844599373545, + "grad_norm": 7.928319931030273, + "learning_rate": 4.7485380116959065e-06, + "loss": 0.9619, + "num_tokens": 1702629.0, + "step": 680 + }, + { + "epoch": 0.03440436495907851, + "grad_norm": 7.2455058097839355, + "learning_rate": 4.72514619883041e-06, + "loss": 1.2186, + "num_tokens": 1705604.0, + "step": 681 + }, + { + "epoch": 0.03445488531878347, + "grad_norm": 7.508686542510986, + "learning_rate": 4.7017543859649125e-06, + "loss": 1.073, + "num_tokens": 1708136.0, + "step": 682 + }, + { + "epoch": 0.03450540567848843, + "grad_norm": 7.66443395614624, + "learning_rate": 4.678362573099415e-06, + "loss": 1.2102, + "num_tokens": 1711233.0, + "step": 683 + }, + { + "epoch": 0.034555926038193394, + "grad_norm": 8.47956371307373, + "learning_rate": 4.6549707602339184e-06, + "loss": 0.9389, + "num_tokens": 1713419.0, + "step": 684 + }, + { + "epoch": 0.03460644639789835, + "grad_norm": 7.613623142242432, + "learning_rate": 4.631578947368421e-06, + "loss": 0.9957, + "num_tokens": 1716204.0, + "step": 685 + }, + { + "epoch": 0.03465696675760331, + "grad_norm": 7.50279426574707, + "learning_rate": 4.6081871345029244e-06, + "loss": 0.971, + "num_tokens": 1718615.0, + "step": 686 + }, + { + "epoch": 0.034707487117308276, + "grad_norm": 7.75341796875, + "learning_rate": 4.584795321637428e-06, + "loss": 1.0518, + "num_tokens": 1721795.0, + "step": 687 + }, + { + "epoch": 0.034758007477013235, + "grad_norm": 6.846672534942627, + "learning_rate": 4.56140350877193e-06, + "loss": 0.9708, + "num_tokens": 1724706.0, + "step": 688 + }, + { + "epoch": 0.0348085278367182, + "grad_norm": 7.4471282958984375, + "learning_rate": 4.538011695906433e-06, + "loss": 1.0469, + "num_tokens": 1727549.0, + "step": 689 + }, + { + "epoch": 0.03485904819642316, + "grad_norm": 6.3521318435668945, + "learning_rate": 4.5146198830409364e-06, + "loss": 0.9187, + "num_tokens": 1730394.0, + "step": 690 + }, + { + "epoch": 0.03490956855612812, + "grad_norm": 9.35522174835205, + "learning_rate": 4.491228070175439e-06, + "loss": 1.3445, + "num_tokens": 1733007.0, + "step": 691 + }, + { + "epoch": 0.03496008891583308, + "grad_norm": 7.456794261932373, + "learning_rate": 4.467836257309942e-06, + "loss": 0.9988, + "num_tokens": 1735193.0, + "step": 692 + }, + { + "epoch": 0.03501060927553804, + "grad_norm": 8.50855541229248, + "learning_rate": 4.444444444444444e-06, + "loss": 1.2059, + "num_tokens": 1737545.0, + "step": 693 + }, + { + "epoch": 0.035061129635243, + "grad_norm": 7.270936489105225, + "learning_rate": 4.4210526315789476e-06, + "loss": 1.1404, + "num_tokens": 1740271.0, + "step": 694 + }, + { + "epoch": 0.035111649994947966, + "grad_norm": 7.3652448654174805, + "learning_rate": 4.397660818713451e-06, + "loss": 0.9482, + "num_tokens": 1742919.0, + "step": 695 + }, + { + "epoch": 0.035162170354652925, + "grad_norm": 7.948293685913086, + "learning_rate": 4.3742690058479536e-06, + "loss": 1.0666, + "num_tokens": 1745354.0, + "step": 696 + }, + { + "epoch": 0.03521269071435788, + "grad_norm": 6.10773229598999, + "learning_rate": 4.350877192982457e-06, + "loss": 0.9303, + "num_tokens": 1748584.0, + "step": 697 + }, + { + "epoch": 0.03526321107406285, + "grad_norm": 7.173102855682373, + "learning_rate": 4.3274853801169596e-06, + "loss": 0.995, + "num_tokens": 1751255.0, + "step": 698 + }, + { + "epoch": 0.03531373143376781, + "grad_norm": 7.086891174316406, + "learning_rate": 4.304093567251462e-06, + "loss": 1.2673, + "num_tokens": 1754311.0, + "step": 699 + }, + { + "epoch": 0.03536425179347277, + "grad_norm": 6.675343036651611, + "learning_rate": 4.2807017543859656e-06, + "loss": 1.0129, + "num_tokens": 1757241.0, + "step": 700 + }, + { + "epoch": 0.03541477215317773, + "grad_norm": 7.812734603881836, + "learning_rate": 4.257309941520468e-06, + "loss": 1.3095, + "num_tokens": 1759868.0, + "step": 701 + }, + { + "epoch": 0.03546529251288269, + "grad_norm": 8.058785438537598, + "learning_rate": 4.2339181286549715e-06, + "loss": 0.8997, + "num_tokens": 1762109.0, + "step": 702 + }, + { + "epoch": 0.035515812872587656, + "grad_norm": 9.06352424621582, + "learning_rate": 4.210526315789474e-06, + "loss": 1.2765, + "num_tokens": 1764737.0, + "step": 703 + }, + { + "epoch": 0.035566333232292614, + "grad_norm": 7.435845375061035, + "learning_rate": 4.187134502923977e-06, + "loss": 1.213, + "num_tokens": 1767721.0, + "step": 704 + }, + { + "epoch": 0.03561685359199757, + "grad_norm": 7.39074182510376, + "learning_rate": 4.16374269005848e-06, + "loss": 1.0324, + "num_tokens": 1770252.0, + "step": 705 + }, + { + "epoch": 0.03566737395170254, + "grad_norm": 8.23583698272705, + "learning_rate": 4.140350877192983e-06, + "loss": 1.1888, + "num_tokens": 1772559.0, + "step": 706 + }, + { + "epoch": 0.0357178943114075, + "grad_norm": 7.588513374328613, + "learning_rate": 4.116959064327486e-06, + "loss": 1.2651, + "num_tokens": 1775337.0, + "step": 707 + }, + { + "epoch": 0.035768414671112456, + "grad_norm": 7.138592720031738, + "learning_rate": 4.093567251461989e-06, + "loss": 0.914, + "num_tokens": 1777859.0, + "step": 708 + }, + { + "epoch": 0.03581893503081742, + "grad_norm": 7.7559404373168945, + "learning_rate": 4.070175438596491e-06, + "loss": 0.969, + "num_tokens": 1780048.0, + "step": 709 + }, + { + "epoch": 0.03586945539052238, + "grad_norm": 6.944403648376465, + "learning_rate": 4.046783625730995e-06, + "loss": 1.0751, + "num_tokens": 1782941.0, + "step": 710 + }, + { + "epoch": 0.03591997575022734, + "grad_norm": 9.960785865783691, + "learning_rate": 4.023391812865497e-06, + "loss": 1.2743, + "num_tokens": 1785071.0, + "step": 711 + }, + { + "epoch": 0.035970496109932304, + "grad_norm": 9.703871726989746, + "learning_rate": 4.000000000000001e-06, + "loss": 1.164, + "num_tokens": 1786819.0, + "step": 712 + }, + { + "epoch": 0.03602101646963726, + "grad_norm": 7.795428276062012, + "learning_rate": 3.976608187134503e-06, + "loss": 0.8724, + "num_tokens": 1789112.0, + "step": 713 + }, + { + "epoch": 0.03607153682934223, + "grad_norm": 7.782642364501953, + "learning_rate": 3.953216374269006e-06, + "loss": 1.173, + "num_tokens": 1792035.0, + "step": 714 + }, + { + "epoch": 0.03612205718904719, + "grad_norm": 8.358377456665039, + "learning_rate": 3.929824561403509e-06, + "loss": 1.0219, + "num_tokens": 1794132.0, + "step": 715 + }, + { + "epoch": 0.036172577548752145, + "grad_norm": 8.246298789978027, + "learning_rate": 3.906432748538012e-06, + "loss": 1.1607, + "num_tokens": 1796165.0, + "step": 716 + }, + { + "epoch": 0.03622309790845711, + "grad_norm": 7.625123977661133, + "learning_rate": 3.883040935672515e-06, + "loss": 1.0824, + "num_tokens": 1799181.0, + "step": 717 + }, + { + "epoch": 0.03627361826816207, + "grad_norm": 9.469755172729492, + "learning_rate": 3.859649122807018e-06, + "loss": 1.3813, + "num_tokens": 1801458.0, + "step": 718 + }, + { + "epoch": 0.03632413862786703, + "grad_norm": 7.422763824462891, + "learning_rate": 3.83625730994152e-06, + "loss": 1.0302, + "num_tokens": 1803998.0, + "step": 719 + }, + { + "epoch": 0.036374658987571994, + "grad_norm": 8.212010383605957, + "learning_rate": 3.812865497076024e-06, + "loss": 1.1324, + "num_tokens": 1806536.0, + "step": 720 + }, + { + "epoch": 0.03642517934727695, + "grad_norm": 8.374848365783691, + "learning_rate": 3.789473684210527e-06, + "loss": 1.2069, + "num_tokens": 1808718.0, + "step": 721 + }, + { + "epoch": 0.03647569970698191, + "grad_norm": 7.5553765296936035, + "learning_rate": 3.7660818713450298e-06, + "loss": 0.9659, + "num_tokens": 1811216.0, + "step": 722 + }, + { + "epoch": 0.036526220066686876, + "grad_norm": 7.314768314361572, + "learning_rate": 3.7426900584795324e-06, + "loss": 1.1061, + "num_tokens": 1813826.0, + "step": 723 + }, + { + "epoch": 0.036576740426391835, + "grad_norm": 9.64307689666748, + "learning_rate": 3.7192982456140354e-06, + "loss": 0.8974, + "num_tokens": 1815386.0, + "step": 724 + }, + { + "epoch": 0.036627260786096794, + "grad_norm": 8.944470405578613, + "learning_rate": 3.6959064327485384e-06, + "loss": 0.9039, + "num_tokens": 1817754.0, + "step": 725 + }, + { + "epoch": 0.03667778114580176, + "grad_norm": 7.976940631866455, + "learning_rate": 3.6725146198830414e-06, + "loss": 1.2097, + "num_tokens": 1820242.0, + "step": 726 + }, + { + "epoch": 0.03672830150550672, + "grad_norm": 7.656001091003418, + "learning_rate": 3.6491228070175443e-06, + "loss": 1.2878, + "num_tokens": 1822914.0, + "step": 727 + }, + { + "epoch": 0.03677882186521168, + "grad_norm": 7.573875904083252, + "learning_rate": 3.625730994152047e-06, + "loss": 1.038, + "num_tokens": 1825849.0, + "step": 728 + }, + { + "epoch": 0.03682934222491664, + "grad_norm": 8.411222457885742, + "learning_rate": 3.60233918128655e-06, + "loss": 1.135, + "num_tokens": 1827851.0, + "step": 729 + }, + { + "epoch": 0.0368798625846216, + "grad_norm": 7.37949800491333, + "learning_rate": 3.578947368421053e-06, + "loss": 1.0599, + "num_tokens": 1830695.0, + "step": 730 + }, + { + "epoch": 0.036930382944326566, + "grad_norm": 8.992451667785645, + "learning_rate": 3.555555555555556e-06, + "loss": 0.8728, + "num_tokens": 1832458.0, + "step": 731 + }, + { + "epoch": 0.036980903304031525, + "grad_norm": 7.820367336273193, + "learning_rate": 3.532163742690059e-06, + "loss": 0.9547, + "num_tokens": 1834649.0, + "step": 732 + }, + { + "epoch": 0.03703142366373648, + "grad_norm": 7.444509029388428, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.0135, + "num_tokens": 1837215.0, + "step": 733 + }, + { + "epoch": 0.03708194402344145, + "grad_norm": 8.364083290100098, + "learning_rate": 3.4853801169590645e-06, + "loss": 1.0698, + "num_tokens": 1839859.0, + "step": 734 + }, + { + "epoch": 0.03713246438314641, + "grad_norm": 9.103501319885254, + "learning_rate": 3.4619883040935675e-06, + "loss": 1.1663, + "num_tokens": 1841931.0, + "step": 735 + }, + { + "epoch": 0.037182984742851366, + "grad_norm": 7.133014678955078, + "learning_rate": 3.4385964912280705e-06, + "loss": 0.8265, + "num_tokens": 1844242.0, + "step": 736 + }, + { + "epoch": 0.03723350510255633, + "grad_norm": 7.1370086669921875, + "learning_rate": 3.4152046783625735e-06, + "loss": 1.1043, + "num_tokens": 1847191.0, + "step": 737 + }, + { + "epoch": 0.03728402546226129, + "grad_norm": 7.817936420440674, + "learning_rate": 3.391812865497076e-06, + "loss": 0.8801, + "num_tokens": 1849393.0, + "step": 738 + }, + { + "epoch": 0.037334545821966256, + "grad_norm": 7.88914680480957, + "learning_rate": 3.368421052631579e-06, + "loss": 1.0541, + "num_tokens": 1852431.0, + "step": 739 + }, + { + "epoch": 0.037385066181671214, + "grad_norm": 6.993699550628662, + "learning_rate": 3.345029239766082e-06, + "loss": 1.0171, + "num_tokens": 1854950.0, + "step": 740 + }, + { + "epoch": 0.03743558654137617, + "grad_norm": 7.112022399902344, + "learning_rate": 3.321637426900585e-06, + "loss": 1.1324, + "num_tokens": 1858077.0, + "step": 741 + }, + { + "epoch": 0.03748610690108114, + "grad_norm": 8.271239280700684, + "learning_rate": 3.2982456140350885e-06, + "loss": 1.2346, + "num_tokens": 1860820.0, + "step": 742 + }, + { + "epoch": 0.0375366272607861, + "grad_norm": 8.271240234375, + "learning_rate": 3.2748538011695906e-06, + "loss": 1.1738, + "num_tokens": 1863288.0, + "step": 743 + }, + { + "epoch": 0.037587147620491056, + "grad_norm": 7.763464450836182, + "learning_rate": 3.2514619883040936e-06, + "loss": 1.1241, + "num_tokens": 1865761.0, + "step": 744 + }, + { + "epoch": 0.03763766798019602, + "grad_norm": 8.51482105255127, + "learning_rate": 3.2280701754385966e-06, + "loss": 0.9584, + "num_tokens": 1868060.0, + "step": 745 + }, + { + "epoch": 0.03768818833990098, + "grad_norm": 6.616514682769775, + "learning_rate": 3.2046783625731e-06, + "loss": 1.1224, + "num_tokens": 1870937.0, + "step": 746 + }, + { + "epoch": 0.03773870869960594, + "grad_norm": 8.514359474182129, + "learning_rate": 3.181286549707603e-06, + "loss": 0.9877, + "num_tokens": 1872817.0, + "step": 747 + }, + { + "epoch": 0.037789229059310904, + "grad_norm": 7.269506454467773, + "learning_rate": 3.157894736842105e-06, + "loss": 0.8583, + "num_tokens": 1875372.0, + "step": 748 + }, + { + "epoch": 0.03783974941901586, + "grad_norm": 8.217002868652344, + "learning_rate": 3.1345029239766086e-06, + "loss": 0.9026, + "num_tokens": 1877964.0, + "step": 749 + }, + { + "epoch": 0.03789026977872082, + "grad_norm": 7.169931888580322, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.8889, + "num_tokens": 1880246.0, + "step": 750 + }, + { + "epoch": 0.03794079013842579, + "grad_norm": 9.068582534790039, + "learning_rate": 3.0877192982456146e-06, + "loss": 1.0274, + "num_tokens": 1882208.0, + "step": 751 + }, + { + "epoch": 0.037991310498130745, + "grad_norm": 7.811886310577393, + "learning_rate": 3.0643274853801176e-06, + "loss": 1.0275, + "num_tokens": 1884497.0, + "step": 752 + }, + { + "epoch": 0.03804183085783571, + "grad_norm": 7.560867786407471, + "learning_rate": 3.04093567251462e-06, + "loss": 1.2229, + "num_tokens": 1887931.0, + "step": 753 + }, + { + "epoch": 0.03809235121754067, + "grad_norm": 10.090558052062988, + "learning_rate": 3.017543859649123e-06, + "loss": 1.1087, + "num_tokens": 1889507.0, + "step": 754 + }, + { + "epoch": 0.03814287157724563, + "grad_norm": 7.652523994445801, + "learning_rate": 2.994152046783626e-06, + "loss": 1.0768, + "num_tokens": 1891792.0, + "step": 755 + }, + { + "epoch": 0.038193391936950594, + "grad_norm": 9.040148735046387, + "learning_rate": 2.970760233918129e-06, + "loss": 1.063, + "num_tokens": 1893867.0, + "step": 756 + }, + { + "epoch": 0.03824391229665555, + "grad_norm": 9.025566101074219, + "learning_rate": 2.9473684210526317e-06, + "loss": 1.3458, + "num_tokens": 1896398.0, + "step": 757 + }, + { + "epoch": 0.03829443265636051, + "grad_norm": 8.337574005126953, + "learning_rate": 2.9239766081871347e-06, + "loss": 1.2345, + "num_tokens": 1898876.0, + "step": 758 + }, + { + "epoch": 0.038344953016065476, + "grad_norm": 7.611428737640381, + "learning_rate": 2.9005847953216377e-06, + "loss": 1.2304, + "num_tokens": 1901818.0, + "step": 759 + }, + { + "epoch": 0.038395473375770435, + "grad_norm": 8.04947280883789, + "learning_rate": 2.8771929824561407e-06, + "loss": 0.9243, + "num_tokens": 1904039.0, + "step": 760 + }, + { + "epoch": 0.038445993735475394, + "grad_norm": 7.743322849273682, + "learning_rate": 2.8538011695906437e-06, + "loss": 0.7342, + "num_tokens": 1906383.0, + "step": 761 + }, + { + "epoch": 0.03849651409518036, + "grad_norm": 8.0184965133667, + "learning_rate": 2.8304093567251463e-06, + "loss": 0.8679, + "num_tokens": 1908483.0, + "step": 762 + }, + { + "epoch": 0.03854703445488532, + "grad_norm": 7.2956767082214355, + "learning_rate": 2.8070175438596493e-06, + "loss": 1.2036, + "num_tokens": 1911515.0, + "step": 763 + }, + { + "epoch": 0.03859755481459028, + "grad_norm": 7.843127727508545, + "learning_rate": 2.7836257309941523e-06, + "loss": 1.079, + "num_tokens": 1914125.0, + "step": 764 + }, + { + "epoch": 0.03864807517429524, + "grad_norm": 7.176553249359131, + "learning_rate": 2.7602339181286553e-06, + "loss": 1.3325, + "num_tokens": 1917755.0, + "step": 765 + }, + { + "epoch": 0.0386985955340002, + "grad_norm": 7.983217716217041, + "learning_rate": 2.7368421052631583e-06, + "loss": 1.0674, + "num_tokens": 1920047.0, + "step": 766 + }, + { + "epoch": 0.038749115893705166, + "grad_norm": 8.650400161743164, + "learning_rate": 2.713450292397661e-06, + "loss": 0.9351, + "num_tokens": 1922172.0, + "step": 767 + }, + { + "epoch": 0.038799636253410125, + "grad_norm": 7.661126136779785, + "learning_rate": 2.690058479532164e-06, + "loss": 0.996, + "num_tokens": 1925295.0, + "step": 768 + }, + { + "epoch": 0.03885015661311508, + "grad_norm": 7.986940860748291, + "learning_rate": 2.666666666666667e-06, + "loss": 1.0135, + "num_tokens": 1928039.0, + "step": 769 + }, + { + "epoch": 0.03890067697282005, + "grad_norm": 8.285640716552734, + "learning_rate": 2.64327485380117e-06, + "loss": 0.9372, + "num_tokens": 1930413.0, + "step": 770 + }, + { + "epoch": 0.03895119733252501, + "grad_norm": 10.60688304901123, + "learning_rate": 2.619883040935673e-06, + "loss": 1.2594, + "num_tokens": 1932018.0, + "step": 771 + }, + { + "epoch": 0.039001717692229966, + "grad_norm": 9.230753898620605, + "learning_rate": 2.5964912280701754e-06, + "loss": 1.3233, + "num_tokens": 1934212.0, + "step": 772 + }, + { + "epoch": 0.03905223805193493, + "grad_norm": 6.611055850982666, + "learning_rate": 2.5730994152046784e-06, + "loss": 0.8819, + "num_tokens": 1936726.0, + "step": 773 + }, + { + "epoch": 0.03910275841163989, + "grad_norm": 7.984802722930908, + "learning_rate": 2.5497076023391814e-06, + "loss": 1.0696, + "num_tokens": 1939405.0, + "step": 774 + }, + { + "epoch": 0.03915327877134485, + "grad_norm": 9.643743515014648, + "learning_rate": 2.5263157894736844e-06, + "loss": 0.9771, + "num_tokens": 1941191.0, + "step": 775 + }, + { + "epoch": 0.039203799131049814, + "grad_norm": 8.063180923461914, + "learning_rate": 2.5029239766081874e-06, + "loss": 1.2027, + "num_tokens": 1943754.0, + "step": 776 + }, + { + "epoch": 0.03925431949075477, + "grad_norm": 9.447686195373535, + "learning_rate": 2.4795321637426904e-06, + "loss": 1.1171, + "num_tokens": 1945603.0, + "step": 777 + }, + { + "epoch": 0.03930483985045974, + "grad_norm": 8.815810203552246, + "learning_rate": 2.456140350877193e-06, + "loss": 0.8979, + "num_tokens": 1947407.0, + "step": 778 + }, + { + "epoch": 0.0393553602101647, + "grad_norm": 7.199434757232666, + "learning_rate": 2.432748538011696e-06, + "loss": 1.0377, + "num_tokens": 1950316.0, + "step": 779 + }, + { + "epoch": 0.039405880569869656, + "grad_norm": 7.688726902008057, + "learning_rate": 2.409356725146199e-06, + "loss": 1.3621, + "num_tokens": 1953387.0, + "step": 780 + }, + { + "epoch": 0.03945640092957462, + "grad_norm": 11.592430114746094, + "learning_rate": 2.385964912280702e-06, + "loss": 0.9384, + "num_tokens": 1954842.0, + "step": 781 + }, + { + "epoch": 0.03950692128927958, + "grad_norm": 7.321707248687744, + "learning_rate": 2.362573099415205e-06, + "loss": 1.2496, + "num_tokens": 1957757.0, + "step": 782 + }, + { + "epoch": 0.03955744164898454, + "grad_norm": 7.553733825683594, + "learning_rate": 2.3391812865497075e-06, + "loss": 0.8977, + "num_tokens": 1960169.0, + "step": 783 + }, + { + "epoch": 0.039607962008689504, + "grad_norm": 8.673162460327148, + "learning_rate": 2.3157894736842105e-06, + "loss": 1.0852, + "num_tokens": 1962432.0, + "step": 784 + }, + { + "epoch": 0.03965848236839446, + "grad_norm": 6.952296257019043, + "learning_rate": 2.292397660818714e-06, + "loss": 0.9882, + "num_tokens": 1965369.0, + "step": 785 + }, + { + "epoch": 0.03970900272809942, + "grad_norm": 9.117610931396484, + "learning_rate": 2.2690058479532165e-06, + "loss": 1.0798, + "num_tokens": 1967384.0, + "step": 786 + }, + { + "epoch": 0.03975952308780439, + "grad_norm": 7.989621162414551, + "learning_rate": 2.2456140350877195e-06, + "loss": 0.9831, + "num_tokens": 1969931.0, + "step": 787 + }, + { + "epoch": 0.039810043447509345, + "grad_norm": 10.279632568359375, + "learning_rate": 2.222222222222222e-06, + "loss": 1.0699, + "num_tokens": 1971703.0, + "step": 788 + }, + { + "epoch": 0.039860563807214304, + "grad_norm": 7.154787063598633, + "learning_rate": 2.1988304093567255e-06, + "loss": 0.9893, + "num_tokens": 1974277.0, + "step": 789 + }, + { + "epoch": 0.03991108416691927, + "grad_norm": 8.738092422485352, + "learning_rate": 2.1754385964912285e-06, + "loss": 0.9226, + "num_tokens": 1976464.0, + "step": 790 + }, + { + "epoch": 0.03996160452662423, + "grad_norm": 7.969451427459717, + "learning_rate": 2.152046783625731e-06, + "loss": 1.2201, + "num_tokens": 1979142.0, + "step": 791 + }, + { + "epoch": 0.040012124886329194, + "grad_norm": 8.174847602844238, + "learning_rate": 2.128654970760234e-06, + "loss": 1.005, + "num_tokens": 1981640.0, + "step": 792 + }, + { + "epoch": 0.04006264524603415, + "grad_norm": 8.039349555969238, + "learning_rate": 2.105263157894737e-06, + "loss": 1.2439, + "num_tokens": 1984629.0, + "step": 793 + }, + { + "epoch": 0.04011316560573911, + "grad_norm": 7.2468647956848145, + "learning_rate": 2.08187134502924e-06, + "loss": 1.1084, + "num_tokens": 1987710.0, + "step": 794 + }, + { + "epoch": 0.040163685965444076, + "grad_norm": 8.117630958557129, + "learning_rate": 2.058479532163743e-06, + "loss": 0.9846, + "num_tokens": 1989843.0, + "step": 795 + }, + { + "epoch": 0.040214206325149035, + "grad_norm": 9.395238876342773, + "learning_rate": 2.0350877192982456e-06, + "loss": 1.3396, + "num_tokens": 1991919.0, + "step": 796 + }, + { + "epoch": 0.040264726684853994, + "grad_norm": 9.018524169921875, + "learning_rate": 2.0116959064327486e-06, + "loss": 1.4016, + "num_tokens": 1994542.0, + "step": 797 + }, + { + "epoch": 0.04031524704455896, + "grad_norm": 8.211974143981934, + "learning_rate": 1.9883040935672516e-06, + "loss": 1.1266, + "num_tokens": 1997033.0, + "step": 798 + }, + { + "epoch": 0.04036576740426392, + "grad_norm": 7.088583946228027, + "learning_rate": 1.9649122807017546e-06, + "loss": 1.1829, + "num_tokens": 2000160.0, + "step": 799 + }, + { + "epoch": 0.040416287763968876, + "grad_norm": 5.988295555114746, + "learning_rate": 1.9415204678362576e-06, + "loss": 1.0047, + "num_tokens": 2003561.0, + "step": 800 + }, + { + "epoch": 0.04046680812367384, + "grad_norm": 6.9581098556518555, + "learning_rate": 1.91812865497076e-06, + "loss": 1.0078, + "num_tokens": 2006460.0, + "step": 801 + }, + { + "epoch": 0.0405173284833788, + "grad_norm": 7.740414619445801, + "learning_rate": 1.8947368421052634e-06, + "loss": 1.1291, + "num_tokens": 2008884.0, + "step": 802 + }, + { + "epoch": 0.040567848843083766, + "grad_norm": 7.810000419616699, + "learning_rate": 1.8713450292397662e-06, + "loss": 1.2379, + "num_tokens": 2011827.0, + "step": 803 + }, + { + "epoch": 0.040618369202788725, + "grad_norm": 8.57465648651123, + "learning_rate": 1.8479532163742692e-06, + "loss": 0.9495, + "num_tokens": 2013880.0, + "step": 804 + }, + { + "epoch": 0.04066888956249368, + "grad_norm": 8.974221229553223, + "learning_rate": 1.8245614035087722e-06, + "loss": 1.0639, + "num_tokens": 2015994.0, + "step": 805 + }, + { + "epoch": 0.04071940992219865, + "grad_norm": 9.056292533874512, + "learning_rate": 1.801169590643275e-06, + "loss": 1.2913, + "num_tokens": 2018137.0, + "step": 806 + }, + { + "epoch": 0.04076993028190361, + "grad_norm": 8.951916694641113, + "learning_rate": 1.777777777777778e-06, + "loss": 0.8866, + "num_tokens": 2020073.0, + "step": 807 + }, + { + "epoch": 0.040820450641608566, + "grad_norm": 8.470370292663574, + "learning_rate": 1.7543859649122807e-06, + "loss": 1.2217, + "num_tokens": 2022868.0, + "step": 808 + }, + { + "epoch": 0.04087097100131353, + "grad_norm": 7.56195068359375, + "learning_rate": 1.7309941520467837e-06, + "loss": 1.0755, + "num_tokens": 2025380.0, + "step": 809 + }, + { + "epoch": 0.04092149136101849, + "grad_norm": 8.2066068649292, + "learning_rate": 1.7076023391812867e-06, + "loss": 1.0243, + "num_tokens": 2027597.0, + "step": 810 + }, + { + "epoch": 0.04097201172072345, + "grad_norm": 8.10889720916748, + "learning_rate": 1.6842105263157895e-06, + "loss": 0.915, + "num_tokens": 2030015.0, + "step": 811 + }, + { + "epoch": 0.041022532080428414, + "grad_norm": 7.229382038116455, + "learning_rate": 1.6608187134502925e-06, + "loss": 1.0504, + "num_tokens": 2032587.0, + "step": 812 + }, + { + "epoch": 0.04107305244013337, + "grad_norm": 7.046276092529297, + "learning_rate": 1.6374269005847953e-06, + "loss": 1.2248, + "num_tokens": 2035825.0, + "step": 813 + }, + { + "epoch": 0.04112357279983833, + "grad_norm": 11.137956619262695, + "learning_rate": 1.6140350877192983e-06, + "loss": 1.3771, + "num_tokens": 2037823.0, + "step": 814 + }, + { + "epoch": 0.0411740931595433, + "grad_norm": 7.041586875915527, + "learning_rate": 1.5906432748538015e-06, + "loss": 1.1341, + "num_tokens": 2040644.0, + "step": 815 + }, + { + "epoch": 0.041224613519248256, + "grad_norm": 8.43753433227539, + "learning_rate": 1.5672514619883043e-06, + "loss": 1.1734, + "num_tokens": 2043451.0, + "step": 816 + }, + { + "epoch": 0.04127513387895322, + "grad_norm": 7.803639888763428, + "learning_rate": 1.5438596491228073e-06, + "loss": 1.0219, + "num_tokens": 2045716.0, + "step": 817 + }, + { + "epoch": 0.04132565423865818, + "grad_norm": 8.355565071105957, + "learning_rate": 1.52046783625731e-06, + "loss": 1.1636, + "num_tokens": 2048078.0, + "step": 818 + }, + { + "epoch": 0.04137617459836314, + "grad_norm": 7.0593719482421875, + "learning_rate": 1.497076023391813e-06, + "loss": 1.1711, + "num_tokens": 2051289.0, + "step": 819 + }, + { + "epoch": 0.041426694958068104, + "grad_norm": 7.683343410491943, + "learning_rate": 1.4736842105263159e-06, + "loss": 1.0823, + "num_tokens": 2053840.0, + "step": 820 + }, + { + "epoch": 0.04147721531777306, + "grad_norm": 7.099639415740967, + "learning_rate": 1.4502923976608189e-06, + "loss": 0.9888, + "num_tokens": 2056574.0, + "step": 821 + }, + { + "epoch": 0.04152773567747802, + "grad_norm": 7.390129089355469, + "learning_rate": 1.4269005847953219e-06, + "loss": 0.9027, + "num_tokens": 2059150.0, + "step": 822 + }, + { + "epoch": 0.04157825603718299, + "grad_norm": 7.881406784057617, + "learning_rate": 1.4035087719298246e-06, + "loss": 1.1576, + "num_tokens": 2061871.0, + "step": 823 + }, + { + "epoch": 0.041628776396887945, + "grad_norm": 8.215656280517578, + "learning_rate": 1.3801169590643276e-06, + "loss": 1.2287, + "num_tokens": 2064119.0, + "step": 824 + }, + { + "epoch": 0.041679296756592904, + "grad_norm": 8.807376861572266, + "learning_rate": 1.3567251461988304e-06, + "loss": 1.1597, + "num_tokens": 2066710.0, + "step": 825 + }, + { + "epoch": 0.04172981711629787, + "grad_norm": 10.283782005310059, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.3864, + "num_tokens": 2068836.0, + "step": 826 + }, + { + "epoch": 0.04178033747600283, + "grad_norm": 8.01341438293457, + "learning_rate": 1.3099415204678364e-06, + "loss": 0.9962, + "num_tokens": 2071042.0, + "step": 827 + }, + { + "epoch": 0.041830857835707794, + "grad_norm": 6.652347087860107, + "learning_rate": 1.2865497076023392e-06, + "loss": 0.9938, + "num_tokens": 2074111.0, + "step": 828 + }, + { + "epoch": 0.04188137819541275, + "grad_norm": 8.33005142211914, + "learning_rate": 1.2631578947368422e-06, + "loss": 1.2122, + "num_tokens": 2076286.0, + "step": 829 + }, + { + "epoch": 0.04193189855511771, + "grad_norm": 8.675084114074707, + "learning_rate": 1.2397660818713452e-06, + "loss": 1.2915, + "num_tokens": 2079255.0, + "step": 830 + }, + { + "epoch": 0.041982418914822676, + "grad_norm": 8.922797203063965, + "learning_rate": 1.216374269005848e-06, + "loss": 0.9995, + "num_tokens": 2080988.0, + "step": 831 + }, + { + "epoch": 0.042032939274527635, + "grad_norm": 7.27795934677124, + "learning_rate": 1.192982456140351e-06, + "loss": 0.9171, + "num_tokens": 2083442.0, + "step": 832 + }, + { + "epoch": 0.042083459634232594, + "grad_norm": 7.669681549072266, + "learning_rate": 1.1695906432748538e-06, + "loss": 1.1638, + "num_tokens": 2086087.0, + "step": 833 + }, + { + "epoch": 0.04213397999393756, + "grad_norm": 7.834378719329834, + "learning_rate": 1.146198830409357e-06, + "loss": 0.9933, + "num_tokens": 2088702.0, + "step": 834 + }, + { + "epoch": 0.04218450035364252, + "grad_norm": 7.798491477966309, + "learning_rate": 1.1228070175438598e-06, + "loss": 1.1184, + "num_tokens": 2091018.0, + "step": 835 + }, + { + "epoch": 0.042235020713347476, + "grad_norm": 9.016721725463867, + "learning_rate": 1.0994152046783627e-06, + "loss": 1.1585, + "num_tokens": 2093548.0, + "step": 836 + }, + { + "epoch": 0.04228554107305244, + "grad_norm": 7.152202606201172, + "learning_rate": 1.0760233918128655e-06, + "loss": 1.117, + "num_tokens": 2096084.0, + "step": 837 + }, + { + "epoch": 0.0423360614327574, + "grad_norm": 6.945435047149658, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.1221, + "num_tokens": 2098957.0, + "step": 838 + }, + { + "epoch": 0.04238658179246236, + "grad_norm": 7.152926445007324, + "learning_rate": 1.0292397660818715e-06, + "loss": 1.2523, + "num_tokens": 2102371.0, + "step": 839 + }, + { + "epoch": 0.042437102152167325, + "grad_norm": 7.688709259033203, + "learning_rate": 1.0058479532163743e-06, + "loss": 1.2237, + "num_tokens": 2105072.0, + "step": 840 + }, + { + "epoch": 0.04248762251187228, + "grad_norm": 7.0494513511657715, + "learning_rate": 9.824561403508773e-07, + "loss": 0.8976, + "num_tokens": 2108148.0, + "step": 841 + }, + { + "epoch": 0.04253814287157725, + "grad_norm": 7.69727897644043, + "learning_rate": 9.5906432748538e-07, + "loss": 1.2246, + "num_tokens": 2111036.0, + "step": 842 + }, + { + "epoch": 0.04258866323128221, + "grad_norm": 7.972696304321289, + "learning_rate": 9.356725146198831e-07, + "loss": 1.213, + "num_tokens": 2113586.0, + "step": 843 + }, + { + "epoch": 0.042639183590987166, + "grad_norm": 8.766176223754883, + "learning_rate": 9.122807017543861e-07, + "loss": 1.2092, + "num_tokens": 2115818.0, + "step": 844 + }, + { + "epoch": 0.04268970395069213, + "grad_norm": 8.013957977294922, + "learning_rate": 8.88888888888889e-07, + "loss": 0.9124, + "num_tokens": 2118011.0, + "step": 845 + }, + { + "epoch": 0.04274022431039709, + "grad_norm": 9.98165512084961, + "learning_rate": 8.654970760233919e-07, + "loss": 1.0624, + "num_tokens": 2119642.0, + "step": 846 + }, + { + "epoch": 0.04279074467010205, + "grad_norm": 8.050074577331543, + "learning_rate": 8.421052631578948e-07, + "loss": 1.2296, + "num_tokens": 2122420.0, + "step": 847 + }, + { + "epoch": 0.042841265029807014, + "grad_norm": 9.614937782287598, + "learning_rate": 8.187134502923977e-07, + "loss": 1.1791, + "num_tokens": 2124677.0, + "step": 848 + }, + { + "epoch": 0.04289178538951197, + "grad_norm": 9.438997268676758, + "learning_rate": 7.953216374269008e-07, + "loss": 1.3136, + "num_tokens": 2126694.0, + "step": 849 + }, + { + "epoch": 0.04294230574921693, + "grad_norm": 6.885341167449951, + "learning_rate": 7.719298245614036e-07, + "loss": 1.0465, + "num_tokens": 2129810.0, + "step": 850 + }, + { + "epoch": 0.0429928261089219, + "grad_norm": 7.8129987716674805, + "learning_rate": 7.485380116959065e-07, + "loss": 1.0448, + "num_tokens": 2132221.0, + "step": 851 + }, + { + "epoch": 0.043043346468626856, + "grad_norm": 8.12570571899414, + "learning_rate": 7.251461988304094e-07, + "loss": 1.1046, + "num_tokens": 2134522.0, + "step": 852 + }, + { + "epoch": 0.043093866828331814, + "grad_norm": 7.0929765701293945, + "learning_rate": 7.017543859649123e-07, + "loss": 1.2739, + "num_tokens": 2138194.0, + "step": 853 + }, + { + "epoch": 0.04314438718803678, + "grad_norm": 9.29609203338623, + "learning_rate": 6.783625730994152e-07, + "loss": 0.9849, + "num_tokens": 2140254.0, + "step": 854 + }, + { + "epoch": 0.04319490754774174, + "grad_norm": 7.017616271972656, + "learning_rate": 6.549707602339182e-07, + "loss": 0.8731, + "num_tokens": 2142805.0, + "step": 855 + }, + { + "epoch": 0.043245427907446704, + "grad_norm": 13.537893295288086, + "learning_rate": 6.315789473684211e-07, + "loss": 1.1423, + "num_tokens": 2144411.0, + "step": 856 + }, + { + "epoch": 0.04329594826715166, + "grad_norm": 7.984657287597656, + "learning_rate": 6.08187134502924e-07, + "loss": 1.1312, + "num_tokens": 2147347.0, + "step": 857 + }, + { + "epoch": 0.04334646862685662, + "grad_norm": 6.264598369598389, + "learning_rate": 5.847953216374269e-07, + "loss": 0.9945, + "num_tokens": 2150592.0, + "step": 858 + }, + { + "epoch": 0.04339698898656159, + "grad_norm": 7.226726055145264, + "learning_rate": 5.614035087719299e-07, + "loss": 1.2162, + "num_tokens": 2153510.0, + "step": 859 + }, + { + "epoch": 0.043447509346266545, + "grad_norm": 6.643986701965332, + "learning_rate": 5.380116959064328e-07, + "loss": 1.1668, + "num_tokens": 2156787.0, + "step": 860 + }, + { + "epoch": 0.043498029705971504, + "grad_norm": 7.309080123901367, + "learning_rate": 5.146198830409358e-07, + "loss": 0.9684, + "num_tokens": 2159289.0, + "step": 861 + }, + { + "epoch": 0.04354855006567647, + "grad_norm": 9.393473625183105, + "learning_rate": 4.912280701754387e-07, + "loss": 0.9306, + "num_tokens": 2161002.0, + "step": 862 + }, + { + "epoch": 0.04359907042538143, + "grad_norm": 7.447899341583252, + "learning_rate": 4.6783625730994155e-07, + "loss": 0.9799, + "num_tokens": 2163601.0, + "step": 863 + }, + { + "epoch": 0.04364959078508639, + "grad_norm": 6.775162220001221, + "learning_rate": 4.444444444444445e-07, + "loss": 0.9197, + "num_tokens": 2166354.0, + "step": 864 + }, + { + "epoch": 0.04370011114479135, + "grad_norm": 8.611212730407715, + "learning_rate": 4.210526315789474e-07, + "loss": 1.2166, + "num_tokens": 2168993.0, + "step": 865 + }, + { + "epoch": 0.04375063150449631, + "grad_norm": 9.290692329406738, + "learning_rate": 3.976608187134504e-07, + "loss": 1.0802, + "num_tokens": 2171503.0, + "step": 866 + }, + { + "epoch": 0.043801151864201276, + "grad_norm": 8.401673316955566, + "learning_rate": 3.7426900584795327e-07, + "loss": 1.1856, + "num_tokens": 2173768.0, + "step": 867 + }, + { + "epoch": 0.043851672223906235, + "grad_norm": 7.8749189376831055, + "learning_rate": 3.5087719298245616e-07, + "loss": 1.1462, + "num_tokens": 2176249.0, + "step": 868 + }, + { + "epoch": 0.043902192583611194, + "grad_norm": 8.682783126831055, + "learning_rate": 3.274853801169591e-07, + "loss": 0.9509, + "num_tokens": 2178570.0, + "step": 869 + }, + { + "epoch": 0.04395271294331616, + "grad_norm": 9.697423934936523, + "learning_rate": 3.04093567251462e-07, + "loss": 1.0019, + "num_tokens": 2180467.0, + "step": 870 + }, + { + "epoch": 0.04400323330302112, + "grad_norm": 7.6728291511535645, + "learning_rate": 2.8070175438596494e-07, + "loss": 0.9297, + "num_tokens": 2182948.0, + "step": 871 + }, + { + "epoch": 0.044053753662726076, + "grad_norm": 7.218677997589111, + "learning_rate": 2.573099415204679e-07, + "loss": 0.9871, + "num_tokens": 2185567.0, + "step": 872 + }, + { + "epoch": 0.04410427402243104, + "grad_norm": 10.747563362121582, + "learning_rate": 2.3391812865497077e-07, + "loss": 0.9979, + "num_tokens": 2187013.0, + "step": 873 + }, + { + "epoch": 0.044154794382136, + "grad_norm": 6.815659523010254, + "learning_rate": 2.105263157894737e-07, + "loss": 1.2435, + "num_tokens": 2190562.0, + "step": 874 + }, + { + "epoch": 0.04420531474184096, + "grad_norm": 7.546515464782715, + "learning_rate": 1.8713450292397663e-07, + "loss": 1.0881, + "num_tokens": 2193167.0, + "step": 875 + }, + { + "epoch": 0.044255835101545925, + "grad_norm": 7.302131652832031, + "learning_rate": 1.6374269005847955e-07, + "loss": 1.0084, + "num_tokens": 2195822.0, + "step": 876 + }, + { + "epoch": 0.04430635546125088, + "grad_norm": 11.80875301361084, + "learning_rate": 1.4035087719298247e-07, + "loss": 1.2438, + "num_tokens": 2197358.0, + "step": 877 + }, + { + "epoch": 0.04435687582095584, + "grad_norm": 7.794970989227295, + "learning_rate": 1.1695906432748539e-07, + "loss": 1.1909, + "num_tokens": 2199833.0, + "step": 878 + }, + { + "epoch": 0.04440739618066081, + "grad_norm": 8.794548988342285, + "learning_rate": 9.356725146198832e-08, + "loss": 1.3243, + "num_tokens": 2202398.0, + "step": 879 + }, + { + "epoch": 0.044457916540365766, + "grad_norm": 6.986159324645996, + "learning_rate": 7.017543859649123e-08, + "loss": 1.0617, + "num_tokens": 2205223.0, + "step": 880 + }, + { + "epoch": 0.04450843690007073, + "grad_norm": 7.635352611541748, + "learning_rate": 4.678362573099416e-08, + "loss": 0.982, + "num_tokens": 2207936.0, + "step": 881 + }, + { + "epoch": 0.04455895725977569, + "grad_norm": 7.313915729522705, + "learning_rate": 2.339181286549708e-08, + "loss": 0.9467, + "num_tokens": 2210256.0, + "step": 882 + }, + { + "epoch": 0.04455895725977569, + "step": 882, + "total_flos": 5.546190381567181e+16, + "train_loss": 1.1311103994208398, + "train_runtime": 369.4151, + "train_samples_per_second": 19.1, + "train_steps_per_second": 2.388 + } + ], + "logging_steps": 1, + "max_steps": 882, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.546190381567181e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}