diff --git "a/checkpoint-2500/trainer_state.json" "b/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2500/trainer_state.json" @@ -0,0 +1,17521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2219753360737697, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008887901344295079, + "grad_norm": 0.8810775279998779, + "learning_rate": 0.0, + "loss": 1.9172, + "step": 1 + }, + { + "epoch": 0.0017775802688590157, + "grad_norm": 0.7469067573547363, + "learning_rate": 2.1533827903669654e-05, + "loss": 1.8244, + "step": 2 + }, + { + "epoch": 0.0026663704032885236, + "grad_norm": 0.6811410784721375, + "learning_rate": 3.413030972429927e-05, + "loss": 1.926, + "step": 3 + }, + { + "epoch": 0.0035551605377180315, + "grad_norm": 0.5220848321914673, + "learning_rate": 4.306765580733931e-05, + "loss": 1.8387, + "step": 4 + }, + { + "epoch": 0.004443950672147539, + "grad_norm": 0.5415656566619873, + "learning_rate": 5e-05, + "loss": 1.8832, + "step": 5 + }, + { + "epoch": 0.005332740806577047, + "grad_norm": 0.3646649718284607, + "learning_rate": 5.5664137627968925e-05, + "loss": 1.8791, + "step": 6 + }, + { + "epoch": 0.006221530941006555, + "grad_norm": 0.42480552196502686, + "learning_rate": 6.0453097756108376e-05, + "loss": 1.7676, + "step": 7 + }, + { + "epoch": 0.007110321075436063, + "grad_norm": 0.38259413838386536, + "learning_rate": 6.460148371100896e-05, + "loss": 1.7888, + "step": 8 + }, + { + "epoch": 0.00799911120986557, + "grad_norm": 0.3880155086517334, + "learning_rate": 6.826061944859854e-05, + "loss": 1.7697, + "step": 9 + }, + { + "epoch": 0.008887901344295079, + "grad_norm": 0.5344939231872559, + "learning_rate": 7.153382790366967e-05, + "loss": 1.6064, + "step": 10 + }, + { + "epoch": 0.009776691478724587, + "grad_norm": 0.42493271827697754, + "learning_rate": 7.449480512024892e-05, + "loss": 1.7261, + "step": 11 + }, + { + "epoch": 0.010665481613154094, + "grad_norm": 0.49020832777023315, + "learning_rate": 7.719796553163858e-05, + "loss": 1.7062, + "step": 12 + }, + { + "epoch": 0.011554271747583602, + "grad_norm": 0.5008671283721924, + "learning_rate": 7.968463205835412e-05, + "loss": 1.7161, + "step": 13 + }, + { + "epoch": 0.01244306188201311, + "grad_norm": 0.6378142833709717, + "learning_rate": 8.198692565977803e-05, + "loss": 1.5226, + "step": 14 + }, + { + "epoch": 0.013331852016442618, + "grad_norm": 0.6101044416427612, + "learning_rate": 8.413030972429928e-05, + "loss": 1.5718, + "step": 15 + }, + { + "epoch": 0.014220642150872126, + "grad_norm": 0.4299439787864685, + "learning_rate": 8.613531161467861e-05, + "loss": 1.7133, + "step": 16 + }, + { + "epoch": 0.015109432285301634, + "grad_norm": 0.48655468225479126, + "learning_rate": 8.80187213861294e-05, + "loss": 1.5746, + "step": 17 + }, + { + "epoch": 0.01599822241973114, + "grad_norm": 0.577294647693634, + "learning_rate": 8.979444735226819e-05, + "loss": 1.4959, + "step": 18 + }, + { + "epoch": 0.016887012554160648, + "grad_norm": 0.5080670714378357, + "learning_rate": 9.147414002175752e-05, + "loss": 1.4947, + "step": 19 + }, + { + "epoch": 0.017775802688590157, + "grad_norm": 0.6289512515068054, + "learning_rate": 9.306765580733931e-05, + "loss": 1.5576, + "step": 20 + }, + { + "epoch": 0.018664592823019664, + "grad_norm": 0.5262102484703064, + "learning_rate": 9.458340748040766e-05, + "loss": 1.5228, + "step": 21 + }, + { + "epoch": 0.019553382957449173, + "grad_norm": 0.4739653170108795, + "learning_rate": 9.602863302391859e-05, + "loss": 1.5751, + "step": 22 + }, + { + "epoch": 0.02044217309187868, + "grad_norm": 0.4563744068145752, + "learning_rate": 9.740960467331899e-05, + "loss": 1.4833, + "step": 23 + }, + { + "epoch": 0.02133096322630819, + "grad_norm": 0.5045209527015686, + "learning_rate": 9.873179343530825e-05, + "loss": 1.5092, + "step": 24 + }, + { + "epoch": 0.022219753360737695, + "grad_norm": 0.3677147328853607, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 25 + }, + { + "epoch": 0.023108543495167205, + "grad_norm": 0.5037134289741516, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 26 + }, + { + "epoch": 0.02399733362959671, + "grad_norm": 0.5134053230285645, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 27 + }, + { + "epoch": 0.02488612376402622, + "grad_norm": 0.4475038945674896, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 28 + }, + { + "epoch": 0.025774913898455726, + "grad_norm": 0.4869753122329712, + "learning_rate": 0.0001, + "loss": 1.4236, + "step": 29 + }, + { + "epoch": 0.026663704032885236, + "grad_norm": 0.3995779752731323, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 30 + }, + { + "epoch": 0.027552494167314742, + "grad_norm": 0.43385857343673706, + "learning_rate": 0.0001, + "loss": 1.4692, + "step": 31 + }, + { + "epoch": 0.028441284301744252, + "grad_norm": 0.46782711148262024, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 32 + }, + { + "epoch": 0.029330074436173758, + "grad_norm": 0.5026598572731018, + "learning_rate": 0.0001, + "loss": 1.348, + "step": 33 + }, + { + "epoch": 0.030218864570603268, + "grad_norm": 0.3947531580924988, + "learning_rate": 0.0001, + "loss": 1.325, + "step": 34 + }, + { + "epoch": 0.031107654705032774, + "grad_norm": 0.4685068428516388, + "learning_rate": 0.0001, + "loss": 1.3592, + "step": 35 + }, + { + "epoch": 0.03199644483946228, + "grad_norm": 0.5283563733100891, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 36 + }, + { + "epoch": 0.03288523497389179, + "grad_norm": 0.48610830307006836, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 37 + }, + { + "epoch": 0.033774025108321296, + "grad_norm": 0.46293529868125916, + "learning_rate": 0.0001, + "loss": 1.3483, + "step": 38 + }, + { + "epoch": 0.034662815242750805, + "grad_norm": 0.5796862244606018, + "learning_rate": 0.0001, + "loss": 1.3338, + "step": 39 + }, + { + "epoch": 0.035551605377180315, + "grad_norm": 0.5200364589691162, + "learning_rate": 0.0001, + "loss": 1.3805, + "step": 40 + }, + { + "epoch": 0.036440395511609824, + "grad_norm": 0.4546230137348175, + "learning_rate": 0.0001, + "loss": 1.3318, + "step": 41 + }, + { + "epoch": 0.03732918564603933, + "grad_norm": 0.5082724094390869, + "learning_rate": 0.0001, + "loss": 1.2617, + "step": 42 + }, + { + "epoch": 0.03821797578046884, + "grad_norm": 0.4639153480529785, + "learning_rate": 0.0001, + "loss": 1.2998, + "step": 43 + }, + { + "epoch": 0.039106765914898346, + "grad_norm": 0.6027429699897766, + "learning_rate": 0.0001, + "loss": 1.3551, + "step": 44 + }, + { + "epoch": 0.039995556049327856, + "grad_norm": 0.4588245451450348, + "learning_rate": 0.0001, + "loss": 1.297, + "step": 45 + }, + { + "epoch": 0.04088434618375736, + "grad_norm": 0.49096405506134033, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 46 + }, + { + "epoch": 0.04177313631818687, + "grad_norm": 0.5192633867263794, + "learning_rate": 0.0001, + "loss": 1.3139, + "step": 47 + }, + { + "epoch": 0.04266192645261638, + "grad_norm": 0.5095855593681335, + "learning_rate": 0.0001, + "loss": 1.2607, + "step": 48 + }, + { + "epoch": 0.04355071658704588, + "grad_norm": 0.5086390376091003, + "learning_rate": 0.0001, + "loss": 1.294, + "step": 49 + }, + { + "epoch": 0.04443950672147539, + "grad_norm": 0.5870110988616943, + "learning_rate": 0.0001, + "loss": 1.3086, + "step": 50 + }, + { + "epoch": 0.0453282968559049, + "grad_norm": 0.5581070184707642, + "learning_rate": 0.0001, + "loss": 1.2506, + "step": 51 + }, + { + "epoch": 0.04621708699033441, + "grad_norm": 0.5818192362785339, + "learning_rate": 0.0001, + "loss": 1.3169, + "step": 52 + }, + { + "epoch": 0.04710587712476391, + "grad_norm": 0.612249493598938, + "learning_rate": 0.0001, + "loss": 1.2275, + "step": 53 + }, + { + "epoch": 0.04799466725919342, + "grad_norm": 0.712792158126831, + "learning_rate": 0.0001, + "loss": 1.2525, + "step": 54 + }, + { + "epoch": 0.04888345739362293, + "grad_norm": 0.6237956285476685, + "learning_rate": 0.0001, + "loss": 1.21, + "step": 55 + }, + { + "epoch": 0.04977224752805244, + "grad_norm": 0.5647529363632202, + "learning_rate": 0.0001, + "loss": 1.2615, + "step": 56 + }, + { + "epoch": 0.05066103766248194, + "grad_norm": 0.5913922786712646, + "learning_rate": 0.0001, + "loss": 1.227, + "step": 57 + }, + { + "epoch": 0.05154982779691145, + "grad_norm": 0.5439374446868896, + "learning_rate": 0.0001, + "loss": 1.2046, + "step": 58 + }, + { + "epoch": 0.05243861793134096, + "grad_norm": 0.4831898510456085, + "learning_rate": 0.0001, + "loss": 1.2106, + "step": 59 + }, + { + "epoch": 0.05332740806577047, + "grad_norm": 0.47741156816482544, + "learning_rate": 0.0001, + "loss": 1.2144, + "step": 60 + }, + { + "epoch": 0.054216198200199975, + "grad_norm": 0.4893709123134613, + "learning_rate": 0.0001, + "loss": 1.1871, + "step": 61 + }, + { + "epoch": 0.055104988334629484, + "grad_norm": 0.5424261689186096, + "learning_rate": 0.0001, + "loss": 1.238, + "step": 62 + }, + { + "epoch": 0.055993778469058994, + "grad_norm": 0.47771304845809937, + "learning_rate": 0.0001, + "loss": 1.2568, + "step": 63 + }, + { + "epoch": 0.056882568603488504, + "grad_norm": 0.6287993788719177, + "learning_rate": 0.0001, + "loss": 1.2713, + "step": 64 + }, + { + "epoch": 0.057771358737918006, + "grad_norm": 0.5192376971244812, + "learning_rate": 0.0001, + "loss": 1.1294, + "step": 65 + }, + { + "epoch": 0.058660148872347516, + "grad_norm": 0.5453503131866455, + "learning_rate": 0.0001, + "loss": 1.1706, + "step": 66 + }, + { + "epoch": 0.059548939006777026, + "grad_norm": 0.44728267192840576, + "learning_rate": 0.0001, + "loss": 1.2275, + "step": 67 + }, + { + "epoch": 0.060437729141206535, + "grad_norm": 0.561209499835968, + "learning_rate": 0.0001, + "loss": 1.1971, + "step": 68 + }, + { + "epoch": 0.06132651927563604, + "grad_norm": 0.518390417098999, + "learning_rate": 0.0001, + "loss": 1.2264, + "step": 69 + }, + { + "epoch": 0.06221530941006555, + "grad_norm": 0.5499495267868042, + "learning_rate": 0.0001, + "loss": 1.1627, + "step": 70 + }, + { + "epoch": 0.06310409954449506, + "grad_norm": 0.46714553236961365, + "learning_rate": 0.0001, + "loss": 1.1767, + "step": 71 + }, + { + "epoch": 0.06399288967892457, + "grad_norm": 0.5339136123657227, + "learning_rate": 0.0001, + "loss": 1.129, + "step": 72 + }, + { + "epoch": 0.06488167981335408, + "grad_norm": 0.6434009671211243, + "learning_rate": 0.0001, + "loss": 1.213, + "step": 73 + }, + { + "epoch": 0.06577046994778359, + "grad_norm": 0.6940732002258301, + "learning_rate": 0.0001, + "loss": 1.1562, + "step": 74 + }, + { + "epoch": 0.06665926008221308, + "grad_norm": 0.5630552768707275, + "learning_rate": 0.0001, + "loss": 1.208, + "step": 75 + }, + { + "epoch": 0.06754805021664259, + "grad_norm": 0.561046838760376, + "learning_rate": 0.0001, + "loss": 1.168, + "step": 76 + }, + { + "epoch": 0.0684368403510721, + "grad_norm": 0.4617985486984253, + "learning_rate": 0.0001, + "loss": 1.1431, + "step": 77 + }, + { + "epoch": 0.06932563048550161, + "grad_norm": 0.5184434652328491, + "learning_rate": 0.0001, + "loss": 1.1934, + "step": 78 + }, + { + "epoch": 0.07021442061993112, + "grad_norm": 0.6089707016944885, + "learning_rate": 0.0001, + "loss": 1.2368, + "step": 79 + }, + { + "epoch": 0.07110321075436063, + "grad_norm": 0.5018100738525391, + "learning_rate": 0.0001, + "loss": 1.1722, + "step": 80 + }, + { + "epoch": 0.07199200088879014, + "grad_norm": 0.49356648325920105, + "learning_rate": 0.0001, + "loss": 1.1999, + "step": 81 + }, + { + "epoch": 0.07288079102321965, + "grad_norm": 0.544902503490448, + "learning_rate": 0.0001, + "loss": 1.1496, + "step": 82 + }, + { + "epoch": 0.07376958115764914, + "grad_norm": 0.534565806388855, + "learning_rate": 0.0001, + "loss": 1.0753, + "step": 83 + }, + { + "epoch": 0.07465837129207865, + "grad_norm": 0.6407784223556519, + "learning_rate": 0.0001, + "loss": 1.1728, + "step": 84 + }, + { + "epoch": 0.07554716142650816, + "grad_norm": 0.619192361831665, + "learning_rate": 0.0001, + "loss": 1.179, + "step": 85 + }, + { + "epoch": 0.07643595156093767, + "grad_norm": 0.5248496532440186, + "learning_rate": 0.0001, + "loss": 1.1668, + "step": 86 + }, + { + "epoch": 0.07732474169536718, + "grad_norm": 0.5901935696601868, + "learning_rate": 0.0001, + "loss": 1.1634, + "step": 87 + }, + { + "epoch": 0.07821353182979669, + "grad_norm": 0.5717945694923401, + "learning_rate": 0.0001, + "loss": 1.046, + "step": 88 + }, + { + "epoch": 0.0791023219642262, + "grad_norm": 0.5313799381256104, + "learning_rate": 0.0001, + "loss": 1.1541, + "step": 89 + }, + { + "epoch": 0.07999111209865571, + "grad_norm": 0.5816580057144165, + "learning_rate": 0.0001, + "loss": 1.1505, + "step": 90 + }, + { + "epoch": 0.08087990223308521, + "grad_norm": 0.6545852422714233, + "learning_rate": 0.0001, + "loss": 1.1687, + "step": 91 + }, + { + "epoch": 0.08176869236751472, + "grad_norm": 0.5649644136428833, + "learning_rate": 0.0001, + "loss": 1.2657, + "step": 92 + }, + { + "epoch": 0.08265748250194423, + "grad_norm": 0.5403311848640442, + "learning_rate": 0.0001, + "loss": 1.1041, + "step": 93 + }, + { + "epoch": 0.08354627263637374, + "grad_norm": 0.6377498507499695, + "learning_rate": 0.0001, + "loss": 1.1621, + "step": 94 + }, + { + "epoch": 0.08443506277080325, + "grad_norm": 0.5470308065414429, + "learning_rate": 0.0001, + "loss": 1.0845, + "step": 95 + }, + { + "epoch": 0.08532385290523276, + "grad_norm": 0.4540124833583832, + "learning_rate": 0.0001, + "loss": 1.1723, + "step": 96 + }, + { + "epoch": 0.08621264303966227, + "grad_norm": 0.49754512310028076, + "learning_rate": 0.0001, + "loss": 1.165, + "step": 97 + }, + { + "epoch": 0.08710143317409176, + "grad_norm": 0.5652042031288147, + "learning_rate": 0.0001, + "loss": 1.0729, + "step": 98 + }, + { + "epoch": 0.08799022330852127, + "grad_norm": 0.5990537405014038, + "learning_rate": 0.0001, + "loss": 1.1192, + "step": 99 + }, + { + "epoch": 0.08887901344295078, + "grad_norm": 0.49589407444000244, + "learning_rate": 0.0001, + "loss": 1.0975, + "step": 100 + }, + { + "epoch": 0.08976780357738029, + "grad_norm": 0.4846036732196808, + "learning_rate": 0.0001, + "loss": 1.1206, + "step": 101 + }, + { + "epoch": 0.0906565937118098, + "grad_norm": 0.5197208523750305, + "learning_rate": 0.0001, + "loss": 1.1676, + "step": 102 + }, + { + "epoch": 0.09154538384623931, + "grad_norm": 0.500135600566864, + "learning_rate": 0.0001, + "loss": 1.0938, + "step": 103 + }, + { + "epoch": 0.09243417398066882, + "grad_norm": 0.48206326365470886, + "learning_rate": 0.0001, + "loss": 1.1561, + "step": 104 + }, + { + "epoch": 0.09332296411509833, + "grad_norm": 0.5363126397132874, + "learning_rate": 0.0001, + "loss": 1.1397, + "step": 105 + }, + { + "epoch": 0.09421175424952782, + "grad_norm": 0.5342210531234741, + "learning_rate": 0.0001, + "loss": 1.1213, + "step": 106 + }, + { + "epoch": 0.09510054438395733, + "grad_norm": 0.47415241599082947, + "learning_rate": 0.0001, + "loss": 1.1009, + "step": 107 + }, + { + "epoch": 0.09598933451838684, + "grad_norm": 0.5735252499580383, + "learning_rate": 0.0001, + "loss": 1.0854, + "step": 108 + }, + { + "epoch": 0.09687812465281635, + "grad_norm": 0.6142700910568237, + "learning_rate": 0.0001, + "loss": 1.2542, + "step": 109 + }, + { + "epoch": 0.09776691478724586, + "grad_norm": 0.5673418045043945, + "learning_rate": 0.0001, + "loss": 1.0388, + "step": 110 + }, + { + "epoch": 0.09865570492167537, + "grad_norm": 0.5416164398193359, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 111 + }, + { + "epoch": 0.09954449505610488, + "grad_norm": 0.48338207602500916, + "learning_rate": 0.0001, + "loss": 1.1465, + "step": 112 + }, + { + "epoch": 0.10043328519053439, + "grad_norm": 0.49852466583251953, + "learning_rate": 0.0001, + "loss": 1.1088, + "step": 113 + }, + { + "epoch": 0.10132207532496389, + "grad_norm": 0.5490244626998901, + "learning_rate": 0.0001, + "loss": 1.0804, + "step": 114 + }, + { + "epoch": 0.1022108654593934, + "grad_norm": 7.056009769439697, + "learning_rate": 0.0001, + "loss": 1.094, + "step": 115 + }, + { + "epoch": 0.1030996555938229, + "grad_norm": 0.5697766542434692, + "learning_rate": 0.0001, + "loss": 1.0271, + "step": 116 + }, + { + "epoch": 0.10398844572825242, + "grad_norm": 0.5138140320777893, + "learning_rate": 0.0001, + "loss": 1.0057, + "step": 117 + }, + { + "epoch": 0.10487723586268193, + "grad_norm": 0.4911749064922333, + "learning_rate": 0.0001, + "loss": 1.1375, + "step": 118 + }, + { + "epoch": 0.10576602599711143, + "grad_norm": 0.45733195543289185, + "learning_rate": 0.0001, + "loss": 1.0889, + "step": 119 + }, + { + "epoch": 0.10665481613154094, + "grad_norm": 0.4950946569442749, + "learning_rate": 0.0001, + "loss": 1.1143, + "step": 120 + }, + { + "epoch": 0.10754360626597045, + "grad_norm": 0.5536718964576721, + "learning_rate": 0.0001, + "loss": 1.1188, + "step": 121 + }, + { + "epoch": 0.10843239640039995, + "grad_norm": 0.5105839967727661, + "learning_rate": 0.0001, + "loss": 1.0501, + "step": 122 + }, + { + "epoch": 0.10932118653482946, + "grad_norm": 0.5183722972869873, + "learning_rate": 0.0001, + "loss": 1.0766, + "step": 123 + }, + { + "epoch": 0.11020997666925897, + "grad_norm": 0.5385981798171997, + "learning_rate": 0.0001, + "loss": 1.0419, + "step": 124 + }, + { + "epoch": 0.11109876680368848, + "grad_norm": 0.6438923478126526, + "learning_rate": 0.0001, + "loss": 1.1333, + "step": 125 + }, + { + "epoch": 0.11198755693811799, + "grad_norm": 0.4993588328361511, + "learning_rate": 0.0001, + "loss": 1.0504, + "step": 126 + }, + { + "epoch": 0.1128763470725475, + "grad_norm": 0.5645109415054321, + "learning_rate": 0.0001, + "loss": 1.0817, + "step": 127 + }, + { + "epoch": 0.11376513720697701, + "grad_norm": 0.5064727067947388, + "learning_rate": 0.0001, + "loss": 1.1588, + "step": 128 + }, + { + "epoch": 0.11465392734140652, + "grad_norm": 0.4977114498615265, + "learning_rate": 0.0001, + "loss": 1.0903, + "step": 129 + }, + { + "epoch": 0.11554271747583601, + "grad_norm": 0.5678680539131165, + "learning_rate": 0.0001, + "loss": 1.1107, + "step": 130 + }, + { + "epoch": 0.11643150761026552, + "grad_norm": 0.5574091076850891, + "learning_rate": 0.0001, + "loss": 1.1081, + "step": 131 + }, + { + "epoch": 0.11732029774469503, + "grad_norm": 0.5073116421699524, + "learning_rate": 0.0001, + "loss": 1.0487, + "step": 132 + }, + { + "epoch": 0.11820908787912454, + "grad_norm": 0.4510885179042816, + "learning_rate": 0.0001, + "loss": 1.0262, + "step": 133 + }, + { + "epoch": 0.11909787801355405, + "grad_norm": 0.5130484104156494, + "learning_rate": 0.0001, + "loss": 1.0612, + "step": 134 + }, + { + "epoch": 0.11998666814798356, + "grad_norm": 0.55837082862854, + "learning_rate": 0.0001, + "loss": 1.1003, + "step": 135 + }, + { + "epoch": 0.12087545828241307, + "grad_norm": 0.563966691493988, + "learning_rate": 0.0001, + "loss": 1.1106, + "step": 136 + }, + { + "epoch": 0.12176424841684258, + "grad_norm": 0.495710164308548, + "learning_rate": 0.0001, + "loss": 1.077, + "step": 137 + }, + { + "epoch": 0.12265303855127208, + "grad_norm": 0.4716346561908722, + "learning_rate": 0.0001, + "loss": 1.0224, + "step": 138 + }, + { + "epoch": 0.12354182868570159, + "grad_norm": 0.5098469257354736, + "learning_rate": 0.0001, + "loss": 1.0821, + "step": 139 + }, + { + "epoch": 0.1244306188201311, + "grad_norm": 0.4919681251049042, + "learning_rate": 0.0001, + "loss": 1.0588, + "step": 140 + }, + { + "epoch": 0.12531940895456062, + "grad_norm": 0.4506363570690155, + "learning_rate": 0.0001, + "loss": 1.0579, + "step": 141 + }, + { + "epoch": 0.12620819908899011, + "grad_norm": 0.6020563244819641, + "learning_rate": 0.0001, + "loss": 1.0176, + "step": 142 + }, + { + "epoch": 0.1270969892234196, + "grad_norm": 0.6274687051773071, + "learning_rate": 0.0001, + "loss": 1.2257, + "step": 143 + }, + { + "epoch": 0.12798577935784913, + "grad_norm": 0.49272266030311584, + "learning_rate": 0.0001, + "loss": 1.0729, + "step": 144 + }, + { + "epoch": 0.12887456949227863, + "grad_norm": 0.5412996411323547, + "learning_rate": 0.0001, + "loss": 1.1025, + "step": 145 + }, + { + "epoch": 0.12976335962670815, + "grad_norm": 0.5278550386428833, + "learning_rate": 0.0001, + "loss": 1.136, + "step": 146 + }, + { + "epoch": 0.13065214976113765, + "grad_norm": 0.5578097701072693, + "learning_rate": 0.0001, + "loss": 1.0407, + "step": 147 + }, + { + "epoch": 0.13154093989556717, + "grad_norm": 0.5495776534080505, + "learning_rate": 0.0001, + "loss": 1.084, + "step": 148 + }, + { + "epoch": 0.13242973002999667, + "grad_norm": 0.6431498527526855, + "learning_rate": 0.0001, + "loss": 1.0953, + "step": 149 + }, + { + "epoch": 0.13331852016442616, + "grad_norm": 0.5592057108879089, + "learning_rate": 0.0001, + "loss": 1.0847, + "step": 150 + }, + { + "epoch": 0.1342073102988557, + "grad_norm": 0.6379444599151611, + "learning_rate": 0.0001, + "loss": 1.0631, + "step": 151 + }, + { + "epoch": 0.13509610043328518, + "grad_norm": 0.5859350562095642, + "learning_rate": 0.0001, + "loss": 1.069, + "step": 152 + }, + { + "epoch": 0.1359848905677147, + "grad_norm": 0.5171725153923035, + "learning_rate": 0.0001, + "loss": 1.0135, + "step": 153 + }, + { + "epoch": 0.1368736807021442, + "grad_norm": 0.4534023702144623, + "learning_rate": 0.0001, + "loss": 1.0122, + "step": 154 + }, + { + "epoch": 0.13776247083657372, + "grad_norm": 0.49811944365501404, + "learning_rate": 0.0001, + "loss": 1.0196, + "step": 155 + }, + { + "epoch": 0.13865126097100322, + "grad_norm": 0.46456894278526306, + "learning_rate": 0.0001, + "loss": 1.0723, + "step": 156 + }, + { + "epoch": 0.13954005110543272, + "grad_norm": 0.5349675416946411, + "learning_rate": 0.0001, + "loss": 1.1131, + "step": 157 + }, + { + "epoch": 0.14042884123986224, + "grad_norm": 0.466975599527359, + "learning_rate": 0.0001, + "loss": 1.0752, + "step": 158 + }, + { + "epoch": 0.14131763137429174, + "grad_norm": 0.5469770431518555, + "learning_rate": 0.0001, + "loss": 1.0607, + "step": 159 + }, + { + "epoch": 0.14220642150872126, + "grad_norm": 0.5208465456962585, + "learning_rate": 0.0001, + "loss": 0.9892, + "step": 160 + }, + { + "epoch": 0.14309521164315075, + "grad_norm": 0.543846845626831, + "learning_rate": 0.0001, + "loss": 1.0861, + "step": 161 + }, + { + "epoch": 0.14398400177758028, + "grad_norm": 0.5700931549072266, + "learning_rate": 0.0001, + "loss": 0.9992, + "step": 162 + }, + { + "epoch": 0.14487279191200977, + "grad_norm": 0.5593817830085754, + "learning_rate": 0.0001, + "loss": 1.0803, + "step": 163 + }, + { + "epoch": 0.1457615820464393, + "grad_norm": 0.5960149765014648, + "learning_rate": 0.0001, + "loss": 1.1054, + "step": 164 + }, + { + "epoch": 0.1466503721808688, + "grad_norm": 0.5227658152580261, + "learning_rate": 0.0001, + "loss": 1.1041, + "step": 165 + }, + { + "epoch": 0.1475391623152983, + "grad_norm": 0.5318516492843628, + "learning_rate": 0.0001, + "loss": 1.1705, + "step": 166 + }, + { + "epoch": 0.1484279524497278, + "grad_norm": 0.49774935841560364, + "learning_rate": 0.0001, + "loss": 1.0942, + "step": 167 + }, + { + "epoch": 0.1493167425841573, + "grad_norm": 0.5588480234146118, + "learning_rate": 0.0001, + "loss": 1.1274, + "step": 168 + }, + { + "epoch": 0.15020553271858683, + "grad_norm": 0.5155318379402161, + "learning_rate": 0.0001, + "loss": 1.0727, + "step": 169 + }, + { + "epoch": 0.15109432285301633, + "grad_norm": 0.5063637495040894, + "learning_rate": 0.0001, + "loss": 1.1389, + "step": 170 + }, + { + "epoch": 0.15198311298744585, + "grad_norm": 0.5275964736938477, + "learning_rate": 0.0001, + "loss": 1.0594, + "step": 171 + }, + { + "epoch": 0.15287190312187535, + "grad_norm": 0.48888012766838074, + "learning_rate": 0.0001, + "loss": 1.0238, + "step": 172 + }, + { + "epoch": 0.15376069325630484, + "grad_norm": 0.6187731623649597, + "learning_rate": 0.0001, + "loss": 1.0535, + "step": 173 + }, + { + "epoch": 0.15464948339073437, + "grad_norm": 0.53126460313797, + "learning_rate": 0.0001, + "loss": 1.026, + "step": 174 + }, + { + "epoch": 0.15553827352516386, + "grad_norm": 0.5398485064506531, + "learning_rate": 0.0001, + "loss": 1.069, + "step": 175 + }, + { + "epoch": 0.15642706365959338, + "grad_norm": 0.5406534075737, + "learning_rate": 0.0001, + "loss": 1.0529, + "step": 176 + }, + { + "epoch": 0.15731585379402288, + "grad_norm": 0.48404720425605774, + "learning_rate": 0.0001, + "loss": 1.0038, + "step": 177 + }, + { + "epoch": 0.1582046439284524, + "grad_norm": 0.5885335206985474, + "learning_rate": 0.0001, + "loss": 1.0029, + "step": 178 + }, + { + "epoch": 0.1590934340628819, + "grad_norm": 0.5530521869659424, + "learning_rate": 0.0001, + "loss": 1.1171, + "step": 179 + }, + { + "epoch": 0.15998222419731142, + "grad_norm": 0.5618231892585754, + "learning_rate": 0.0001, + "loss": 1.1232, + "step": 180 + }, + { + "epoch": 0.16087101433174092, + "grad_norm": 0.5092931389808655, + "learning_rate": 0.0001, + "loss": 1.0364, + "step": 181 + }, + { + "epoch": 0.16175980446617041, + "grad_norm": 0.44638335704803467, + "learning_rate": 0.0001, + "loss": 1.0221, + "step": 182 + }, + { + "epoch": 0.16264859460059994, + "grad_norm": 0.4906376004219055, + "learning_rate": 0.0001, + "loss": 1.023, + "step": 183 + }, + { + "epoch": 0.16353738473502943, + "grad_norm": 0.5148441195487976, + "learning_rate": 0.0001, + "loss": 1.0451, + "step": 184 + }, + { + "epoch": 0.16442617486945896, + "grad_norm": 0.5504122972488403, + "learning_rate": 0.0001, + "loss": 1.1301, + "step": 185 + }, + { + "epoch": 0.16531496500388845, + "grad_norm": 0.5042161345481873, + "learning_rate": 0.0001, + "loss": 1.0545, + "step": 186 + }, + { + "epoch": 0.16620375513831798, + "grad_norm": 0.4888613820075989, + "learning_rate": 0.0001, + "loss": 1.0766, + "step": 187 + }, + { + "epoch": 0.16709254527274747, + "grad_norm": 0.46498018503189087, + "learning_rate": 0.0001, + "loss": 1.0935, + "step": 188 + }, + { + "epoch": 0.16798133540717697, + "grad_norm": 0.4573257863521576, + "learning_rate": 0.0001, + "loss": 1.0511, + "step": 189 + }, + { + "epoch": 0.1688701255416065, + "grad_norm": 0.4621419608592987, + "learning_rate": 0.0001, + "loss": 1.026, + "step": 190 + }, + { + "epoch": 0.169758915676036, + "grad_norm": 0.5240814685821533, + "learning_rate": 0.0001, + "loss": 0.9897, + "step": 191 + }, + { + "epoch": 0.1706477058104655, + "grad_norm": 0.4366868734359741, + "learning_rate": 0.0001, + "loss": 1.0274, + "step": 192 + }, + { + "epoch": 0.171536495944895, + "grad_norm": 0.574884831905365, + "learning_rate": 0.0001, + "loss": 1.0192, + "step": 193 + }, + { + "epoch": 0.17242528607932453, + "grad_norm": 0.5254986882209778, + "learning_rate": 0.0001, + "loss": 1.0043, + "step": 194 + }, + { + "epoch": 0.17331407621375403, + "grad_norm": 0.6957873106002808, + "learning_rate": 0.0001, + "loss": 1.0281, + "step": 195 + }, + { + "epoch": 0.17420286634818352, + "grad_norm": 0.5131996273994446, + "learning_rate": 0.0001, + "loss": 1.0363, + "step": 196 + }, + { + "epoch": 0.17509165648261305, + "grad_norm": 0.5054270029067993, + "learning_rate": 0.0001, + "loss": 1.0708, + "step": 197 + }, + { + "epoch": 0.17598044661704254, + "grad_norm": 0.5029573440551758, + "learning_rate": 0.0001, + "loss": 1.0562, + "step": 198 + }, + { + "epoch": 0.17686923675147206, + "grad_norm": 0.5139032602310181, + "learning_rate": 0.0001, + "loss": 0.995, + "step": 199 + }, + { + "epoch": 0.17775802688590156, + "grad_norm": 0.4500792324542999, + "learning_rate": 0.0001, + "loss": 1.1012, + "step": 200 + }, + { + "epoch": 0.17864681702033108, + "grad_norm": 0.5789426565170288, + "learning_rate": 0.0001, + "loss": 1.1203, + "step": 201 + }, + { + "epoch": 0.17953560715476058, + "grad_norm": 0.48242732882499695, + "learning_rate": 0.0001, + "loss": 1.0759, + "step": 202 + }, + { + "epoch": 0.1804243972891901, + "grad_norm": 0.5667662024497986, + "learning_rate": 0.0001, + "loss": 1.0595, + "step": 203 + }, + { + "epoch": 0.1813131874236196, + "grad_norm": 0.6725609302520752, + "learning_rate": 0.0001, + "loss": 1.0049, + "step": 204 + }, + { + "epoch": 0.1822019775580491, + "grad_norm": 0.5460247993469238, + "learning_rate": 0.0001, + "loss": 1.0646, + "step": 205 + }, + { + "epoch": 0.18309076769247862, + "grad_norm": 0.49915874004364014, + "learning_rate": 0.0001, + "loss": 1.1236, + "step": 206 + }, + { + "epoch": 0.1839795578269081, + "grad_norm": 0.522266149520874, + "learning_rate": 0.0001, + "loss": 1.0688, + "step": 207 + }, + { + "epoch": 0.18486834796133764, + "grad_norm": 0.4875168800354004, + "learning_rate": 0.0001, + "loss": 1.0596, + "step": 208 + }, + { + "epoch": 0.18575713809576713, + "grad_norm": 0.5118414759635925, + "learning_rate": 0.0001, + "loss": 1.0139, + "step": 209 + }, + { + "epoch": 0.18664592823019666, + "grad_norm": 0.5071278214454651, + "learning_rate": 0.0001, + "loss": 1.1415, + "step": 210 + }, + { + "epoch": 0.18753471836462615, + "grad_norm": 0.5847178101539612, + "learning_rate": 0.0001, + "loss": 1.1495, + "step": 211 + }, + { + "epoch": 0.18842350849905565, + "grad_norm": 0.45916759967803955, + "learning_rate": 0.0001, + "loss": 1.0898, + "step": 212 + }, + { + "epoch": 0.18931229863348517, + "grad_norm": 0.4967547655105591, + "learning_rate": 0.0001, + "loss": 0.946, + "step": 213 + }, + { + "epoch": 0.19020108876791467, + "grad_norm": 0.48724737763404846, + "learning_rate": 0.0001, + "loss": 1.0353, + "step": 214 + }, + { + "epoch": 0.1910898789023442, + "grad_norm": 0.4389554262161255, + "learning_rate": 0.0001, + "loss": 1.0129, + "step": 215 + }, + { + "epoch": 0.19197866903677369, + "grad_norm": 0.4678506553173065, + "learning_rate": 0.0001, + "loss": 1.0903, + "step": 216 + }, + { + "epoch": 0.1928674591712032, + "grad_norm": 0.5293782353401184, + "learning_rate": 0.0001, + "loss": 1.0739, + "step": 217 + }, + { + "epoch": 0.1937562493056327, + "grad_norm": 0.45177680253982544, + "learning_rate": 0.0001, + "loss": 1.1057, + "step": 218 + }, + { + "epoch": 0.19464503944006223, + "grad_norm": 0.4647086560726166, + "learning_rate": 0.0001, + "loss": 1.0894, + "step": 219 + }, + { + "epoch": 0.19553382957449172, + "grad_norm": 0.5464223027229309, + "learning_rate": 0.0001, + "loss": 1.0606, + "step": 220 + }, + { + "epoch": 0.19642261970892122, + "grad_norm": 0.5713245868682861, + "learning_rate": 0.0001, + "loss": 1.0621, + "step": 221 + }, + { + "epoch": 0.19731140984335074, + "grad_norm": 0.558101236820221, + "learning_rate": 0.0001, + "loss": 1.0829, + "step": 222 + }, + { + "epoch": 0.19820019997778024, + "grad_norm": 0.775971531867981, + "learning_rate": 0.0001, + "loss": 0.9982, + "step": 223 + }, + { + "epoch": 0.19908899011220976, + "grad_norm": 0.4644327163696289, + "learning_rate": 0.0001, + "loss": 1.0883, + "step": 224 + }, + { + "epoch": 0.19997778024663926, + "grad_norm": 0.4939100444316864, + "learning_rate": 0.0001, + "loss": 1.0367, + "step": 225 + }, + { + "epoch": 0.20086657038106878, + "grad_norm": 0.43425729870796204, + "learning_rate": 0.0001, + "loss": 1.0038, + "step": 226 + }, + { + "epoch": 0.20175536051549828, + "grad_norm": 0.508567750453949, + "learning_rate": 0.0001, + "loss": 1.1134, + "step": 227 + }, + { + "epoch": 0.20264415064992777, + "grad_norm": 0.397948294878006, + "learning_rate": 0.0001, + "loss": 0.9837, + "step": 228 + }, + { + "epoch": 0.2035329407843573, + "grad_norm": 0.5290727019309998, + "learning_rate": 0.0001, + "loss": 1.0965, + "step": 229 + }, + { + "epoch": 0.2044217309187868, + "grad_norm": 0.5163105130195618, + "learning_rate": 0.0001, + "loss": 0.9927, + "step": 230 + }, + { + "epoch": 0.20531052105321632, + "grad_norm": 0.4882635474205017, + "learning_rate": 0.0001, + "loss": 1.0316, + "step": 231 + }, + { + "epoch": 0.2061993111876458, + "grad_norm": 0.471646785736084, + "learning_rate": 0.0001, + "loss": 1.0645, + "step": 232 + }, + { + "epoch": 0.20708810132207534, + "grad_norm": 0.5159053206443787, + "learning_rate": 0.0001, + "loss": 1.0898, + "step": 233 + }, + { + "epoch": 0.20797689145650483, + "grad_norm": 0.5597699880599976, + "learning_rate": 0.0001, + "loss": 1.0726, + "step": 234 + }, + { + "epoch": 0.20886568159093435, + "grad_norm": 0.463168203830719, + "learning_rate": 0.0001, + "loss": 1.0349, + "step": 235 + }, + { + "epoch": 0.20975447172536385, + "grad_norm": 0.47213372588157654, + "learning_rate": 0.0001, + "loss": 1.08, + "step": 236 + }, + { + "epoch": 0.21064326185979335, + "grad_norm": 0.46547219157218933, + "learning_rate": 0.0001, + "loss": 1.084, + "step": 237 + }, + { + "epoch": 0.21153205199422287, + "grad_norm": 0.4608129858970642, + "learning_rate": 0.0001, + "loss": 1.0441, + "step": 238 + }, + { + "epoch": 0.21242084212865237, + "grad_norm": 0.5103969573974609, + "learning_rate": 0.0001, + "loss": 1.0845, + "step": 239 + }, + { + "epoch": 0.2133096322630819, + "grad_norm": 0.48750364780426025, + "learning_rate": 0.0001, + "loss": 1.01, + "step": 240 + }, + { + "epoch": 0.21419842239751138, + "grad_norm": 0.4955364465713501, + "learning_rate": 0.0001, + "loss": 1.0824, + "step": 241 + }, + { + "epoch": 0.2150872125319409, + "grad_norm": 0.4669419527053833, + "learning_rate": 0.0001, + "loss": 1.0061, + "step": 242 + }, + { + "epoch": 0.2159760026663704, + "grad_norm": 0.4938861131668091, + "learning_rate": 0.0001, + "loss": 0.9739, + "step": 243 + }, + { + "epoch": 0.2168647928007999, + "grad_norm": 0.5369840860366821, + "learning_rate": 0.0001, + "loss": 1.1085, + "step": 244 + }, + { + "epoch": 0.21775358293522942, + "grad_norm": 0.5239328742027283, + "learning_rate": 0.0001, + "loss": 0.9869, + "step": 245 + }, + { + "epoch": 0.21864237306965892, + "grad_norm": 0.518187403678894, + "learning_rate": 0.0001, + "loss": 1.046, + "step": 246 + }, + { + "epoch": 0.21953116320408844, + "grad_norm": 0.44194117188453674, + "learning_rate": 0.0001, + "loss": 1.0234, + "step": 247 + }, + { + "epoch": 0.22041995333851794, + "grad_norm": 0.44847285747528076, + "learning_rate": 0.0001, + "loss": 1.0692, + "step": 248 + }, + { + "epoch": 0.22130874347294746, + "grad_norm": 0.4702758491039276, + "learning_rate": 0.0001, + "loss": 1.1231, + "step": 249 + }, + { + "epoch": 0.22219753360737696, + "grad_norm": 0.43607068061828613, + "learning_rate": 0.0001, + "loss": 1.0197, + "step": 250 + }, + { + "epoch": 0.22308632374180645, + "grad_norm": 0.5397356748580933, + "learning_rate": 0.0001, + "loss": 1.0527, + "step": 251 + }, + { + "epoch": 0.22397511387623598, + "grad_norm": 0.5345639586448669, + "learning_rate": 0.0001, + "loss": 1.0119, + "step": 252 + }, + { + "epoch": 0.22486390401066547, + "grad_norm": 0.45763713121414185, + "learning_rate": 0.0001, + "loss": 1.0296, + "step": 253 + }, + { + "epoch": 0.225752694145095, + "grad_norm": 0.49265143275260925, + "learning_rate": 0.0001, + "loss": 1.0171, + "step": 254 + }, + { + "epoch": 0.2266414842795245, + "grad_norm": 0.5668004751205444, + "learning_rate": 0.0001, + "loss": 1.0085, + "step": 255 + }, + { + "epoch": 0.22753027441395401, + "grad_norm": 0.5550284385681152, + "learning_rate": 0.0001, + "loss": 1.0006, + "step": 256 + }, + { + "epoch": 0.2284190645483835, + "grad_norm": 0.45947736501693726, + "learning_rate": 0.0001, + "loss": 1.0089, + "step": 257 + }, + { + "epoch": 0.22930785468281303, + "grad_norm": 0.5964300632476807, + "learning_rate": 0.0001, + "loss": 1.0854, + "step": 258 + }, + { + "epoch": 0.23019664481724253, + "grad_norm": 0.5322341322898865, + "learning_rate": 0.0001, + "loss": 1.0521, + "step": 259 + }, + { + "epoch": 0.23108543495167203, + "grad_norm": 0.48153069615364075, + "learning_rate": 0.0001, + "loss": 0.9444, + "step": 260 + }, + { + "epoch": 0.23197422508610155, + "grad_norm": 0.45373499393463135, + "learning_rate": 0.0001, + "loss": 1.047, + "step": 261 + }, + { + "epoch": 0.23286301522053104, + "grad_norm": 0.43222519755363464, + "learning_rate": 0.0001, + "loss": 0.973, + "step": 262 + }, + { + "epoch": 0.23375180535496057, + "grad_norm": 0.5555695295333862, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 263 + }, + { + "epoch": 0.23464059548939006, + "grad_norm": 0.47663983702659607, + "learning_rate": 0.0001, + "loss": 1.0688, + "step": 264 + }, + { + "epoch": 0.2355293856238196, + "grad_norm": 0.5549951791763306, + "learning_rate": 0.0001, + "loss": 1.0433, + "step": 265 + }, + { + "epoch": 0.23641817575824908, + "grad_norm": 0.4606119990348816, + "learning_rate": 0.0001, + "loss": 0.9985, + "step": 266 + }, + { + "epoch": 0.23730696589267858, + "grad_norm": 0.5273025631904602, + "learning_rate": 0.0001, + "loss": 1.018, + "step": 267 + }, + { + "epoch": 0.2381957560271081, + "grad_norm": 0.49982166290283203, + "learning_rate": 0.0001, + "loss": 0.9592, + "step": 268 + }, + { + "epoch": 0.2390845461615376, + "grad_norm": 0.5359534621238708, + "learning_rate": 0.0001, + "loss": 1.0568, + "step": 269 + }, + { + "epoch": 0.23997333629596712, + "grad_norm": 0.48276546597480774, + "learning_rate": 0.0001, + "loss": 1.0315, + "step": 270 + }, + { + "epoch": 0.24086212643039662, + "grad_norm": 0.48728588223457336, + "learning_rate": 0.0001, + "loss": 1.0827, + "step": 271 + }, + { + "epoch": 0.24175091656482614, + "grad_norm": 2.621769428253174, + "learning_rate": 0.0001, + "loss": 0.997, + "step": 272 + }, + { + "epoch": 0.24263970669925564, + "grad_norm": 0.5768371820449829, + "learning_rate": 0.0001, + "loss": 0.9872, + "step": 273 + }, + { + "epoch": 0.24352849683368516, + "grad_norm": 0.5989674925804138, + "learning_rate": 0.0001, + "loss": 0.9232, + "step": 274 + }, + { + "epoch": 0.24441728696811466, + "grad_norm": 0.5455823540687561, + "learning_rate": 0.0001, + "loss": 1.0062, + "step": 275 + }, + { + "epoch": 0.24530607710254415, + "grad_norm": 0.5664008855819702, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 276 + }, + { + "epoch": 0.24619486723697367, + "grad_norm": 0.5294925570487976, + "learning_rate": 0.0001, + "loss": 1.0285, + "step": 277 + }, + { + "epoch": 0.24708365737140317, + "grad_norm": 0.5203514099121094, + "learning_rate": 0.0001, + "loss": 1.0388, + "step": 278 + }, + { + "epoch": 0.2479724475058327, + "grad_norm": 0.5114679336547852, + "learning_rate": 0.0001, + "loss": 1.0388, + "step": 279 + }, + { + "epoch": 0.2488612376402622, + "grad_norm": 0.48817357420921326, + "learning_rate": 0.0001, + "loss": 0.9768, + "step": 280 + }, + { + "epoch": 0.2497500277746917, + "grad_norm": 0.4958358108997345, + "learning_rate": 0.0001, + "loss": 1.0239, + "step": 281 + }, + { + "epoch": 0.25063881790912124, + "grad_norm": 0.4603129029273987, + "learning_rate": 0.0001, + "loss": 1.0371, + "step": 282 + }, + { + "epoch": 0.25152760804355073, + "grad_norm": 0.48228880763053894, + "learning_rate": 0.0001, + "loss": 1.04, + "step": 283 + }, + { + "epoch": 0.25241639817798023, + "grad_norm": 0.5364854335784912, + "learning_rate": 0.0001, + "loss": 1.0049, + "step": 284 + }, + { + "epoch": 0.2533051883124097, + "grad_norm": 0.46868863701820374, + "learning_rate": 0.0001, + "loss": 0.9842, + "step": 285 + }, + { + "epoch": 0.2541939784468392, + "grad_norm": 0.4731464385986328, + "learning_rate": 0.0001, + "loss": 1.0195, + "step": 286 + }, + { + "epoch": 0.25508276858126877, + "grad_norm": 0.5144749879837036, + "learning_rate": 0.0001, + "loss": 1.0673, + "step": 287 + }, + { + "epoch": 0.25597155871569827, + "grad_norm": 0.44826894998550415, + "learning_rate": 0.0001, + "loss": 0.9981, + "step": 288 + }, + { + "epoch": 0.25686034885012776, + "grad_norm": 0.4612467885017395, + "learning_rate": 0.0001, + "loss": 1.0318, + "step": 289 + }, + { + "epoch": 0.25774913898455726, + "grad_norm": 0.4774060845375061, + "learning_rate": 0.0001, + "loss": 0.9806, + "step": 290 + }, + { + "epoch": 0.25863792911898675, + "grad_norm": 0.4614105820655823, + "learning_rate": 0.0001, + "loss": 1.0404, + "step": 291 + }, + { + "epoch": 0.2595267192534163, + "grad_norm": 0.47314900159835815, + "learning_rate": 0.0001, + "loss": 0.9932, + "step": 292 + }, + { + "epoch": 0.2604155093878458, + "grad_norm": 0.5009798407554626, + "learning_rate": 0.0001, + "loss": 1.1416, + "step": 293 + }, + { + "epoch": 0.2613042995222753, + "grad_norm": 0.4358547329902649, + "learning_rate": 0.0001, + "loss": 1.0379, + "step": 294 + }, + { + "epoch": 0.2621930896567048, + "grad_norm": 0.5527064204216003, + "learning_rate": 0.0001, + "loss": 0.9843, + "step": 295 + }, + { + "epoch": 0.26308187979113434, + "grad_norm": 0.47958534955978394, + "learning_rate": 0.0001, + "loss": 1.0235, + "step": 296 + }, + { + "epoch": 0.26397066992556384, + "grad_norm": 0.4394091069698334, + "learning_rate": 0.0001, + "loss": 0.9711, + "step": 297 + }, + { + "epoch": 0.26485946005999333, + "grad_norm": 0.5663338899612427, + "learning_rate": 0.0001, + "loss": 1.029, + "step": 298 + }, + { + "epoch": 0.26574825019442283, + "grad_norm": 0.5038536787033081, + "learning_rate": 0.0001, + "loss": 1.0097, + "step": 299 + }, + { + "epoch": 0.2666370403288523, + "grad_norm": 0.5386257767677307, + "learning_rate": 0.0001, + "loss": 1.0539, + "step": 300 + }, + { + "epoch": 0.2675258304632819, + "grad_norm": 0.4892950654029846, + "learning_rate": 0.0001, + "loss": 0.99, + "step": 301 + }, + { + "epoch": 0.2684146205977114, + "grad_norm": 0.5311320424079895, + "learning_rate": 0.0001, + "loss": 1.0922, + "step": 302 + }, + { + "epoch": 0.26930341073214087, + "grad_norm": 0.4981628656387329, + "learning_rate": 0.0001, + "loss": 1.0332, + "step": 303 + }, + { + "epoch": 0.27019220086657036, + "grad_norm": 0.6175600290298462, + "learning_rate": 0.0001, + "loss": 1.0552, + "step": 304 + }, + { + "epoch": 0.2710809910009999, + "grad_norm": 0.5665920376777649, + "learning_rate": 0.0001, + "loss": 0.9807, + "step": 305 + }, + { + "epoch": 0.2719697811354294, + "grad_norm": 0.4487343430519104, + "learning_rate": 0.0001, + "loss": 0.9961, + "step": 306 + }, + { + "epoch": 0.2728585712698589, + "grad_norm": 0.4994884133338928, + "learning_rate": 0.0001, + "loss": 0.9809, + "step": 307 + }, + { + "epoch": 0.2737473614042884, + "grad_norm": 0.5361630916595459, + "learning_rate": 0.0001, + "loss": 1.0396, + "step": 308 + }, + { + "epoch": 0.2746361515387179, + "grad_norm": 0.49830833077430725, + "learning_rate": 0.0001, + "loss": 0.966, + "step": 309 + }, + { + "epoch": 0.27552494167314745, + "grad_norm": 0.5319890975952148, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 310 + }, + { + "epoch": 0.27641373180757695, + "grad_norm": 0.5802849531173706, + "learning_rate": 0.0001, + "loss": 1.0198, + "step": 311 + }, + { + "epoch": 0.27730252194200644, + "grad_norm": 0.5048151016235352, + "learning_rate": 0.0001, + "loss": 0.9996, + "step": 312 + }, + { + "epoch": 0.27819131207643594, + "grad_norm": 0.44712725281715393, + "learning_rate": 0.0001, + "loss": 0.9757, + "step": 313 + }, + { + "epoch": 0.27908010221086543, + "grad_norm": 0.48854947090148926, + "learning_rate": 0.0001, + "loss": 1.0514, + "step": 314 + }, + { + "epoch": 0.279968892345295, + "grad_norm": 0.479911208152771, + "learning_rate": 0.0001, + "loss": 0.9588, + "step": 315 + }, + { + "epoch": 0.2808576824797245, + "grad_norm": 0.46088626980781555, + "learning_rate": 0.0001, + "loss": 1.0577, + "step": 316 + }, + { + "epoch": 0.281746472614154, + "grad_norm": 0.5798197388648987, + "learning_rate": 0.0001, + "loss": 1.0011, + "step": 317 + }, + { + "epoch": 0.28263526274858347, + "grad_norm": 0.45959633588790894, + "learning_rate": 0.0001, + "loss": 0.9358, + "step": 318 + }, + { + "epoch": 0.283524052883013, + "grad_norm": 0.4906099736690521, + "learning_rate": 0.0001, + "loss": 0.9432, + "step": 319 + }, + { + "epoch": 0.2844128430174425, + "grad_norm": 0.4779829680919647, + "learning_rate": 0.0001, + "loss": 0.9471, + "step": 320 + }, + { + "epoch": 0.285301633151872, + "grad_norm": 0.5950011014938354, + "learning_rate": 0.0001, + "loss": 1.0712, + "step": 321 + }, + { + "epoch": 0.2861904232863015, + "grad_norm": 0.5539191365242004, + "learning_rate": 0.0001, + "loss": 1.0329, + "step": 322 + }, + { + "epoch": 0.287079213420731, + "grad_norm": 0.44738510251045227, + "learning_rate": 0.0001, + "loss": 0.9935, + "step": 323 + }, + { + "epoch": 0.28796800355516056, + "grad_norm": 0.4675084054470062, + "learning_rate": 0.0001, + "loss": 0.9979, + "step": 324 + }, + { + "epoch": 0.28885679368959005, + "grad_norm": 0.38852187991142273, + "learning_rate": 0.0001, + "loss": 0.9497, + "step": 325 + }, + { + "epoch": 0.28974558382401955, + "grad_norm": 0.5008799433708191, + "learning_rate": 0.0001, + "loss": 0.9904, + "step": 326 + }, + { + "epoch": 0.29063437395844904, + "grad_norm": 0.4492400288581848, + "learning_rate": 0.0001, + "loss": 1.0489, + "step": 327 + }, + { + "epoch": 0.2915231640928786, + "grad_norm": 0.48956772685050964, + "learning_rate": 0.0001, + "loss": 1.0061, + "step": 328 + }, + { + "epoch": 0.2924119542273081, + "grad_norm": 0.5612773299217224, + "learning_rate": 0.0001, + "loss": 1.0947, + "step": 329 + }, + { + "epoch": 0.2933007443617376, + "grad_norm": 0.5352462530136108, + "learning_rate": 0.0001, + "loss": 1.1113, + "step": 330 + }, + { + "epoch": 0.2941895344961671, + "grad_norm": 0.43017029762268066, + "learning_rate": 0.0001, + "loss": 1.0291, + "step": 331 + }, + { + "epoch": 0.2950783246305966, + "grad_norm": 0.5087767839431763, + "learning_rate": 0.0001, + "loss": 1.0897, + "step": 332 + }, + { + "epoch": 0.29596711476502613, + "grad_norm": 0.38609907031059265, + "learning_rate": 0.0001, + "loss": 0.9629, + "step": 333 + }, + { + "epoch": 0.2968559048994556, + "grad_norm": 0.4797438979148865, + "learning_rate": 0.0001, + "loss": 0.9808, + "step": 334 + }, + { + "epoch": 0.2977446950338851, + "grad_norm": 0.4882568418979645, + "learning_rate": 0.0001, + "loss": 1.0812, + "step": 335 + }, + { + "epoch": 0.2986334851683146, + "grad_norm": 0.4409843385219574, + "learning_rate": 0.0001, + "loss": 1.0034, + "step": 336 + }, + { + "epoch": 0.2995222753027441, + "grad_norm": 0.5104953646659851, + "learning_rate": 0.0001, + "loss": 0.9186, + "step": 337 + }, + { + "epoch": 0.30041106543717366, + "grad_norm": 0.45493751764297485, + "learning_rate": 0.0001, + "loss": 0.9059, + "step": 338 + }, + { + "epoch": 0.30129985557160316, + "grad_norm": 0.4311971068382263, + "learning_rate": 0.0001, + "loss": 1.042, + "step": 339 + }, + { + "epoch": 0.30218864570603265, + "grad_norm": 0.43054771423339844, + "learning_rate": 0.0001, + "loss": 1.0336, + "step": 340 + }, + { + "epoch": 0.30307743584046215, + "grad_norm": 0.4950178861618042, + "learning_rate": 0.0001, + "loss": 1.0515, + "step": 341 + }, + { + "epoch": 0.3039662259748917, + "grad_norm": 0.4074246287345886, + "learning_rate": 0.0001, + "loss": 0.9163, + "step": 342 + }, + { + "epoch": 0.3048550161093212, + "grad_norm": 0.45394015312194824, + "learning_rate": 0.0001, + "loss": 1.0499, + "step": 343 + }, + { + "epoch": 0.3057438062437507, + "grad_norm": 0.453685998916626, + "learning_rate": 0.0001, + "loss": 1.021, + "step": 344 + }, + { + "epoch": 0.3066325963781802, + "grad_norm": 0.4687478542327881, + "learning_rate": 0.0001, + "loss": 1.0305, + "step": 345 + }, + { + "epoch": 0.3075213865126097, + "grad_norm": 0.4402327239513397, + "learning_rate": 0.0001, + "loss": 0.9547, + "step": 346 + }, + { + "epoch": 0.30841017664703924, + "grad_norm": 0.41369926929473877, + "learning_rate": 0.0001, + "loss": 0.981, + "step": 347 + }, + { + "epoch": 0.30929896678146873, + "grad_norm": 0.4821121096611023, + "learning_rate": 0.0001, + "loss": 0.9427, + "step": 348 + }, + { + "epoch": 0.3101877569158982, + "grad_norm": 0.4524393379688263, + "learning_rate": 0.0001, + "loss": 1.0277, + "step": 349 + }, + { + "epoch": 0.3110765470503277, + "grad_norm": 0.49870559573173523, + "learning_rate": 0.0001, + "loss": 1.0075, + "step": 350 + }, + { + "epoch": 0.3119653371847573, + "grad_norm": 0.5591082572937012, + "learning_rate": 0.0001, + "loss": 0.9901, + "step": 351 + }, + { + "epoch": 0.31285412731918677, + "grad_norm": 0.4724654257297516, + "learning_rate": 0.0001, + "loss": 0.9909, + "step": 352 + }, + { + "epoch": 0.31374291745361627, + "grad_norm": 0.4514271020889282, + "learning_rate": 0.0001, + "loss": 1.0259, + "step": 353 + }, + { + "epoch": 0.31463170758804576, + "grad_norm": 0.5684020519256592, + "learning_rate": 0.0001, + "loss": 0.9933, + "step": 354 + }, + { + "epoch": 0.31552049772247526, + "grad_norm": 0.442047119140625, + "learning_rate": 0.0001, + "loss": 1.0394, + "step": 355 + }, + { + "epoch": 0.3164092878569048, + "grad_norm": 0.47112616896629333, + "learning_rate": 0.0001, + "loss": 0.9287, + "step": 356 + }, + { + "epoch": 0.3172980779913343, + "grad_norm": 0.5187697410583496, + "learning_rate": 0.0001, + "loss": 0.9476, + "step": 357 + }, + { + "epoch": 0.3181868681257638, + "grad_norm": 0.43961942195892334, + "learning_rate": 0.0001, + "loss": 0.9474, + "step": 358 + }, + { + "epoch": 0.3190756582601933, + "grad_norm": 0.47375670075416565, + "learning_rate": 0.0001, + "loss": 1.0088, + "step": 359 + }, + { + "epoch": 0.31996444839462285, + "grad_norm": 0.4810321629047394, + "learning_rate": 0.0001, + "loss": 0.9628, + "step": 360 + }, + { + "epoch": 0.32085323852905234, + "grad_norm": 0.5187336802482605, + "learning_rate": 0.0001, + "loss": 1.0055, + "step": 361 + }, + { + "epoch": 0.32174202866348184, + "grad_norm": 0.47308310866355896, + "learning_rate": 0.0001, + "loss": 1.0005, + "step": 362 + }, + { + "epoch": 0.32263081879791133, + "grad_norm": 1.5481101274490356, + "learning_rate": 0.0001, + "loss": 1.0754, + "step": 363 + }, + { + "epoch": 0.32351960893234083, + "grad_norm": 0.4808347523212433, + "learning_rate": 0.0001, + "loss": 1.0567, + "step": 364 + }, + { + "epoch": 0.3244083990667704, + "grad_norm": 0.40874922275543213, + "learning_rate": 0.0001, + "loss": 1.0153, + "step": 365 + }, + { + "epoch": 0.3252971892011999, + "grad_norm": 0.5171230435371399, + "learning_rate": 0.0001, + "loss": 0.9808, + "step": 366 + }, + { + "epoch": 0.3261859793356294, + "grad_norm": 0.48159992694854736, + "learning_rate": 0.0001, + "loss": 0.9873, + "step": 367 + }, + { + "epoch": 0.32707476947005887, + "grad_norm": 0.44044238328933716, + "learning_rate": 0.0001, + "loss": 0.9608, + "step": 368 + }, + { + "epoch": 0.32796355960448836, + "grad_norm": 0.4674980342388153, + "learning_rate": 0.0001, + "loss": 0.9263, + "step": 369 + }, + { + "epoch": 0.3288523497389179, + "grad_norm": 0.5395987033843994, + "learning_rate": 0.0001, + "loss": 0.9548, + "step": 370 + }, + { + "epoch": 0.3297411398733474, + "grad_norm": 0.49539071321487427, + "learning_rate": 0.0001, + "loss": 1.0202, + "step": 371 + }, + { + "epoch": 0.3306299300077769, + "grad_norm": 0.4859803020954132, + "learning_rate": 0.0001, + "loss": 0.9388, + "step": 372 + }, + { + "epoch": 0.3315187201422064, + "grad_norm": 0.4504952132701874, + "learning_rate": 0.0001, + "loss": 1.0092, + "step": 373 + }, + { + "epoch": 0.33240751027663595, + "grad_norm": 0.5386714339256287, + "learning_rate": 0.0001, + "loss": 1.0461, + "step": 374 + }, + { + "epoch": 0.33329630041106545, + "grad_norm": 0.5611424446105957, + "learning_rate": 0.0001, + "loss": 1.0967, + "step": 375 + }, + { + "epoch": 0.33418509054549494, + "grad_norm": 0.44047975540161133, + "learning_rate": 0.0001, + "loss": 0.8973, + "step": 376 + }, + { + "epoch": 0.33507388067992444, + "grad_norm": 0.5137032866477966, + "learning_rate": 0.0001, + "loss": 1.0303, + "step": 377 + }, + { + "epoch": 0.33596267081435394, + "grad_norm": 0.47674351930618286, + "learning_rate": 0.0001, + "loss": 1.0713, + "step": 378 + }, + { + "epoch": 0.3368514609487835, + "grad_norm": 0.4222189486026764, + "learning_rate": 0.0001, + "loss": 0.9894, + "step": 379 + }, + { + "epoch": 0.337740251083213, + "grad_norm": 0.41975629329681396, + "learning_rate": 0.0001, + "loss": 0.9526, + "step": 380 + }, + { + "epoch": 0.3386290412176425, + "grad_norm": 0.4654853045940399, + "learning_rate": 0.0001, + "loss": 0.8748, + "step": 381 + }, + { + "epoch": 0.339517831352072, + "grad_norm": 0.39208483695983887, + "learning_rate": 0.0001, + "loss": 0.9799, + "step": 382 + }, + { + "epoch": 0.3404066214865015, + "grad_norm": 0.5432955026626587, + "learning_rate": 0.0001, + "loss": 1.061, + "step": 383 + }, + { + "epoch": 0.341295411620931, + "grad_norm": 0.44643473625183105, + "learning_rate": 0.0001, + "loss": 0.9823, + "step": 384 + }, + { + "epoch": 0.3421842017553605, + "grad_norm": 0.5307298302650452, + "learning_rate": 0.0001, + "loss": 1.0406, + "step": 385 + }, + { + "epoch": 0.34307299188979, + "grad_norm": 0.5106935501098633, + "learning_rate": 0.0001, + "loss": 1.0473, + "step": 386 + }, + { + "epoch": 0.3439617820242195, + "grad_norm": 0.4799475371837616, + "learning_rate": 0.0001, + "loss": 1.0546, + "step": 387 + }, + { + "epoch": 0.34485057215864906, + "grad_norm": 0.37563350796699524, + "learning_rate": 0.0001, + "loss": 1.0025, + "step": 388 + }, + { + "epoch": 0.34573936229307856, + "grad_norm": 0.43214109539985657, + "learning_rate": 0.0001, + "loss": 0.9771, + "step": 389 + }, + { + "epoch": 0.34662815242750805, + "grad_norm": 0.41184356808662415, + "learning_rate": 0.0001, + "loss": 0.9997, + "step": 390 + }, + { + "epoch": 0.34751694256193755, + "grad_norm": 0.49145662784576416, + "learning_rate": 0.0001, + "loss": 1.0838, + "step": 391 + }, + { + "epoch": 0.34840573269636704, + "grad_norm": 0.39192360639572144, + "learning_rate": 0.0001, + "loss": 0.9671, + "step": 392 + }, + { + "epoch": 0.3492945228307966, + "grad_norm": 0.49615415930747986, + "learning_rate": 0.0001, + "loss": 0.9974, + "step": 393 + }, + { + "epoch": 0.3501833129652261, + "grad_norm": 0.48595911264419556, + "learning_rate": 0.0001, + "loss": 0.9552, + "step": 394 + }, + { + "epoch": 0.3510721030996556, + "grad_norm": 0.4774535894393921, + "learning_rate": 0.0001, + "loss": 0.9908, + "step": 395 + }, + { + "epoch": 0.3519608932340851, + "grad_norm": 0.42019304633140564, + "learning_rate": 0.0001, + "loss": 0.9919, + "step": 396 + }, + { + "epoch": 0.35284968336851463, + "grad_norm": 0.4863130450248718, + "learning_rate": 0.0001, + "loss": 0.9888, + "step": 397 + }, + { + "epoch": 0.35373847350294413, + "grad_norm": 0.5684654712677002, + "learning_rate": 0.0001, + "loss": 1.0284, + "step": 398 + }, + { + "epoch": 0.3546272636373736, + "grad_norm": 0.466160386800766, + "learning_rate": 0.0001, + "loss": 0.999, + "step": 399 + }, + { + "epoch": 0.3555160537718031, + "grad_norm": 0.4259321689605713, + "learning_rate": 0.0001, + "loss": 0.9475, + "step": 400 + }, + { + "epoch": 0.3564048439062326, + "grad_norm": 0.4329473376274109, + "learning_rate": 0.0001, + "loss": 0.971, + "step": 401 + }, + { + "epoch": 0.35729363404066217, + "grad_norm": 0.44069400429725647, + "learning_rate": 0.0001, + "loss": 1.0213, + "step": 402 + }, + { + "epoch": 0.35818242417509166, + "grad_norm": 0.46225759387016296, + "learning_rate": 0.0001, + "loss": 0.9891, + "step": 403 + }, + { + "epoch": 0.35907121430952116, + "grad_norm": 0.4280588924884796, + "learning_rate": 0.0001, + "loss": 1.1054, + "step": 404 + }, + { + "epoch": 0.35996000444395065, + "grad_norm": 0.4039415419101715, + "learning_rate": 0.0001, + "loss": 0.9852, + "step": 405 + }, + { + "epoch": 0.3608487945783802, + "grad_norm": 0.45364150404930115, + "learning_rate": 0.0001, + "loss": 1.0471, + "step": 406 + }, + { + "epoch": 0.3617375847128097, + "grad_norm": 0.5891258716583252, + "learning_rate": 0.0001, + "loss": 1.103, + "step": 407 + }, + { + "epoch": 0.3626263748472392, + "grad_norm": 0.47017595171928406, + "learning_rate": 0.0001, + "loss": 0.8947, + "step": 408 + }, + { + "epoch": 0.3635151649816687, + "grad_norm": 0.43023166060447693, + "learning_rate": 0.0001, + "loss": 1.0324, + "step": 409 + }, + { + "epoch": 0.3644039551160982, + "grad_norm": 0.45753541588783264, + "learning_rate": 0.0001, + "loss": 0.9737, + "step": 410 + }, + { + "epoch": 0.36529274525052774, + "grad_norm": 0.48378103971481323, + "learning_rate": 0.0001, + "loss": 1.059, + "step": 411 + }, + { + "epoch": 0.36618153538495724, + "grad_norm": 0.4665151834487915, + "learning_rate": 0.0001, + "loss": 0.9722, + "step": 412 + }, + { + "epoch": 0.36707032551938673, + "grad_norm": 5.9081315994262695, + "learning_rate": 0.0001, + "loss": 0.9235, + "step": 413 + }, + { + "epoch": 0.3679591156538162, + "grad_norm": 0.42533057928085327, + "learning_rate": 0.0001, + "loss": 0.9788, + "step": 414 + }, + { + "epoch": 0.3688479057882458, + "grad_norm": 0.5004814267158508, + "learning_rate": 0.0001, + "loss": 1.003, + "step": 415 + }, + { + "epoch": 0.3697366959226753, + "grad_norm": 0.610554039478302, + "learning_rate": 0.0001, + "loss": 1.0865, + "step": 416 + }, + { + "epoch": 0.37062548605710477, + "grad_norm": 0.49171337485313416, + "learning_rate": 0.0001, + "loss": 1.0182, + "step": 417 + }, + { + "epoch": 0.37151427619153427, + "grad_norm": 0.47732532024383545, + "learning_rate": 0.0001, + "loss": 0.9888, + "step": 418 + }, + { + "epoch": 0.37240306632596376, + "grad_norm": 0.418824702501297, + "learning_rate": 0.0001, + "loss": 1.0082, + "step": 419 + }, + { + "epoch": 0.3732918564603933, + "grad_norm": 0.4232107698917389, + "learning_rate": 0.0001, + "loss": 0.9977, + "step": 420 + }, + { + "epoch": 0.3741806465948228, + "grad_norm": 0.45239681005477905, + "learning_rate": 0.0001, + "loss": 0.9741, + "step": 421 + }, + { + "epoch": 0.3750694367292523, + "grad_norm": 0.4842644929885864, + "learning_rate": 0.0001, + "loss": 0.9677, + "step": 422 + }, + { + "epoch": 0.3759582268636818, + "grad_norm": 0.478947252035141, + "learning_rate": 0.0001, + "loss": 0.9718, + "step": 423 + }, + { + "epoch": 0.3768470169981113, + "grad_norm": 0.5387521386146545, + "learning_rate": 0.0001, + "loss": 1.0737, + "step": 424 + }, + { + "epoch": 0.37773580713254085, + "grad_norm": 0.5804430246353149, + "learning_rate": 0.0001, + "loss": 1.0273, + "step": 425 + }, + { + "epoch": 0.37862459726697034, + "grad_norm": 0.4965243935585022, + "learning_rate": 0.0001, + "loss": 1.1163, + "step": 426 + }, + { + "epoch": 0.37951338740139984, + "grad_norm": 0.5330107808113098, + "learning_rate": 0.0001, + "loss": 1.0256, + "step": 427 + }, + { + "epoch": 0.38040217753582933, + "grad_norm": 0.4444815218448639, + "learning_rate": 0.0001, + "loss": 0.9643, + "step": 428 + }, + { + "epoch": 0.3812909676702589, + "grad_norm": 0.5239233374595642, + "learning_rate": 0.0001, + "loss": 0.9893, + "step": 429 + }, + { + "epoch": 0.3821797578046884, + "grad_norm": 0.4272307753562927, + "learning_rate": 0.0001, + "loss": 0.9896, + "step": 430 + }, + { + "epoch": 0.3830685479391179, + "grad_norm": 0.422820121049881, + "learning_rate": 0.0001, + "loss": 0.9722, + "step": 431 + }, + { + "epoch": 0.38395733807354737, + "grad_norm": 0.45066556334495544, + "learning_rate": 0.0001, + "loss": 0.9266, + "step": 432 + }, + { + "epoch": 0.38484612820797687, + "grad_norm": 0.40709954500198364, + "learning_rate": 0.0001, + "loss": 0.9006, + "step": 433 + }, + { + "epoch": 0.3857349183424064, + "grad_norm": 0.45301303267478943, + "learning_rate": 0.0001, + "loss": 0.9844, + "step": 434 + }, + { + "epoch": 0.3866237084768359, + "grad_norm": 0.4441263675689697, + "learning_rate": 0.0001, + "loss": 1.0039, + "step": 435 + }, + { + "epoch": 0.3875124986112654, + "grad_norm": 0.4564574062824249, + "learning_rate": 0.0001, + "loss": 1.0397, + "step": 436 + }, + { + "epoch": 0.3884012887456949, + "grad_norm": 0.5104243755340576, + "learning_rate": 0.0001, + "loss": 1.06, + "step": 437 + }, + { + "epoch": 0.38929007888012446, + "grad_norm": 0.4639466106891632, + "learning_rate": 0.0001, + "loss": 1.028, + "step": 438 + }, + { + "epoch": 0.39017886901455395, + "grad_norm": 0.4268662929534912, + "learning_rate": 0.0001, + "loss": 0.9971, + "step": 439 + }, + { + "epoch": 0.39106765914898345, + "grad_norm": 0.4981948435306549, + "learning_rate": 0.0001, + "loss": 0.9946, + "step": 440 + }, + { + "epoch": 0.39195644928341294, + "grad_norm": 0.4488162696361542, + "learning_rate": 0.0001, + "loss": 0.9536, + "step": 441 + }, + { + "epoch": 0.39284523941784244, + "grad_norm": 0.4636482298374176, + "learning_rate": 0.0001, + "loss": 1.1277, + "step": 442 + }, + { + "epoch": 0.393734029552272, + "grad_norm": 0.46133679151535034, + "learning_rate": 0.0001, + "loss": 0.9451, + "step": 443 + }, + { + "epoch": 0.3946228196867015, + "grad_norm": 0.4021439850330353, + "learning_rate": 0.0001, + "loss": 1.0127, + "step": 444 + }, + { + "epoch": 0.395511609821131, + "grad_norm": 0.46771371364593506, + "learning_rate": 0.0001, + "loss": 1.0103, + "step": 445 + }, + { + "epoch": 0.3964003999555605, + "grad_norm": 0.5152266025543213, + "learning_rate": 0.0001, + "loss": 1.0373, + "step": 446 + }, + { + "epoch": 0.39728919008999, + "grad_norm": 0.42209741473197937, + "learning_rate": 0.0001, + "loss": 0.9229, + "step": 447 + }, + { + "epoch": 0.3981779802244195, + "grad_norm": 0.46712151169776917, + "learning_rate": 0.0001, + "loss": 0.9512, + "step": 448 + }, + { + "epoch": 0.399066770358849, + "grad_norm": 4.181483268737793, + "learning_rate": 0.0001, + "loss": 0.9367, + "step": 449 + }, + { + "epoch": 0.3999555604932785, + "grad_norm": 0.3845648467540741, + "learning_rate": 0.0001, + "loss": 0.9741, + "step": 450 + }, + { + "epoch": 0.400844350627708, + "grad_norm": 2.03074049949646, + "learning_rate": 0.0001, + "loss": 0.9775, + "step": 451 + }, + { + "epoch": 0.40173314076213756, + "grad_norm": 0.5015456080436707, + "learning_rate": 0.0001, + "loss": 1.0234, + "step": 452 + }, + { + "epoch": 0.40262193089656706, + "grad_norm": 0.4838273525238037, + "learning_rate": 0.0001, + "loss": 0.9653, + "step": 453 + }, + { + "epoch": 0.40351072103099656, + "grad_norm": 0.5604076981544495, + "learning_rate": 0.0001, + "loss": 0.9732, + "step": 454 + }, + { + "epoch": 0.40439951116542605, + "grad_norm": 0.5440881252288818, + "learning_rate": 0.0001, + "loss": 1.0301, + "step": 455 + }, + { + "epoch": 0.40528830129985555, + "grad_norm": 0.9089745879173279, + "learning_rate": 0.0001, + "loss": 1.0371, + "step": 456 + }, + { + "epoch": 0.4061770914342851, + "grad_norm": 0.4472905993461609, + "learning_rate": 0.0001, + "loss": 1.0375, + "step": 457 + }, + { + "epoch": 0.4070658815687146, + "grad_norm": 0.4706718325614929, + "learning_rate": 0.0001, + "loss": 0.9966, + "step": 458 + }, + { + "epoch": 0.4079546717031441, + "grad_norm": 0.48551270365715027, + "learning_rate": 0.0001, + "loss": 0.9577, + "step": 459 + }, + { + "epoch": 0.4088434618375736, + "grad_norm": 0.45593079924583435, + "learning_rate": 0.0001, + "loss": 0.99, + "step": 460 + }, + { + "epoch": 0.40973225197200314, + "grad_norm": 0.4240357577800751, + "learning_rate": 0.0001, + "loss": 1.0105, + "step": 461 + }, + { + "epoch": 0.41062104210643263, + "grad_norm": 0.44513946771621704, + "learning_rate": 0.0001, + "loss": 0.9697, + "step": 462 + }, + { + "epoch": 0.41150983224086213, + "grad_norm": 0.43077144026756287, + "learning_rate": 0.0001, + "loss": 1.0168, + "step": 463 + }, + { + "epoch": 0.4123986223752916, + "grad_norm": 0.47029492259025574, + "learning_rate": 0.0001, + "loss": 0.94, + "step": 464 + }, + { + "epoch": 0.4132874125097211, + "grad_norm": 0.44067198038101196, + "learning_rate": 0.0001, + "loss": 0.9443, + "step": 465 + }, + { + "epoch": 0.41417620264415067, + "grad_norm": 0.6291742324829102, + "learning_rate": 0.0001, + "loss": 0.9927, + "step": 466 + }, + { + "epoch": 0.41506499277858017, + "grad_norm": 0.49682438373565674, + "learning_rate": 0.0001, + "loss": 0.9411, + "step": 467 + }, + { + "epoch": 0.41595378291300966, + "grad_norm": 0.4587923586368561, + "learning_rate": 0.0001, + "loss": 1.0423, + "step": 468 + }, + { + "epoch": 0.41684257304743916, + "grad_norm": 0.5601244568824768, + "learning_rate": 0.0001, + "loss": 1.0682, + "step": 469 + }, + { + "epoch": 0.4177313631818687, + "grad_norm": 0.4534125328063965, + "learning_rate": 0.0001, + "loss": 1.0591, + "step": 470 + }, + { + "epoch": 0.4186201533162982, + "grad_norm": 0.469804972410202, + "learning_rate": 0.0001, + "loss": 1.0893, + "step": 471 + }, + { + "epoch": 0.4195089434507277, + "grad_norm": 0.4769747257232666, + "learning_rate": 0.0001, + "loss": 0.9974, + "step": 472 + }, + { + "epoch": 0.4203977335851572, + "grad_norm": 0.4488048553466797, + "learning_rate": 0.0001, + "loss": 0.9681, + "step": 473 + }, + { + "epoch": 0.4212865237195867, + "grad_norm": 0.4423130750656128, + "learning_rate": 0.0001, + "loss": 1.0732, + "step": 474 + }, + { + "epoch": 0.42217531385401624, + "grad_norm": 0.40248462557792664, + "learning_rate": 0.0001, + "loss": 0.9631, + "step": 475 + }, + { + "epoch": 0.42306410398844574, + "grad_norm": 0.4668256640434265, + "learning_rate": 0.0001, + "loss": 0.9771, + "step": 476 + }, + { + "epoch": 0.42395289412287523, + "grad_norm": 0.46433717012405396, + "learning_rate": 0.0001, + "loss": 1.02, + "step": 477 + }, + { + "epoch": 0.42484168425730473, + "grad_norm": 0.8196445107460022, + "learning_rate": 0.0001, + "loss": 0.9596, + "step": 478 + }, + { + "epoch": 0.4257304743917342, + "grad_norm": 0.5228718519210815, + "learning_rate": 0.0001, + "loss": 1.0023, + "step": 479 + }, + { + "epoch": 0.4266192645261638, + "grad_norm": 0.4933236241340637, + "learning_rate": 0.0001, + "loss": 0.936, + "step": 480 + }, + { + "epoch": 0.4275080546605933, + "grad_norm": 0.4738243818283081, + "learning_rate": 0.0001, + "loss": 0.9954, + "step": 481 + }, + { + "epoch": 0.42839684479502277, + "grad_norm": 0.4303346872329712, + "learning_rate": 0.0001, + "loss": 1.0311, + "step": 482 + }, + { + "epoch": 0.42928563492945226, + "grad_norm": 0.3946980834007263, + "learning_rate": 0.0001, + "loss": 0.9655, + "step": 483 + }, + { + "epoch": 0.4301744250638818, + "grad_norm": 0.4781205952167511, + "learning_rate": 0.0001, + "loss": 1.0161, + "step": 484 + }, + { + "epoch": 0.4310632151983113, + "grad_norm": 0.41279280185699463, + "learning_rate": 0.0001, + "loss": 0.984, + "step": 485 + }, + { + "epoch": 0.4319520053327408, + "grad_norm": 0.42448437213897705, + "learning_rate": 0.0001, + "loss": 1.0044, + "step": 486 + }, + { + "epoch": 0.4328407954671703, + "grad_norm": 0.42571067810058594, + "learning_rate": 0.0001, + "loss": 0.9963, + "step": 487 + }, + { + "epoch": 0.4337295856015998, + "grad_norm": 0.43724721670150757, + "learning_rate": 0.0001, + "loss": 0.9868, + "step": 488 + }, + { + "epoch": 0.43461837573602935, + "grad_norm": 0.506829023361206, + "learning_rate": 0.0001, + "loss": 1.045, + "step": 489 + }, + { + "epoch": 0.43550716587045885, + "grad_norm": 0.3961758315563202, + "learning_rate": 0.0001, + "loss": 0.9235, + "step": 490 + }, + { + "epoch": 0.43639595600488834, + "grad_norm": 0.41079434752464294, + "learning_rate": 0.0001, + "loss": 0.9944, + "step": 491 + }, + { + "epoch": 0.43728474613931784, + "grad_norm": 0.4370504915714264, + "learning_rate": 0.0001, + "loss": 0.9925, + "step": 492 + }, + { + "epoch": 0.4381735362737474, + "grad_norm": 0.4316272735595703, + "learning_rate": 0.0001, + "loss": 1.0229, + "step": 493 + }, + { + "epoch": 0.4390623264081769, + "grad_norm": 0.4933658540248871, + "learning_rate": 0.0001, + "loss": 1.0974, + "step": 494 + }, + { + "epoch": 0.4399511165426064, + "grad_norm": 0.42228391766548157, + "learning_rate": 0.0001, + "loss": 1.0277, + "step": 495 + }, + { + "epoch": 0.4408399066770359, + "grad_norm": 0.45421552658081055, + "learning_rate": 0.0001, + "loss": 0.8842, + "step": 496 + }, + { + "epoch": 0.44172869681146537, + "grad_norm": 0.4748377501964569, + "learning_rate": 0.0001, + "loss": 0.9488, + "step": 497 + }, + { + "epoch": 0.4426174869458949, + "grad_norm": 0.4040847718715668, + "learning_rate": 0.0001, + "loss": 0.9696, + "step": 498 + }, + { + "epoch": 0.4435062770803244, + "grad_norm": 0.42285656929016113, + "learning_rate": 0.0001, + "loss": 0.9637, + "step": 499 + }, + { + "epoch": 0.4443950672147539, + "grad_norm": 0.3885044455528259, + "learning_rate": 0.0001, + "loss": 0.985, + "step": 500 + }, + { + "epoch": 0.4452838573491834, + "grad_norm": 0.45078715682029724, + "learning_rate": 0.0001, + "loss": 0.9538, + "step": 501 + }, + { + "epoch": 0.4461726474836129, + "grad_norm": 0.4214499294757843, + "learning_rate": 0.0001, + "loss": 0.8488, + "step": 502 + }, + { + "epoch": 0.44706143761804246, + "grad_norm": 0.6104442477226257, + "learning_rate": 0.0001, + "loss": 1.0681, + "step": 503 + }, + { + "epoch": 0.44795022775247195, + "grad_norm": 0.49673840403556824, + "learning_rate": 0.0001, + "loss": 0.9875, + "step": 504 + }, + { + "epoch": 0.44883901788690145, + "grad_norm": 0.4738848805427551, + "learning_rate": 0.0001, + "loss": 0.9738, + "step": 505 + }, + { + "epoch": 0.44972780802133094, + "grad_norm": 0.4084794223308563, + "learning_rate": 0.0001, + "loss": 0.9617, + "step": 506 + }, + { + "epoch": 0.4506165981557605, + "grad_norm": 0.43893682956695557, + "learning_rate": 0.0001, + "loss": 0.9553, + "step": 507 + }, + { + "epoch": 0.45150538829019, + "grad_norm": 0.4741009771823883, + "learning_rate": 0.0001, + "loss": 0.9863, + "step": 508 + }, + { + "epoch": 0.4523941784246195, + "grad_norm": 0.42300963401794434, + "learning_rate": 0.0001, + "loss": 0.9669, + "step": 509 + }, + { + "epoch": 0.453282968559049, + "grad_norm": 0.3663196265697479, + "learning_rate": 0.0001, + "loss": 0.9606, + "step": 510 + }, + { + "epoch": 0.4541717586934785, + "grad_norm": 0.4289178252220154, + "learning_rate": 0.0001, + "loss": 0.9093, + "step": 511 + }, + { + "epoch": 0.45506054882790803, + "grad_norm": 0.41236793994903564, + "learning_rate": 0.0001, + "loss": 0.9582, + "step": 512 + }, + { + "epoch": 0.4559493389623375, + "grad_norm": 0.38569167256355286, + "learning_rate": 0.0001, + "loss": 0.9804, + "step": 513 + }, + { + "epoch": 0.456838129096767, + "grad_norm": 0.42629268765449524, + "learning_rate": 0.0001, + "loss": 0.9111, + "step": 514 + }, + { + "epoch": 0.4577269192311965, + "grad_norm": 0.4302125573158264, + "learning_rate": 0.0001, + "loss": 0.9435, + "step": 515 + }, + { + "epoch": 0.45861570936562607, + "grad_norm": 0.46809178590774536, + "learning_rate": 0.0001, + "loss": 1.0059, + "step": 516 + }, + { + "epoch": 0.45950449950005556, + "grad_norm": 0.4712200164794922, + "learning_rate": 0.0001, + "loss": 0.9681, + "step": 517 + }, + { + "epoch": 0.46039328963448506, + "grad_norm": 0.4966319501399994, + "learning_rate": 0.0001, + "loss": 1.0278, + "step": 518 + }, + { + "epoch": 0.46128207976891455, + "grad_norm": 0.41810038685798645, + "learning_rate": 0.0001, + "loss": 1.0128, + "step": 519 + }, + { + "epoch": 0.46217086990334405, + "grad_norm": 0.4766371250152588, + "learning_rate": 0.0001, + "loss": 0.9306, + "step": 520 + }, + { + "epoch": 0.4630596600377736, + "grad_norm": 0.47531601786613464, + "learning_rate": 0.0001, + "loss": 1.0213, + "step": 521 + }, + { + "epoch": 0.4639484501722031, + "grad_norm": 0.4246899485588074, + "learning_rate": 0.0001, + "loss": 0.9415, + "step": 522 + }, + { + "epoch": 0.4648372403066326, + "grad_norm": 0.38327568769454956, + "learning_rate": 0.0001, + "loss": 0.976, + "step": 523 + }, + { + "epoch": 0.4657260304410621, + "grad_norm": 0.45601171255111694, + "learning_rate": 0.0001, + "loss": 0.9481, + "step": 524 + }, + { + "epoch": 0.4666148205754916, + "grad_norm": 0.4424237608909607, + "learning_rate": 0.0001, + "loss": 0.974, + "step": 525 + }, + { + "epoch": 0.46750361070992114, + "grad_norm": 0.45187127590179443, + "learning_rate": 0.0001, + "loss": 1.1636, + "step": 526 + }, + { + "epoch": 0.46839240084435063, + "grad_norm": 0.44865912199020386, + "learning_rate": 0.0001, + "loss": 0.9951, + "step": 527 + }, + { + "epoch": 0.4692811909787801, + "grad_norm": 0.40876081585884094, + "learning_rate": 0.0001, + "loss": 0.9765, + "step": 528 + }, + { + "epoch": 0.4701699811132096, + "grad_norm": 0.3936661183834076, + "learning_rate": 0.0001, + "loss": 0.9935, + "step": 529 + }, + { + "epoch": 0.4710587712476392, + "grad_norm": 0.422152578830719, + "learning_rate": 0.0001, + "loss": 0.9956, + "step": 530 + }, + { + "epoch": 0.47194756138206867, + "grad_norm": 0.40520817041397095, + "learning_rate": 0.0001, + "loss": 0.9609, + "step": 531 + }, + { + "epoch": 0.47283635151649817, + "grad_norm": 0.43614640831947327, + "learning_rate": 0.0001, + "loss": 1.0283, + "step": 532 + }, + { + "epoch": 0.47372514165092766, + "grad_norm": 0.4783385694026947, + "learning_rate": 0.0001, + "loss": 1.0388, + "step": 533 + }, + { + "epoch": 0.47461393178535716, + "grad_norm": 0.4622490704059601, + "learning_rate": 0.0001, + "loss": 1.088, + "step": 534 + }, + { + "epoch": 0.4755027219197867, + "grad_norm": 0.43488460779190063, + "learning_rate": 0.0001, + "loss": 0.9386, + "step": 535 + }, + { + "epoch": 0.4763915120542162, + "grad_norm": 0.40164855122566223, + "learning_rate": 0.0001, + "loss": 0.9476, + "step": 536 + }, + { + "epoch": 0.4772803021886457, + "grad_norm": 0.35324132442474365, + "learning_rate": 0.0001, + "loss": 0.9957, + "step": 537 + }, + { + "epoch": 0.4781690923230752, + "grad_norm": 0.39218297600746155, + "learning_rate": 0.0001, + "loss": 0.9682, + "step": 538 + }, + { + "epoch": 0.47905788245750475, + "grad_norm": 0.4563474953174591, + "learning_rate": 0.0001, + "loss": 0.9479, + "step": 539 + }, + { + "epoch": 0.47994667259193424, + "grad_norm": 0.4176347255706787, + "learning_rate": 0.0001, + "loss": 0.9101, + "step": 540 + }, + { + "epoch": 0.48083546272636374, + "grad_norm": 0.38945290446281433, + "learning_rate": 0.0001, + "loss": 0.9083, + "step": 541 + }, + { + "epoch": 0.48172425286079323, + "grad_norm": 0.35722264647483826, + "learning_rate": 0.0001, + "loss": 0.8814, + "step": 542 + }, + { + "epoch": 0.48261304299522273, + "grad_norm": 0.43666157126426697, + "learning_rate": 0.0001, + "loss": 0.9738, + "step": 543 + }, + { + "epoch": 0.4835018331296523, + "grad_norm": 0.462503045797348, + "learning_rate": 0.0001, + "loss": 0.9315, + "step": 544 + }, + { + "epoch": 0.4843906232640818, + "grad_norm": 0.48999452590942383, + "learning_rate": 0.0001, + "loss": 1.024, + "step": 545 + }, + { + "epoch": 0.4852794133985113, + "grad_norm": 0.5173038840293884, + "learning_rate": 0.0001, + "loss": 0.885, + "step": 546 + }, + { + "epoch": 0.48616820353294077, + "grad_norm": 0.442202627658844, + "learning_rate": 0.0001, + "loss": 0.9667, + "step": 547 + }, + { + "epoch": 0.4870569936673703, + "grad_norm": 0.5005183219909668, + "learning_rate": 0.0001, + "loss": 1.0566, + "step": 548 + }, + { + "epoch": 0.4879457838017998, + "grad_norm": 0.3552423417568207, + "learning_rate": 0.0001, + "loss": 0.9279, + "step": 549 + }, + { + "epoch": 0.4888345739362293, + "grad_norm": 0.46777617931365967, + "learning_rate": 0.0001, + "loss": 0.9534, + "step": 550 + }, + { + "epoch": 0.4897233640706588, + "grad_norm": 0.43491941690444946, + "learning_rate": 0.0001, + "loss": 1.069, + "step": 551 + }, + { + "epoch": 0.4906121542050883, + "grad_norm": 0.4217972755432129, + "learning_rate": 0.0001, + "loss": 1.0236, + "step": 552 + }, + { + "epoch": 0.49150094433951785, + "grad_norm": 0.43809306621551514, + "learning_rate": 0.0001, + "loss": 0.9967, + "step": 553 + }, + { + "epoch": 0.49238973447394735, + "grad_norm": 0.41759732365608215, + "learning_rate": 0.0001, + "loss": 0.9486, + "step": 554 + }, + { + "epoch": 0.49327852460837684, + "grad_norm": 0.4331734776496887, + "learning_rate": 0.0001, + "loss": 0.9531, + "step": 555 + }, + { + "epoch": 0.49416731474280634, + "grad_norm": 0.4199782907962799, + "learning_rate": 0.0001, + "loss": 0.9904, + "step": 556 + }, + { + "epoch": 0.49505610487723584, + "grad_norm": 0.4305680990219116, + "learning_rate": 0.0001, + "loss": 0.909, + "step": 557 + }, + { + "epoch": 0.4959448950116654, + "grad_norm": 0.5136706233024597, + "learning_rate": 0.0001, + "loss": 1.121, + "step": 558 + }, + { + "epoch": 0.4968336851460949, + "grad_norm": 0.429557204246521, + "learning_rate": 0.0001, + "loss": 0.9809, + "step": 559 + }, + { + "epoch": 0.4977224752805244, + "grad_norm": 0.41944217681884766, + "learning_rate": 0.0001, + "loss": 0.9802, + "step": 560 + }, + { + "epoch": 0.4986112654149539, + "grad_norm": 0.43246370553970337, + "learning_rate": 0.0001, + "loss": 0.9973, + "step": 561 + }, + { + "epoch": 0.4995000555493834, + "grad_norm": 0.3798202872276306, + "learning_rate": 0.0001, + "loss": 0.9367, + "step": 562 + }, + { + "epoch": 0.5003888456838129, + "grad_norm": 0.43042704463005066, + "learning_rate": 0.0001, + "loss": 1.075, + "step": 563 + }, + { + "epoch": 0.5012776358182425, + "grad_norm": 0.3733251392841339, + "learning_rate": 0.0001, + "loss": 0.9731, + "step": 564 + }, + { + "epoch": 0.5021664259526719, + "grad_norm": 0.423252135515213, + "learning_rate": 0.0001, + "loss": 0.9788, + "step": 565 + }, + { + "epoch": 0.5030552160871015, + "grad_norm": 0.43651118874549866, + "learning_rate": 0.0001, + "loss": 0.9289, + "step": 566 + }, + { + "epoch": 0.5039440062215309, + "grad_norm": 0.44451677799224854, + "learning_rate": 0.0001, + "loss": 0.938, + "step": 567 + }, + { + "epoch": 0.5048327963559605, + "grad_norm": 0.4594007730484009, + "learning_rate": 0.0001, + "loss": 0.9562, + "step": 568 + }, + { + "epoch": 0.50572158649039, + "grad_norm": 0.5364981889724731, + "learning_rate": 0.0001, + "loss": 0.9239, + "step": 569 + }, + { + "epoch": 0.5066103766248194, + "grad_norm": 0.45558059215545654, + "learning_rate": 0.0001, + "loss": 1.0017, + "step": 570 + }, + { + "epoch": 0.507499166759249, + "grad_norm": 0.42612048983573914, + "learning_rate": 0.0001, + "loss": 0.9401, + "step": 571 + }, + { + "epoch": 0.5083879568936784, + "grad_norm": 0.4538538157939911, + "learning_rate": 0.0001, + "loss": 0.9979, + "step": 572 + }, + { + "epoch": 0.509276747028108, + "grad_norm": 1.2720234394073486, + "learning_rate": 0.0001, + "loss": 1.0537, + "step": 573 + }, + { + "epoch": 0.5101655371625375, + "grad_norm": 0.4332892894744873, + "learning_rate": 0.0001, + "loss": 0.9529, + "step": 574 + }, + { + "epoch": 0.511054327296967, + "grad_norm": 0.42090901732444763, + "learning_rate": 0.0001, + "loss": 0.8948, + "step": 575 + }, + { + "epoch": 0.5119431174313965, + "grad_norm": 0.4180974066257477, + "learning_rate": 0.0001, + "loss": 0.9472, + "step": 576 + }, + { + "epoch": 0.512831907565826, + "grad_norm": 0.44674041867256165, + "learning_rate": 0.0001, + "loss": 1.0032, + "step": 577 + }, + { + "epoch": 0.5137206977002555, + "grad_norm": 0.38132181763648987, + "learning_rate": 0.0001, + "loss": 0.9722, + "step": 578 + }, + { + "epoch": 0.5146094878346851, + "grad_norm": 0.425112247467041, + "learning_rate": 0.0001, + "loss": 1.0104, + "step": 579 + }, + { + "epoch": 0.5154982779691145, + "grad_norm": 0.4420212507247925, + "learning_rate": 0.0001, + "loss": 0.8951, + "step": 580 + }, + { + "epoch": 0.5163870681035441, + "grad_norm": 0.4047195613384247, + "learning_rate": 0.0001, + "loss": 0.9103, + "step": 581 + }, + { + "epoch": 0.5172758582379735, + "grad_norm": 0.4206266701221466, + "learning_rate": 0.0001, + "loss": 0.9872, + "step": 582 + }, + { + "epoch": 0.5181646483724031, + "grad_norm": 0.46447429060935974, + "learning_rate": 0.0001, + "loss": 0.9636, + "step": 583 + }, + { + "epoch": 0.5190534385068326, + "grad_norm": 0.467122882604599, + "learning_rate": 0.0001, + "loss": 0.9556, + "step": 584 + }, + { + "epoch": 0.519942228641262, + "grad_norm": 0.4438915252685547, + "learning_rate": 0.0001, + "loss": 0.9775, + "step": 585 + }, + { + "epoch": 0.5208310187756916, + "grad_norm": 0.4225422441959381, + "learning_rate": 0.0001, + "loss": 0.9315, + "step": 586 + }, + { + "epoch": 0.5217198089101212, + "grad_norm": 0.3874359726905823, + "learning_rate": 0.0001, + "loss": 0.9652, + "step": 587 + }, + { + "epoch": 0.5226085990445506, + "grad_norm": 0.4789721369743347, + "learning_rate": 0.0001, + "loss": 0.9336, + "step": 588 + }, + { + "epoch": 0.5234973891789801, + "grad_norm": 0.4012058675289154, + "learning_rate": 0.0001, + "loss": 0.9792, + "step": 589 + }, + { + "epoch": 0.5243861793134096, + "grad_norm": 0.4494520425796509, + "learning_rate": 0.0001, + "loss": 0.9143, + "step": 590 + }, + { + "epoch": 0.5252749694478391, + "grad_norm": 0.44614073634147644, + "learning_rate": 0.0001, + "loss": 0.9286, + "step": 591 + }, + { + "epoch": 0.5261637595822687, + "grad_norm": 0.4147653579711914, + "learning_rate": 0.0001, + "loss": 0.8982, + "step": 592 + }, + { + "epoch": 0.5270525497166981, + "grad_norm": 0.4356852173805237, + "learning_rate": 0.0001, + "loss": 0.9263, + "step": 593 + }, + { + "epoch": 0.5279413398511277, + "grad_norm": 0.4420524537563324, + "learning_rate": 0.0001, + "loss": 0.969, + "step": 594 + }, + { + "epoch": 0.5288301299855571, + "grad_norm": 0.4878003001213074, + "learning_rate": 0.0001, + "loss": 1.0299, + "step": 595 + }, + { + "epoch": 0.5297189201199867, + "grad_norm": 0.4248897433280945, + "learning_rate": 0.0001, + "loss": 1.0166, + "step": 596 + }, + { + "epoch": 0.5306077102544162, + "grad_norm": 0.3686140179634094, + "learning_rate": 0.0001, + "loss": 0.9452, + "step": 597 + }, + { + "epoch": 0.5314965003888457, + "grad_norm": 0.3767171800136566, + "learning_rate": 0.0001, + "loss": 1.0059, + "step": 598 + }, + { + "epoch": 0.5323852905232752, + "grad_norm": 0.41753292083740234, + "learning_rate": 0.0001, + "loss": 0.9117, + "step": 599 + }, + { + "epoch": 0.5332740806577047, + "grad_norm": 0.45836883783340454, + "learning_rate": 0.0001, + "loss": 1.0302, + "step": 600 + }, + { + "epoch": 0.5341628707921342, + "grad_norm": 0.40416955947875977, + "learning_rate": 0.0001, + "loss": 0.9387, + "step": 601 + }, + { + "epoch": 0.5350516609265638, + "grad_norm": 0.4028719365596771, + "learning_rate": 0.0001, + "loss": 1.0046, + "step": 602 + }, + { + "epoch": 0.5359404510609932, + "grad_norm": 0.42004886269569397, + "learning_rate": 0.0001, + "loss": 1.0475, + "step": 603 + }, + { + "epoch": 0.5368292411954227, + "grad_norm": 0.44581007957458496, + "learning_rate": 0.0001, + "loss": 0.8798, + "step": 604 + }, + { + "epoch": 0.5377180313298522, + "grad_norm": 0.4007730782032013, + "learning_rate": 0.0001, + "loss": 0.9578, + "step": 605 + }, + { + "epoch": 0.5386068214642817, + "grad_norm": 0.45472684502601624, + "learning_rate": 0.0001, + "loss": 0.9609, + "step": 606 + }, + { + "epoch": 0.5394956115987113, + "grad_norm": 0.45508822798728943, + "learning_rate": 0.0001, + "loss": 0.9699, + "step": 607 + }, + { + "epoch": 0.5403844017331407, + "grad_norm": 0.40522894263267517, + "learning_rate": 0.0001, + "loss": 0.9791, + "step": 608 + }, + { + "epoch": 0.5412731918675703, + "grad_norm": 0.42480531334877014, + "learning_rate": 0.0001, + "loss": 1.041, + "step": 609 + }, + { + "epoch": 0.5421619820019998, + "grad_norm": 0.3351707458496094, + "learning_rate": 0.0001, + "loss": 1.0673, + "step": 610 + }, + { + "epoch": 0.5430507721364293, + "grad_norm": 0.5073234438896179, + "learning_rate": 0.0001, + "loss": 0.9928, + "step": 611 + }, + { + "epoch": 0.5439395622708588, + "grad_norm": 0.4208507239818573, + "learning_rate": 0.0001, + "loss": 0.9584, + "step": 612 + }, + { + "epoch": 0.5448283524052883, + "grad_norm": 0.4468097686767578, + "learning_rate": 0.0001, + "loss": 0.9225, + "step": 613 + }, + { + "epoch": 0.5457171425397178, + "grad_norm": 0.44044622778892517, + "learning_rate": 0.0001, + "loss": 0.9906, + "step": 614 + }, + { + "epoch": 0.5466059326741474, + "grad_norm": 0.5258206725120544, + "learning_rate": 0.0001, + "loss": 0.9886, + "step": 615 + }, + { + "epoch": 0.5474947228085768, + "grad_norm": 0.4392997622489929, + "learning_rate": 0.0001, + "loss": 1.0121, + "step": 616 + }, + { + "epoch": 0.5483835129430064, + "grad_norm": 0.4318770170211792, + "learning_rate": 0.0001, + "loss": 1.0073, + "step": 617 + }, + { + "epoch": 0.5492723030774358, + "grad_norm": 0.4536917805671692, + "learning_rate": 0.0001, + "loss": 0.9971, + "step": 618 + }, + { + "epoch": 0.5501610932118653, + "grad_norm": 0.39293742179870605, + "learning_rate": 0.0001, + "loss": 0.9785, + "step": 619 + }, + { + "epoch": 0.5510498833462949, + "grad_norm": 0.4010220766067505, + "learning_rate": 0.0001, + "loss": 0.9656, + "step": 620 + }, + { + "epoch": 0.5519386734807243, + "grad_norm": 0.43340232968330383, + "learning_rate": 0.0001, + "loss": 0.8983, + "step": 621 + }, + { + "epoch": 0.5528274636151539, + "grad_norm": 0.3749978840351105, + "learning_rate": 0.0001, + "loss": 0.8759, + "step": 622 + }, + { + "epoch": 0.5537162537495833, + "grad_norm": 0.47160354256629944, + "learning_rate": 0.0001, + "loss": 1.0252, + "step": 623 + }, + { + "epoch": 0.5546050438840129, + "grad_norm": 0.3324509859085083, + "learning_rate": 0.0001, + "loss": 0.9065, + "step": 624 + }, + { + "epoch": 0.5554938340184424, + "grad_norm": 0.3825232982635498, + "learning_rate": 0.0001, + "loss": 0.9536, + "step": 625 + }, + { + "epoch": 0.5563826241528719, + "grad_norm": 0.4211728870868683, + "learning_rate": 0.0001, + "loss": 0.8948, + "step": 626 + }, + { + "epoch": 0.5572714142873014, + "grad_norm": 0.40888741612434387, + "learning_rate": 0.0001, + "loss": 1.0129, + "step": 627 + }, + { + "epoch": 0.5581602044217309, + "grad_norm": 0.3999336063861847, + "learning_rate": 0.0001, + "loss": 0.9486, + "step": 628 + }, + { + "epoch": 0.5590489945561604, + "grad_norm": 0.3953676223754883, + "learning_rate": 0.0001, + "loss": 0.8835, + "step": 629 + }, + { + "epoch": 0.55993778469059, + "grad_norm": 3.312321901321411, + "learning_rate": 0.0001, + "loss": 1.0447, + "step": 630 + }, + { + "epoch": 0.5608265748250194, + "grad_norm": 0.6516892313957214, + "learning_rate": 0.0001, + "loss": 0.8964, + "step": 631 + }, + { + "epoch": 0.561715364959449, + "grad_norm": 0.43332597613334656, + "learning_rate": 0.0001, + "loss": 0.9386, + "step": 632 + }, + { + "epoch": 0.5626041550938785, + "grad_norm": 0.36072951555252075, + "learning_rate": 0.0001, + "loss": 0.9489, + "step": 633 + }, + { + "epoch": 0.563492945228308, + "grad_norm": 0.4039455056190491, + "learning_rate": 0.0001, + "loss": 0.9512, + "step": 634 + }, + { + "epoch": 0.5643817353627375, + "grad_norm": 0.40944796800613403, + "learning_rate": 0.0001, + "loss": 1.0909, + "step": 635 + }, + { + "epoch": 0.5652705254971669, + "grad_norm": 0.4490579068660736, + "learning_rate": 0.0001, + "loss": 0.955, + "step": 636 + }, + { + "epoch": 0.5661593156315965, + "grad_norm": 0.4226183593273163, + "learning_rate": 0.0001, + "loss": 0.9833, + "step": 637 + }, + { + "epoch": 0.567048105766026, + "grad_norm": 0.4099627733230591, + "learning_rate": 0.0001, + "loss": 0.9909, + "step": 638 + }, + { + "epoch": 0.5679368959004555, + "grad_norm": 0.39125704765319824, + "learning_rate": 0.0001, + "loss": 0.9875, + "step": 639 + }, + { + "epoch": 0.568825686034885, + "grad_norm": 0.3506604731082916, + "learning_rate": 0.0001, + "loss": 0.8747, + "step": 640 + }, + { + "epoch": 0.5697144761693145, + "grad_norm": 0.6626682281494141, + "learning_rate": 0.0001, + "loss": 1.0328, + "step": 641 + }, + { + "epoch": 0.570603266303744, + "grad_norm": 0.3994938135147095, + "learning_rate": 0.0001, + "loss": 0.9255, + "step": 642 + }, + { + "epoch": 0.5714920564381736, + "grad_norm": 0.4206639528274536, + "learning_rate": 0.0001, + "loss": 0.9222, + "step": 643 + }, + { + "epoch": 0.572380846572603, + "grad_norm": 0.4152818024158478, + "learning_rate": 0.0001, + "loss": 0.9515, + "step": 644 + }, + { + "epoch": 0.5732696367070326, + "grad_norm": 0.3834103047847748, + "learning_rate": 0.0001, + "loss": 0.9008, + "step": 645 + }, + { + "epoch": 0.574158426841462, + "grad_norm": 0.3906621038913727, + "learning_rate": 0.0001, + "loss": 1.0196, + "step": 646 + }, + { + "epoch": 0.5750472169758916, + "grad_norm": 0.4065912067890167, + "learning_rate": 0.0001, + "loss": 0.9741, + "step": 647 + }, + { + "epoch": 0.5759360071103211, + "grad_norm": 0.3870736360549927, + "learning_rate": 0.0001, + "loss": 0.9403, + "step": 648 + }, + { + "epoch": 0.5768247972447506, + "grad_norm": 0.40144529938697815, + "learning_rate": 0.0001, + "loss": 0.9931, + "step": 649 + }, + { + "epoch": 0.5777135873791801, + "grad_norm": 0.40022167563438416, + "learning_rate": 0.0001, + "loss": 0.9741, + "step": 650 + }, + { + "epoch": 0.5786023775136095, + "grad_norm": 0.4081610143184662, + "learning_rate": 0.0001, + "loss": 0.9509, + "step": 651 + }, + { + "epoch": 0.5794911676480391, + "grad_norm": 0.3786165714263916, + "learning_rate": 0.0001, + "loss": 0.9435, + "step": 652 + }, + { + "epoch": 0.5803799577824686, + "grad_norm": 0.3807113468647003, + "learning_rate": 0.0001, + "loss": 0.9534, + "step": 653 + }, + { + "epoch": 0.5812687479168981, + "grad_norm": 0.4126400053501129, + "learning_rate": 0.0001, + "loss": 0.8815, + "step": 654 + }, + { + "epoch": 0.5821575380513276, + "grad_norm": 0.46695202589035034, + "learning_rate": 0.0001, + "loss": 1.0453, + "step": 655 + }, + { + "epoch": 0.5830463281857572, + "grad_norm": 0.43947726488113403, + "learning_rate": 0.0001, + "loss": 0.9558, + "step": 656 + }, + { + "epoch": 0.5839351183201866, + "grad_norm": 0.4180644154548645, + "learning_rate": 0.0001, + "loss": 0.9144, + "step": 657 + }, + { + "epoch": 0.5848239084546162, + "grad_norm": 0.48162519931793213, + "learning_rate": 0.0001, + "loss": 1.0027, + "step": 658 + }, + { + "epoch": 0.5857126985890456, + "grad_norm": 0.42843928933143616, + "learning_rate": 0.0001, + "loss": 0.9259, + "step": 659 + }, + { + "epoch": 0.5866014887234752, + "grad_norm": 0.42587754130363464, + "learning_rate": 0.0001, + "loss": 0.9438, + "step": 660 + }, + { + "epoch": 0.5874902788579047, + "grad_norm": 0.42820873856544495, + "learning_rate": 0.0001, + "loss": 0.9788, + "step": 661 + }, + { + "epoch": 0.5883790689923342, + "grad_norm": 0.44260185956954956, + "learning_rate": 0.0001, + "loss": 0.951, + "step": 662 + }, + { + "epoch": 0.5892678591267637, + "grad_norm": 0.38898608088493347, + "learning_rate": 0.0001, + "loss": 0.9226, + "step": 663 + }, + { + "epoch": 0.5901566492611932, + "grad_norm": 0.4538962244987488, + "learning_rate": 0.0001, + "loss": 0.9951, + "step": 664 + }, + { + "epoch": 0.5910454393956227, + "grad_norm": 0.4434801936149597, + "learning_rate": 0.0001, + "loss": 0.9968, + "step": 665 + }, + { + "epoch": 0.5919342295300523, + "grad_norm": 2.7702479362487793, + "learning_rate": 0.0001, + "loss": 0.9657, + "step": 666 + }, + { + "epoch": 0.5928230196644817, + "grad_norm": 0.4107935130596161, + "learning_rate": 0.0001, + "loss": 0.8901, + "step": 667 + }, + { + "epoch": 0.5937118097989112, + "grad_norm": 5.212909698486328, + "learning_rate": 0.0001, + "loss": 0.8686, + "step": 668 + }, + { + "epoch": 0.5946005999333407, + "grad_norm": 0.4474343955516815, + "learning_rate": 0.0001, + "loss": 0.8641, + "step": 669 + }, + { + "epoch": 0.5954893900677702, + "grad_norm": 0.39664575457572937, + "learning_rate": 0.0001, + "loss": 0.9605, + "step": 670 + }, + { + "epoch": 0.5963781802021998, + "grad_norm": 0.38908809423446655, + "learning_rate": 0.0001, + "loss": 0.8851, + "step": 671 + }, + { + "epoch": 0.5972669703366292, + "grad_norm": 0.39720067381858826, + "learning_rate": 0.0001, + "loss": 0.9376, + "step": 672 + }, + { + "epoch": 0.5981557604710588, + "grad_norm": 0.444224089384079, + "learning_rate": 0.0001, + "loss": 0.9879, + "step": 673 + }, + { + "epoch": 0.5990445506054882, + "grad_norm": 0.461745023727417, + "learning_rate": 0.0001, + "loss": 1.0298, + "step": 674 + }, + { + "epoch": 0.5999333407399178, + "grad_norm": 0.38060104846954346, + "learning_rate": 0.0001, + "loss": 0.9133, + "step": 675 + }, + { + "epoch": 0.6008221308743473, + "grad_norm": 0.4152204990386963, + "learning_rate": 0.0001, + "loss": 1.0037, + "step": 676 + }, + { + "epoch": 0.6017109210087768, + "grad_norm": 0.4251076281070709, + "learning_rate": 0.0001, + "loss": 0.9217, + "step": 677 + }, + { + "epoch": 0.6025997111432063, + "grad_norm": 0.4050005376338959, + "learning_rate": 0.0001, + "loss": 1.0411, + "step": 678 + }, + { + "epoch": 0.6034885012776359, + "grad_norm": 0.4802723228931427, + "learning_rate": 0.0001, + "loss": 0.9966, + "step": 679 + }, + { + "epoch": 0.6043772914120653, + "grad_norm": 0.4158213436603546, + "learning_rate": 0.0001, + "loss": 0.9684, + "step": 680 + }, + { + "epoch": 0.6052660815464949, + "grad_norm": 0.4008488059043884, + "learning_rate": 0.0001, + "loss": 0.9684, + "step": 681 + }, + { + "epoch": 0.6061548716809243, + "grad_norm": 0.4021485149860382, + "learning_rate": 0.0001, + "loss": 0.8944, + "step": 682 + }, + { + "epoch": 0.6070436618153539, + "grad_norm": 0.36115503311157227, + "learning_rate": 0.0001, + "loss": 0.9677, + "step": 683 + }, + { + "epoch": 0.6079324519497834, + "grad_norm": 0.3866066336631775, + "learning_rate": 0.0001, + "loss": 0.9938, + "step": 684 + }, + { + "epoch": 0.6088212420842128, + "grad_norm": 0.47491082549095154, + "learning_rate": 0.0001, + "loss": 0.9778, + "step": 685 + }, + { + "epoch": 0.6097100322186424, + "grad_norm": 0.44795575737953186, + "learning_rate": 0.0001, + "loss": 0.9964, + "step": 686 + }, + { + "epoch": 0.6105988223530718, + "grad_norm": 0.48861074447631836, + "learning_rate": 0.0001, + "loss": 0.9276, + "step": 687 + }, + { + "epoch": 0.6114876124875014, + "grad_norm": 0.4077586233615875, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 688 + }, + { + "epoch": 0.6123764026219309, + "grad_norm": 0.3845427930355072, + "learning_rate": 0.0001, + "loss": 0.9704, + "step": 689 + }, + { + "epoch": 0.6132651927563604, + "grad_norm": 0.44110408425331116, + "learning_rate": 0.0001, + "loss": 1.0017, + "step": 690 + }, + { + "epoch": 0.6141539828907899, + "grad_norm": 0.38786497712135315, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 691 + }, + { + "epoch": 0.6150427730252194, + "grad_norm": 0.40381374955177307, + "learning_rate": 0.0001, + "loss": 0.9184, + "step": 692 + }, + { + "epoch": 0.6159315631596489, + "grad_norm": 0.4073936641216278, + "learning_rate": 0.0001, + "loss": 0.948, + "step": 693 + }, + { + "epoch": 0.6168203532940785, + "grad_norm": 0.39875441789627075, + "learning_rate": 0.0001, + "loss": 0.9917, + "step": 694 + }, + { + "epoch": 0.6177091434285079, + "grad_norm": 0.4240407943725586, + "learning_rate": 0.0001, + "loss": 0.9696, + "step": 695 + }, + { + "epoch": 0.6185979335629375, + "grad_norm": 0.4107338488101959, + "learning_rate": 0.0001, + "loss": 1.0108, + "step": 696 + }, + { + "epoch": 0.619486723697367, + "grad_norm": 0.4649637043476105, + "learning_rate": 0.0001, + "loss": 1.0222, + "step": 697 + }, + { + "epoch": 0.6203755138317965, + "grad_norm": 0.398387610912323, + "learning_rate": 0.0001, + "loss": 0.9952, + "step": 698 + }, + { + "epoch": 0.621264303966226, + "grad_norm": 0.4626375734806061, + "learning_rate": 0.0001, + "loss": 1.0333, + "step": 699 + }, + { + "epoch": 0.6221530941006554, + "grad_norm": 0.39919617772102356, + "learning_rate": 0.0001, + "loss": 0.8835, + "step": 700 + }, + { + "epoch": 0.623041884235085, + "grad_norm": 0.3973129391670227, + "learning_rate": 0.0001, + "loss": 1.0366, + "step": 701 + }, + { + "epoch": 0.6239306743695145, + "grad_norm": 0.4347308576107025, + "learning_rate": 0.0001, + "loss": 0.98, + "step": 702 + }, + { + "epoch": 0.624819464503944, + "grad_norm": 0.42371174693107605, + "learning_rate": 0.0001, + "loss": 0.9625, + "step": 703 + }, + { + "epoch": 0.6257082546383735, + "grad_norm": 0.3887942135334015, + "learning_rate": 0.0001, + "loss": 0.9995, + "step": 704 + }, + { + "epoch": 0.626597044772803, + "grad_norm": 0.41830766201019287, + "learning_rate": 0.0001, + "loss": 1.0613, + "step": 705 + }, + { + "epoch": 0.6274858349072325, + "grad_norm": 0.37216201424598694, + "learning_rate": 0.0001, + "loss": 0.9003, + "step": 706 + }, + { + "epoch": 0.6283746250416621, + "grad_norm": 0.3734080493450165, + "learning_rate": 0.0001, + "loss": 1.0583, + "step": 707 + }, + { + "epoch": 0.6292634151760915, + "grad_norm": 0.4222297966480255, + "learning_rate": 0.0001, + "loss": 0.8877, + "step": 708 + }, + { + "epoch": 0.6301522053105211, + "grad_norm": 0.4466676712036133, + "learning_rate": 0.0001, + "loss": 0.9868, + "step": 709 + }, + { + "epoch": 0.6310409954449505, + "grad_norm": 0.4170881509780884, + "learning_rate": 0.0001, + "loss": 0.9235, + "step": 710 + }, + { + "epoch": 0.6319297855793801, + "grad_norm": 0.42569923400878906, + "learning_rate": 0.0001, + "loss": 1.0111, + "step": 711 + }, + { + "epoch": 0.6328185757138096, + "grad_norm": 0.469657838344574, + "learning_rate": 0.0001, + "loss": 1.1188, + "step": 712 + }, + { + "epoch": 0.6337073658482391, + "grad_norm": 0.4612179696559906, + "learning_rate": 0.0001, + "loss": 1.0049, + "step": 713 + }, + { + "epoch": 0.6345961559826686, + "grad_norm": 0.4614596664905548, + "learning_rate": 0.0001, + "loss": 0.9497, + "step": 714 + }, + { + "epoch": 0.635484946117098, + "grad_norm": 0.4139200747013092, + "learning_rate": 0.0001, + "loss": 0.9691, + "step": 715 + }, + { + "epoch": 0.6363737362515276, + "grad_norm": 0.42002081871032715, + "learning_rate": 0.0001, + "loss": 0.9838, + "step": 716 + }, + { + "epoch": 0.6372625263859572, + "grad_norm": 0.4355138838291168, + "learning_rate": 0.0001, + "loss": 1.0074, + "step": 717 + }, + { + "epoch": 0.6381513165203866, + "grad_norm": 0.42147886753082275, + "learning_rate": 0.0001, + "loss": 0.9929, + "step": 718 + }, + { + "epoch": 0.6390401066548161, + "grad_norm": 0.4365898370742798, + "learning_rate": 0.0001, + "loss": 1.0325, + "step": 719 + }, + { + "epoch": 0.6399288967892457, + "grad_norm": 0.4107733964920044, + "learning_rate": 0.0001, + "loss": 0.9229, + "step": 720 + }, + { + "epoch": 0.6408176869236751, + "grad_norm": 0.4189467430114746, + "learning_rate": 0.0001, + "loss": 1.0041, + "step": 721 + }, + { + "epoch": 0.6417064770581047, + "grad_norm": 0.4112345278263092, + "learning_rate": 0.0001, + "loss": 1.017, + "step": 722 + }, + { + "epoch": 0.6425952671925341, + "grad_norm": 0.39325135946273804, + "learning_rate": 0.0001, + "loss": 0.9264, + "step": 723 + }, + { + "epoch": 0.6434840573269637, + "grad_norm": 0.38210329413414, + "learning_rate": 0.0001, + "loss": 0.9947, + "step": 724 + }, + { + "epoch": 0.6443728474613932, + "grad_norm": 0.4400337040424347, + "learning_rate": 0.0001, + "loss": 0.9118, + "step": 725 + }, + { + "epoch": 0.6452616375958227, + "grad_norm": 0.41924870014190674, + "learning_rate": 0.0001, + "loss": 1.0176, + "step": 726 + }, + { + "epoch": 0.6461504277302522, + "grad_norm": 0.37929868698120117, + "learning_rate": 0.0001, + "loss": 0.906, + "step": 727 + }, + { + "epoch": 0.6470392178646817, + "grad_norm": 0.41478589177131653, + "learning_rate": 0.0001, + "loss": 0.9984, + "step": 728 + }, + { + "epoch": 0.6479280079991112, + "grad_norm": 0.3930301368236542, + "learning_rate": 0.0001, + "loss": 0.9251, + "step": 729 + }, + { + "epoch": 0.6488167981335408, + "grad_norm": 0.3816480338573456, + "learning_rate": 0.0001, + "loss": 0.9501, + "step": 730 + }, + { + "epoch": 0.6497055882679702, + "grad_norm": 0.40965738892555237, + "learning_rate": 0.0001, + "loss": 0.8565, + "step": 731 + }, + { + "epoch": 0.6505943784023998, + "grad_norm": 0.42315876483917236, + "learning_rate": 0.0001, + "loss": 1.0507, + "step": 732 + }, + { + "epoch": 0.6514831685368292, + "grad_norm": 0.3918554484844208, + "learning_rate": 0.0001, + "loss": 0.9513, + "step": 733 + }, + { + "epoch": 0.6523719586712587, + "grad_norm": 0.42279428243637085, + "learning_rate": 0.0001, + "loss": 0.9699, + "step": 734 + }, + { + "epoch": 0.6532607488056883, + "grad_norm": 0.41292804479599, + "learning_rate": 0.0001, + "loss": 0.9509, + "step": 735 + }, + { + "epoch": 0.6541495389401177, + "grad_norm": 0.3694443106651306, + "learning_rate": 0.0001, + "loss": 0.8812, + "step": 736 + }, + { + "epoch": 0.6550383290745473, + "grad_norm": 0.42552173137664795, + "learning_rate": 0.0001, + "loss": 0.9628, + "step": 737 + }, + { + "epoch": 0.6559271192089767, + "grad_norm": 0.4035997986793518, + "learning_rate": 0.0001, + "loss": 0.9955, + "step": 738 + }, + { + "epoch": 0.6568159093434063, + "grad_norm": 0.38716575503349304, + "learning_rate": 0.0001, + "loss": 0.9339, + "step": 739 + }, + { + "epoch": 0.6577046994778358, + "grad_norm": 0.37816059589385986, + "learning_rate": 0.0001, + "loss": 1.0141, + "step": 740 + }, + { + "epoch": 0.6585934896122653, + "grad_norm": 0.39885637164115906, + "learning_rate": 0.0001, + "loss": 0.9982, + "step": 741 + }, + { + "epoch": 0.6594822797466948, + "grad_norm": 0.41604557633399963, + "learning_rate": 0.0001, + "loss": 0.9505, + "step": 742 + }, + { + "epoch": 0.6603710698811244, + "grad_norm": 0.3828152120113373, + "learning_rate": 0.0001, + "loss": 0.9055, + "step": 743 + }, + { + "epoch": 0.6612598600155538, + "grad_norm": 0.39052262902259827, + "learning_rate": 0.0001, + "loss": 0.9922, + "step": 744 + }, + { + "epoch": 0.6621486501499834, + "grad_norm": 0.36966073513031006, + "learning_rate": 0.0001, + "loss": 1.0097, + "step": 745 + }, + { + "epoch": 0.6630374402844128, + "grad_norm": 0.40489739179611206, + "learning_rate": 0.0001, + "loss": 0.9734, + "step": 746 + }, + { + "epoch": 0.6639262304188424, + "grad_norm": 0.4752523899078369, + "learning_rate": 0.0001, + "loss": 1.0276, + "step": 747 + }, + { + "epoch": 0.6648150205532719, + "grad_norm": 0.42752546072006226, + "learning_rate": 0.0001, + "loss": 0.9565, + "step": 748 + }, + { + "epoch": 0.6657038106877013, + "grad_norm": 0.3752210736274719, + "learning_rate": 0.0001, + "loss": 0.9709, + "step": 749 + }, + { + "epoch": 0.6665926008221309, + "grad_norm": 0.4873553514480591, + "learning_rate": 0.0001, + "loss": 1.0433, + "step": 750 + }, + { + "epoch": 0.6674813909565603, + "grad_norm": 0.4026888906955719, + "learning_rate": 0.0001, + "loss": 1.0882, + "step": 751 + }, + { + "epoch": 0.6683701810909899, + "grad_norm": 0.42212575674057007, + "learning_rate": 0.0001, + "loss": 0.9615, + "step": 752 + }, + { + "epoch": 0.6692589712254194, + "grad_norm": 0.40002021193504333, + "learning_rate": 0.0001, + "loss": 1.0106, + "step": 753 + }, + { + "epoch": 0.6701477613598489, + "grad_norm": 0.4148339331150055, + "learning_rate": 0.0001, + "loss": 0.9292, + "step": 754 + }, + { + "epoch": 0.6710365514942784, + "grad_norm": 0.37192678451538086, + "learning_rate": 0.0001, + "loss": 0.9844, + "step": 755 + }, + { + "epoch": 0.6719253416287079, + "grad_norm": 0.3779515326023102, + "learning_rate": 0.0001, + "loss": 0.8999, + "step": 756 + }, + { + "epoch": 0.6728141317631374, + "grad_norm": 0.38838016986846924, + "learning_rate": 0.0001, + "loss": 0.9855, + "step": 757 + }, + { + "epoch": 0.673702921897567, + "grad_norm": 0.3978157937526703, + "learning_rate": 0.0001, + "loss": 0.9253, + "step": 758 + }, + { + "epoch": 0.6745917120319964, + "grad_norm": 0.38334494829177856, + "learning_rate": 0.0001, + "loss": 0.8678, + "step": 759 + }, + { + "epoch": 0.675480502166426, + "grad_norm": 0.3856591582298279, + "learning_rate": 0.0001, + "loss": 0.9614, + "step": 760 + }, + { + "epoch": 0.6763692923008554, + "grad_norm": 0.6821391582489014, + "learning_rate": 0.0001, + "loss": 0.9884, + "step": 761 + }, + { + "epoch": 0.677258082435285, + "grad_norm": 0.4022596478462219, + "learning_rate": 0.0001, + "loss": 0.9512, + "step": 762 + }, + { + "epoch": 0.6781468725697145, + "grad_norm": 0.3981231153011322, + "learning_rate": 0.0001, + "loss": 0.9457, + "step": 763 + }, + { + "epoch": 0.679035662704144, + "grad_norm": 0.466782808303833, + "learning_rate": 0.0001, + "loss": 1.0009, + "step": 764 + }, + { + "epoch": 0.6799244528385735, + "grad_norm": 0.39416226744651794, + "learning_rate": 0.0001, + "loss": 0.9204, + "step": 765 + }, + { + "epoch": 0.680813242973003, + "grad_norm": 0.3830525279045105, + "learning_rate": 0.0001, + "loss": 0.9206, + "step": 766 + }, + { + "epoch": 0.6817020331074325, + "grad_norm": 0.3653806447982788, + "learning_rate": 0.0001, + "loss": 0.9601, + "step": 767 + }, + { + "epoch": 0.682590823241862, + "grad_norm": 0.4308638274669647, + "learning_rate": 0.0001, + "loss": 0.9666, + "step": 768 + }, + { + "epoch": 0.6834796133762915, + "grad_norm": 0.3821423649787903, + "learning_rate": 0.0001, + "loss": 1.0252, + "step": 769 + }, + { + "epoch": 0.684368403510721, + "grad_norm": 0.459150493144989, + "learning_rate": 0.0001, + "loss": 0.8923, + "step": 770 + }, + { + "epoch": 0.6852571936451506, + "grad_norm": 0.4109600782394409, + "learning_rate": 0.0001, + "loss": 0.9337, + "step": 771 + }, + { + "epoch": 0.68614598377958, + "grad_norm": 0.4196016788482666, + "learning_rate": 0.0001, + "loss": 1.0004, + "step": 772 + }, + { + "epoch": 0.6870347739140096, + "grad_norm": 0.39674344658851624, + "learning_rate": 0.0001, + "loss": 0.9676, + "step": 773 + }, + { + "epoch": 0.687923564048439, + "grad_norm": 0.3917883634567261, + "learning_rate": 0.0001, + "loss": 0.9255, + "step": 774 + }, + { + "epoch": 0.6888123541828686, + "grad_norm": 0.4198206663131714, + "learning_rate": 0.0001, + "loss": 0.9159, + "step": 775 + }, + { + "epoch": 0.6897011443172981, + "grad_norm": 0.4153868556022644, + "learning_rate": 0.0001, + "loss": 1.0006, + "step": 776 + }, + { + "epoch": 0.6905899344517276, + "grad_norm": 0.425462007522583, + "learning_rate": 0.0001, + "loss": 0.843, + "step": 777 + }, + { + "epoch": 0.6914787245861571, + "grad_norm": 0.40152454376220703, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 778 + }, + { + "epoch": 0.6923675147205866, + "grad_norm": 0.39553624391555786, + "learning_rate": 0.0001, + "loss": 0.8722, + "step": 779 + }, + { + "epoch": 0.6932563048550161, + "grad_norm": 0.4519752860069275, + "learning_rate": 0.0001, + "loss": 1.0222, + "step": 780 + }, + { + "epoch": 0.6941450949894457, + "grad_norm": 0.3791448473930359, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 781 + }, + { + "epoch": 0.6950338851238751, + "grad_norm": 0.3689649701118469, + "learning_rate": 0.0001, + "loss": 0.973, + "step": 782 + }, + { + "epoch": 0.6959226752583046, + "grad_norm": 0.36836689710617065, + "learning_rate": 0.0001, + "loss": 0.924, + "step": 783 + }, + { + "epoch": 0.6968114653927341, + "grad_norm": 0.4386933147907257, + "learning_rate": 0.0001, + "loss": 1.0039, + "step": 784 + }, + { + "epoch": 0.6977002555271636, + "grad_norm": 0.39001014828681946, + "learning_rate": 0.0001, + "loss": 0.9655, + "step": 785 + }, + { + "epoch": 0.6985890456615932, + "grad_norm": 0.36727163195610046, + "learning_rate": 0.0001, + "loss": 0.9681, + "step": 786 + }, + { + "epoch": 0.6994778357960226, + "grad_norm": 0.3847578465938568, + "learning_rate": 0.0001, + "loss": 0.9809, + "step": 787 + }, + { + "epoch": 0.7003666259304522, + "grad_norm": 0.3912128806114197, + "learning_rate": 0.0001, + "loss": 0.9828, + "step": 788 + }, + { + "epoch": 0.7012554160648817, + "grad_norm": 0.3891061842441559, + "learning_rate": 0.0001, + "loss": 0.9816, + "step": 789 + }, + { + "epoch": 0.7021442061993112, + "grad_norm": 0.40884703397750854, + "learning_rate": 0.0001, + "loss": 0.9406, + "step": 790 + }, + { + "epoch": 0.7030329963337407, + "grad_norm": 0.40863409638404846, + "learning_rate": 0.0001, + "loss": 0.9473, + "step": 791 + }, + { + "epoch": 0.7039217864681702, + "grad_norm": 0.4846546947956085, + "learning_rate": 0.0001, + "loss": 1.0337, + "step": 792 + }, + { + "epoch": 0.7048105766025997, + "grad_norm": 0.4108925759792328, + "learning_rate": 0.0001, + "loss": 0.8871, + "step": 793 + }, + { + "epoch": 0.7056993667370293, + "grad_norm": 0.41259604692459106, + "learning_rate": 0.0001, + "loss": 0.9616, + "step": 794 + }, + { + "epoch": 0.7065881568714587, + "grad_norm": 0.39563241600990295, + "learning_rate": 0.0001, + "loss": 0.917, + "step": 795 + }, + { + "epoch": 0.7074769470058883, + "grad_norm": 0.44878554344177246, + "learning_rate": 0.0001, + "loss": 0.9133, + "step": 796 + }, + { + "epoch": 0.7083657371403177, + "grad_norm": 0.4403840899467468, + "learning_rate": 0.0001, + "loss": 0.961, + "step": 797 + }, + { + "epoch": 0.7092545272747472, + "grad_norm": 0.47209668159484863, + "learning_rate": 0.0001, + "loss": 0.9692, + "step": 798 + }, + { + "epoch": 0.7101433174091768, + "grad_norm": 0.3841542601585388, + "learning_rate": 0.0001, + "loss": 0.9458, + "step": 799 + }, + { + "epoch": 0.7110321075436062, + "grad_norm": 0.46148043870925903, + "learning_rate": 0.0001, + "loss": 0.8355, + "step": 800 + }, + { + "epoch": 0.7119208976780358, + "grad_norm": 0.6854037046432495, + "learning_rate": 0.0001, + "loss": 0.9275, + "step": 801 + }, + { + "epoch": 0.7128096878124652, + "grad_norm": 0.38869839906692505, + "learning_rate": 0.0001, + "loss": 0.9882, + "step": 802 + }, + { + "epoch": 0.7136984779468948, + "grad_norm": 0.4120323956012726, + "learning_rate": 0.0001, + "loss": 0.87, + "step": 803 + }, + { + "epoch": 0.7145872680813243, + "grad_norm": 0.43215325474739075, + "learning_rate": 0.0001, + "loss": 0.9797, + "step": 804 + }, + { + "epoch": 0.7154760582157538, + "grad_norm": 0.4285464882850647, + "learning_rate": 0.0001, + "loss": 0.9303, + "step": 805 + }, + { + "epoch": 0.7163648483501833, + "grad_norm": 0.3982923626899719, + "learning_rate": 0.0001, + "loss": 0.9645, + "step": 806 + }, + { + "epoch": 0.7172536384846128, + "grad_norm": 0.3845166563987732, + "learning_rate": 0.0001, + "loss": 0.8746, + "step": 807 + }, + { + "epoch": 0.7181424286190423, + "grad_norm": 0.34493544697761536, + "learning_rate": 0.0001, + "loss": 0.8451, + "step": 808 + }, + { + "epoch": 0.7190312187534719, + "grad_norm": 0.4371100962162018, + "learning_rate": 0.0001, + "loss": 1.0098, + "step": 809 + }, + { + "epoch": 0.7199200088879013, + "grad_norm": 0.4431426525115967, + "learning_rate": 0.0001, + "loss": 0.9671, + "step": 810 + }, + { + "epoch": 0.7208087990223309, + "grad_norm": 0.39591118693351746, + "learning_rate": 0.0001, + "loss": 0.9669, + "step": 811 + }, + { + "epoch": 0.7216975891567604, + "grad_norm": 0.4144839942455292, + "learning_rate": 0.0001, + "loss": 1.0489, + "step": 812 + }, + { + "epoch": 0.7225863792911899, + "grad_norm": 0.36368703842163086, + "learning_rate": 0.0001, + "loss": 0.9269, + "step": 813 + }, + { + "epoch": 0.7234751694256194, + "grad_norm": 0.34978145360946655, + "learning_rate": 0.0001, + "loss": 0.925, + "step": 814 + }, + { + "epoch": 0.7243639595600488, + "grad_norm": 0.38028684258461, + "learning_rate": 0.0001, + "loss": 1.0324, + "step": 815 + }, + { + "epoch": 0.7252527496944784, + "grad_norm": 0.3885200023651123, + "learning_rate": 0.0001, + "loss": 0.8994, + "step": 816 + }, + { + "epoch": 0.726141539828908, + "grad_norm": 0.3828847110271454, + "learning_rate": 0.0001, + "loss": 0.977, + "step": 817 + }, + { + "epoch": 0.7270303299633374, + "grad_norm": 0.41237783432006836, + "learning_rate": 0.0001, + "loss": 0.9004, + "step": 818 + }, + { + "epoch": 0.7279191200977669, + "grad_norm": 0.3976728916168213, + "learning_rate": 0.0001, + "loss": 0.8497, + "step": 819 + }, + { + "epoch": 0.7288079102321964, + "grad_norm": 0.3727687895298004, + "learning_rate": 0.0001, + "loss": 0.9518, + "step": 820 + }, + { + "epoch": 0.7296967003666259, + "grad_norm": 0.36306801438331604, + "learning_rate": 0.0001, + "loss": 0.9451, + "step": 821 + }, + { + "epoch": 0.7305854905010555, + "grad_norm": 0.38123902678489685, + "learning_rate": 0.0001, + "loss": 0.9649, + "step": 822 + }, + { + "epoch": 0.7314742806354849, + "grad_norm": 0.4005994200706482, + "learning_rate": 0.0001, + "loss": 0.9271, + "step": 823 + }, + { + "epoch": 0.7323630707699145, + "grad_norm": 0.36427778005599976, + "learning_rate": 0.0001, + "loss": 0.8705, + "step": 824 + }, + { + "epoch": 0.7332518609043439, + "grad_norm": 0.39238572120666504, + "learning_rate": 0.0001, + "loss": 1.0382, + "step": 825 + }, + { + "epoch": 0.7341406510387735, + "grad_norm": 0.3285076320171356, + "learning_rate": 0.0001, + "loss": 0.8946, + "step": 826 + }, + { + "epoch": 0.735029441173203, + "grad_norm": 0.4472292363643646, + "learning_rate": 0.0001, + "loss": 1.0144, + "step": 827 + }, + { + "epoch": 0.7359182313076325, + "grad_norm": 0.43173158168792725, + "learning_rate": 0.0001, + "loss": 0.918, + "step": 828 + }, + { + "epoch": 0.736807021442062, + "grad_norm": 0.3840146064758301, + "learning_rate": 0.0001, + "loss": 0.825, + "step": 829 + }, + { + "epoch": 0.7376958115764916, + "grad_norm": 0.3785271644592285, + "learning_rate": 0.0001, + "loss": 0.9523, + "step": 830 + }, + { + "epoch": 0.738584601710921, + "grad_norm": 0.4263155460357666, + "learning_rate": 0.0001, + "loss": 0.9, + "step": 831 + }, + { + "epoch": 0.7394733918453505, + "grad_norm": 0.3733021020889282, + "learning_rate": 0.0001, + "loss": 0.9072, + "step": 832 + }, + { + "epoch": 0.74036218197978, + "grad_norm": 0.4950011074542999, + "learning_rate": 0.0001, + "loss": 1.0596, + "step": 833 + }, + { + "epoch": 0.7412509721142095, + "grad_norm": 0.4155547320842743, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 834 + }, + { + "epoch": 0.7421397622486391, + "grad_norm": 0.39016374945640564, + "learning_rate": 0.0001, + "loss": 0.9511, + "step": 835 + }, + { + "epoch": 0.7430285523830685, + "grad_norm": 0.44248321652412415, + "learning_rate": 0.0001, + "loss": 0.9117, + "step": 836 + }, + { + "epoch": 0.7439173425174981, + "grad_norm": 0.4027865529060364, + "learning_rate": 0.0001, + "loss": 0.9217, + "step": 837 + }, + { + "epoch": 0.7448061326519275, + "grad_norm": 0.4021622836589813, + "learning_rate": 0.0001, + "loss": 0.9234, + "step": 838 + }, + { + "epoch": 0.7456949227863571, + "grad_norm": 0.4510715901851654, + "learning_rate": 0.0001, + "loss": 1.0242, + "step": 839 + }, + { + "epoch": 0.7465837129207866, + "grad_norm": 0.38624054193496704, + "learning_rate": 0.0001, + "loss": 0.9285, + "step": 840 + }, + { + "epoch": 0.7474725030552161, + "grad_norm": 0.48192909359931946, + "learning_rate": 0.0001, + "loss": 1.0023, + "step": 841 + }, + { + "epoch": 0.7483612931896456, + "grad_norm": 0.4182127118110657, + "learning_rate": 0.0001, + "loss": 1.04, + "step": 842 + }, + { + "epoch": 0.7492500833240751, + "grad_norm": 0.3804481029510498, + "learning_rate": 0.0001, + "loss": 0.9602, + "step": 843 + }, + { + "epoch": 0.7501388734585046, + "grad_norm": 0.3628024458885193, + "learning_rate": 0.0001, + "loss": 0.955, + "step": 844 + }, + { + "epoch": 0.7510276635929342, + "grad_norm": 0.3579307496547699, + "learning_rate": 0.0001, + "loss": 0.9355, + "step": 845 + }, + { + "epoch": 0.7519164537273636, + "grad_norm": 0.36626923084259033, + "learning_rate": 0.0001, + "loss": 0.9324, + "step": 846 + }, + { + "epoch": 0.7528052438617931, + "grad_norm": 0.37422046065330505, + "learning_rate": 0.0001, + "loss": 1.0267, + "step": 847 + }, + { + "epoch": 0.7536940339962226, + "grad_norm": 0.3965827226638794, + "learning_rate": 0.0001, + "loss": 0.9784, + "step": 848 + }, + { + "epoch": 0.7545828241306521, + "grad_norm": 0.36925482749938965, + "learning_rate": 0.0001, + "loss": 0.9801, + "step": 849 + }, + { + "epoch": 0.7554716142650817, + "grad_norm": 0.372070848941803, + "learning_rate": 0.0001, + "loss": 0.9474, + "step": 850 + }, + { + "epoch": 0.7563604043995111, + "grad_norm": 0.4122565984725952, + "learning_rate": 0.0001, + "loss": 0.9738, + "step": 851 + }, + { + "epoch": 0.7572491945339407, + "grad_norm": 0.36244523525238037, + "learning_rate": 0.0001, + "loss": 0.9174, + "step": 852 + }, + { + "epoch": 0.7581379846683702, + "grad_norm": 0.3738951086997986, + "learning_rate": 0.0001, + "loss": 0.9236, + "step": 853 + }, + { + "epoch": 0.7590267748027997, + "grad_norm": 0.3978452980518341, + "learning_rate": 0.0001, + "loss": 1.0203, + "step": 854 + }, + { + "epoch": 0.7599155649372292, + "grad_norm": 0.38576042652130127, + "learning_rate": 0.0001, + "loss": 0.9305, + "step": 855 + }, + { + "epoch": 0.7608043550716587, + "grad_norm": 0.39469850063323975, + "learning_rate": 0.0001, + "loss": 0.9165, + "step": 856 + }, + { + "epoch": 0.7616931452060882, + "grad_norm": 0.4054928719997406, + "learning_rate": 0.0001, + "loss": 0.8531, + "step": 857 + }, + { + "epoch": 0.7625819353405178, + "grad_norm": 0.42798909544944763, + "learning_rate": 0.0001, + "loss": 0.8747, + "step": 858 + }, + { + "epoch": 0.7634707254749472, + "grad_norm": 0.42001426219940186, + "learning_rate": 0.0001, + "loss": 0.9606, + "step": 859 + }, + { + "epoch": 0.7643595156093768, + "grad_norm": 0.3773418366909027, + "learning_rate": 0.0001, + "loss": 0.9564, + "step": 860 + }, + { + "epoch": 0.7652483057438062, + "grad_norm": 0.3583545982837677, + "learning_rate": 0.0001, + "loss": 0.9112, + "step": 861 + }, + { + "epoch": 0.7661370958782358, + "grad_norm": 0.4381794035434723, + "learning_rate": 0.0001, + "loss": 1.0014, + "step": 862 + }, + { + "epoch": 0.7670258860126653, + "grad_norm": 0.40912652015686035, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 863 + }, + { + "epoch": 0.7679146761470947, + "grad_norm": 0.3959810733795166, + "learning_rate": 0.0001, + "loss": 0.9635, + "step": 864 + }, + { + "epoch": 0.7688034662815243, + "grad_norm": 0.3853726387023926, + "learning_rate": 0.0001, + "loss": 0.9379, + "step": 865 + }, + { + "epoch": 0.7696922564159537, + "grad_norm": 0.4313822388648987, + "learning_rate": 0.0001, + "loss": 1.0387, + "step": 866 + }, + { + "epoch": 0.7705810465503833, + "grad_norm": 0.34767740964889526, + "learning_rate": 0.0001, + "loss": 0.953, + "step": 867 + }, + { + "epoch": 0.7714698366848128, + "grad_norm": 0.414546936750412, + "learning_rate": 0.0001, + "loss": 0.9447, + "step": 868 + }, + { + "epoch": 0.7723586268192423, + "grad_norm": 0.3723791539669037, + "learning_rate": 0.0001, + "loss": 0.9898, + "step": 869 + }, + { + "epoch": 0.7732474169536718, + "grad_norm": 0.3587020933628082, + "learning_rate": 0.0001, + "loss": 0.9793, + "step": 870 + }, + { + "epoch": 0.7741362070881013, + "grad_norm": 0.37663257122039795, + "learning_rate": 0.0001, + "loss": 0.898, + "step": 871 + }, + { + "epoch": 0.7750249972225308, + "grad_norm": 0.4134519398212433, + "learning_rate": 0.0001, + "loss": 0.8667, + "step": 872 + }, + { + "epoch": 0.7759137873569604, + "grad_norm": 0.3675340414047241, + "learning_rate": 0.0001, + "loss": 0.9326, + "step": 873 + }, + { + "epoch": 0.7768025774913898, + "grad_norm": 0.355490118265152, + "learning_rate": 0.0001, + "loss": 1.0111, + "step": 874 + }, + { + "epoch": 0.7776913676258194, + "grad_norm": 0.34475648403167725, + "learning_rate": 0.0001, + "loss": 0.9417, + "step": 875 + }, + { + "epoch": 0.7785801577602489, + "grad_norm": 0.3858400285243988, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 876 + }, + { + "epoch": 0.7794689478946784, + "grad_norm": 0.41093963384628296, + "learning_rate": 0.0001, + "loss": 0.9135, + "step": 877 + }, + { + "epoch": 0.7803577380291079, + "grad_norm": 0.3808945417404175, + "learning_rate": 0.0001, + "loss": 1.0214, + "step": 878 + }, + { + "epoch": 0.7812465281635373, + "grad_norm": 0.3945116400718689, + "learning_rate": 0.0001, + "loss": 0.8784, + "step": 879 + }, + { + "epoch": 0.7821353182979669, + "grad_norm": 0.34549954533576965, + "learning_rate": 0.0001, + "loss": 0.8422, + "step": 880 + }, + { + "epoch": 0.7830241084323964, + "grad_norm": 0.39158105850219727, + "learning_rate": 0.0001, + "loss": 0.8915, + "step": 881 + }, + { + "epoch": 0.7839128985668259, + "grad_norm": 0.41858598589897156, + "learning_rate": 0.0001, + "loss": 1.0106, + "step": 882 + }, + { + "epoch": 0.7848016887012554, + "grad_norm": 0.3741881251335144, + "learning_rate": 0.0001, + "loss": 0.9062, + "step": 883 + }, + { + "epoch": 0.7856904788356849, + "grad_norm": 0.42688676714897156, + "learning_rate": 0.0001, + "loss": 1.0078, + "step": 884 + }, + { + "epoch": 0.7865792689701144, + "grad_norm": 0.4139765799045563, + "learning_rate": 0.0001, + "loss": 0.9226, + "step": 885 + }, + { + "epoch": 0.787468059104544, + "grad_norm": 0.34299251437187195, + "learning_rate": 0.0001, + "loss": 0.8292, + "step": 886 + }, + { + "epoch": 0.7883568492389734, + "grad_norm": 0.37689974904060364, + "learning_rate": 0.0001, + "loss": 0.9543, + "step": 887 + }, + { + "epoch": 0.789245639373403, + "grad_norm": 0.4176059365272522, + "learning_rate": 0.0001, + "loss": 0.9437, + "step": 888 + }, + { + "epoch": 0.7901344295078324, + "grad_norm": 0.3841492235660553, + "learning_rate": 0.0001, + "loss": 1.0038, + "step": 889 + }, + { + "epoch": 0.791023219642262, + "grad_norm": 0.3431592285633087, + "learning_rate": 0.0001, + "loss": 0.9337, + "step": 890 + }, + { + "epoch": 0.7919120097766915, + "grad_norm": 0.4018877148628235, + "learning_rate": 0.0001, + "loss": 0.9502, + "step": 891 + }, + { + "epoch": 0.792800799911121, + "grad_norm": 0.4328785836696625, + "learning_rate": 0.0001, + "loss": 1.0362, + "step": 892 + }, + { + "epoch": 0.7936895900455505, + "grad_norm": 0.41478991508483887, + "learning_rate": 0.0001, + "loss": 0.9226, + "step": 893 + }, + { + "epoch": 0.79457838017998, + "grad_norm": 0.3804970979690552, + "learning_rate": 0.0001, + "loss": 0.9237, + "step": 894 + }, + { + "epoch": 0.7954671703144095, + "grad_norm": 0.41232219338417053, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 895 + }, + { + "epoch": 0.796355960448839, + "grad_norm": 0.3760983645915985, + "learning_rate": 0.0001, + "loss": 0.9839, + "step": 896 + }, + { + "epoch": 0.7972447505832685, + "grad_norm": 0.3917902410030365, + "learning_rate": 0.0001, + "loss": 0.8802, + "step": 897 + }, + { + "epoch": 0.798133540717698, + "grad_norm": 0.3652724623680115, + "learning_rate": 0.0001, + "loss": 0.9058, + "step": 898 + }, + { + "epoch": 0.7990223308521276, + "grad_norm": 0.3827455937862396, + "learning_rate": 0.0001, + "loss": 0.9821, + "step": 899 + }, + { + "epoch": 0.799911120986557, + "grad_norm": 0.4203377664089203, + "learning_rate": 0.0001, + "loss": 1.0054, + "step": 900 + }, + { + "epoch": 0.8007999111209866, + "grad_norm": 0.3544522821903229, + "learning_rate": 0.0001, + "loss": 1.0044, + "step": 901 + }, + { + "epoch": 0.801688701255416, + "grad_norm": 0.39687561988830566, + "learning_rate": 0.0001, + "loss": 0.9896, + "step": 902 + }, + { + "epoch": 0.8025774913898456, + "grad_norm": 0.3697212040424347, + "learning_rate": 0.0001, + "loss": 0.9282, + "step": 903 + }, + { + "epoch": 0.8034662815242751, + "grad_norm": 0.37844914197921753, + "learning_rate": 0.0001, + "loss": 0.9032, + "step": 904 + }, + { + "epoch": 0.8043550716587046, + "grad_norm": 0.36820754408836365, + "learning_rate": 0.0001, + "loss": 0.9314, + "step": 905 + }, + { + "epoch": 0.8052438617931341, + "grad_norm": 0.4033227264881134, + "learning_rate": 0.0001, + "loss": 0.9648, + "step": 906 + }, + { + "epoch": 0.8061326519275636, + "grad_norm": 0.3827173411846161, + "learning_rate": 0.0001, + "loss": 0.8304, + "step": 907 + }, + { + "epoch": 0.8070214420619931, + "grad_norm": 0.3617156147956848, + "learning_rate": 0.0001, + "loss": 0.8478, + "step": 908 + }, + { + "epoch": 0.8079102321964227, + "grad_norm": 0.38975533843040466, + "learning_rate": 0.0001, + "loss": 0.8794, + "step": 909 + }, + { + "epoch": 0.8087990223308521, + "grad_norm": 0.39395904541015625, + "learning_rate": 0.0001, + "loss": 0.8944, + "step": 910 + }, + { + "epoch": 0.8096878124652817, + "grad_norm": 0.31278157234191895, + "learning_rate": 0.0001, + "loss": 0.8736, + "step": 911 + }, + { + "epoch": 0.8105766025997111, + "grad_norm": 0.3955362141132355, + "learning_rate": 0.0001, + "loss": 1.012, + "step": 912 + }, + { + "epoch": 0.8114653927341406, + "grad_norm": 0.41375401616096497, + "learning_rate": 0.0001, + "loss": 0.9767, + "step": 913 + }, + { + "epoch": 0.8123541828685702, + "grad_norm": 0.38847875595092773, + "learning_rate": 0.0001, + "loss": 0.9193, + "step": 914 + }, + { + "epoch": 0.8132429730029996, + "grad_norm": 0.42238739132881165, + "learning_rate": 0.0001, + "loss": 1.0099, + "step": 915 + }, + { + "epoch": 0.8141317631374292, + "grad_norm": 0.3599422872066498, + "learning_rate": 0.0001, + "loss": 0.8548, + "step": 916 + }, + { + "epoch": 0.8150205532718586, + "grad_norm": 0.369069367647171, + "learning_rate": 0.0001, + "loss": 0.9032, + "step": 917 + }, + { + "epoch": 0.8159093434062882, + "grad_norm": 0.3789387047290802, + "learning_rate": 0.0001, + "loss": 0.9597, + "step": 918 + }, + { + "epoch": 0.8167981335407177, + "grad_norm": 0.37680602073669434, + "learning_rate": 0.0001, + "loss": 0.9284, + "step": 919 + }, + { + "epoch": 0.8176869236751472, + "grad_norm": 0.4254203140735626, + "learning_rate": 0.0001, + "loss": 0.9765, + "step": 920 + }, + { + "epoch": 0.8185757138095767, + "grad_norm": 0.3756076991558075, + "learning_rate": 0.0001, + "loss": 0.9469, + "step": 921 + }, + { + "epoch": 0.8194645039440063, + "grad_norm": 0.37767136096954346, + "learning_rate": 0.0001, + "loss": 0.926, + "step": 922 + }, + { + "epoch": 0.8203532940784357, + "grad_norm": 0.43672624230384827, + "learning_rate": 0.0001, + "loss": 0.9305, + "step": 923 + }, + { + "epoch": 0.8212420842128653, + "grad_norm": 0.36939603090286255, + "learning_rate": 0.0001, + "loss": 0.8828, + "step": 924 + }, + { + "epoch": 0.8221308743472947, + "grad_norm": 0.41821223497390747, + "learning_rate": 0.0001, + "loss": 0.9346, + "step": 925 + }, + { + "epoch": 0.8230196644817243, + "grad_norm": 0.3761398494243622, + "learning_rate": 0.0001, + "loss": 0.8826, + "step": 926 + }, + { + "epoch": 0.8239084546161538, + "grad_norm": 0.4117630124092102, + "learning_rate": 0.0001, + "loss": 0.9253, + "step": 927 + }, + { + "epoch": 0.8247972447505832, + "grad_norm": 0.31008240580558777, + "learning_rate": 0.0001, + "loss": 0.9083, + "step": 928 + }, + { + "epoch": 0.8256860348850128, + "grad_norm": 0.3615438938140869, + "learning_rate": 0.0001, + "loss": 0.8833, + "step": 929 + }, + { + "epoch": 0.8265748250194422, + "grad_norm": 0.3704020082950592, + "learning_rate": 0.0001, + "loss": 1.0443, + "step": 930 + }, + { + "epoch": 0.8274636151538718, + "grad_norm": 0.3424414098262787, + "learning_rate": 0.0001, + "loss": 0.959, + "step": 931 + }, + { + "epoch": 0.8283524052883013, + "grad_norm": 0.41668984293937683, + "learning_rate": 0.0001, + "loss": 1.0217, + "step": 932 + }, + { + "epoch": 0.8292411954227308, + "grad_norm": 0.35622361302375793, + "learning_rate": 0.0001, + "loss": 0.9546, + "step": 933 + }, + { + "epoch": 0.8301299855571603, + "grad_norm": 0.36572763323783875, + "learning_rate": 0.0001, + "loss": 0.9514, + "step": 934 + }, + { + "epoch": 0.8310187756915898, + "grad_norm": 0.46444347500801086, + "learning_rate": 0.0001, + "loss": 1.0018, + "step": 935 + }, + { + "epoch": 0.8319075658260193, + "grad_norm": 0.39167654514312744, + "learning_rate": 0.0001, + "loss": 0.9338, + "step": 936 + }, + { + "epoch": 0.8327963559604489, + "grad_norm": 0.4243658781051636, + "learning_rate": 0.0001, + "loss": 0.9801, + "step": 937 + }, + { + "epoch": 0.8336851460948783, + "grad_norm": 2.1183691024780273, + "learning_rate": 0.0001, + "loss": 0.9376, + "step": 938 + }, + { + "epoch": 0.8345739362293079, + "grad_norm": 0.4078412353992462, + "learning_rate": 0.0001, + "loss": 0.9533, + "step": 939 + }, + { + "epoch": 0.8354627263637374, + "grad_norm": 0.36636313796043396, + "learning_rate": 0.0001, + "loss": 0.993, + "step": 940 + }, + { + "epoch": 0.8363515164981669, + "grad_norm": 0.42190858721733093, + "learning_rate": 0.0001, + "loss": 0.9789, + "step": 941 + }, + { + "epoch": 0.8372403066325964, + "grad_norm": 0.34506893157958984, + "learning_rate": 0.0001, + "loss": 0.9759, + "step": 942 + }, + { + "epoch": 0.8381290967670258, + "grad_norm": 0.43273380398750305, + "learning_rate": 0.0001, + "loss": 0.9753, + "step": 943 + }, + { + "epoch": 0.8390178869014554, + "grad_norm": 0.36178067326545715, + "learning_rate": 0.0001, + "loss": 0.9842, + "step": 944 + }, + { + "epoch": 0.839906677035885, + "grad_norm": 0.3905907869338989, + "learning_rate": 0.0001, + "loss": 0.9809, + "step": 945 + }, + { + "epoch": 0.8407954671703144, + "grad_norm": 0.3678842782974243, + "learning_rate": 0.0001, + "loss": 0.939, + "step": 946 + }, + { + "epoch": 0.8416842573047439, + "grad_norm": 0.3797389268875122, + "learning_rate": 0.0001, + "loss": 0.8508, + "step": 947 + }, + { + "epoch": 0.8425730474391734, + "grad_norm": 0.43192946910858154, + "learning_rate": 0.0001, + "loss": 0.9417, + "step": 948 + }, + { + "epoch": 0.8434618375736029, + "grad_norm": 0.4288097620010376, + "learning_rate": 0.0001, + "loss": 0.9428, + "step": 949 + }, + { + "epoch": 0.8443506277080325, + "grad_norm": 0.3306112587451935, + "learning_rate": 0.0001, + "loss": 0.9606, + "step": 950 + }, + { + "epoch": 0.8452394178424619, + "grad_norm": 0.3389092683792114, + "learning_rate": 0.0001, + "loss": 0.8848, + "step": 951 + }, + { + "epoch": 0.8461282079768915, + "grad_norm": 0.38675200939178467, + "learning_rate": 0.0001, + "loss": 0.9428, + "step": 952 + }, + { + "epoch": 0.8470169981113209, + "grad_norm": 0.3907453417778015, + "learning_rate": 0.0001, + "loss": 0.9628, + "step": 953 + }, + { + "epoch": 0.8479057882457505, + "grad_norm": 0.3519386649131775, + "learning_rate": 0.0001, + "loss": 0.9262, + "step": 954 + }, + { + "epoch": 0.84879457838018, + "grad_norm": 0.38831275701522827, + "learning_rate": 0.0001, + "loss": 0.904, + "step": 955 + }, + { + "epoch": 0.8496833685146095, + "grad_norm": 0.39892178773880005, + "learning_rate": 0.0001, + "loss": 0.9897, + "step": 956 + }, + { + "epoch": 0.850572158649039, + "grad_norm": 0.4237317144870758, + "learning_rate": 0.0001, + "loss": 0.9679, + "step": 957 + }, + { + "epoch": 0.8514609487834685, + "grad_norm": 0.35414162278175354, + "learning_rate": 0.0001, + "loss": 0.9362, + "step": 958 + }, + { + "epoch": 0.852349738917898, + "grad_norm": 0.34299251437187195, + "learning_rate": 0.0001, + "loss": 0.9223, + "step": 959 + }, + { + "epoch": 0.8532385290523276, + "grad_norm": 0.3854987621307373, + "learning_rate": 0.0001, + "loss": 0.9285, + "step": 960 + }, + { + "epoch": 0.854127319186757, + "grad_norm": 0.3863328993320465, + "learning_rate": 0.0001, + "loss": 0.9039, + "step": 961 + }, + { + "epoch": 0.8550161093211865, + "grad_norm": 0.3316647708415985, + "learning_rate": 0.0001, + "loss": 0.9445, + "step": 962 + }, + { + "epoch": 0.8559048994556161, + "grad_norm": 0.39452093839645386, + "learning_rate": 0.0001, + "loss": 0.8977, + "step": 963 + }, + { + "epoch": 0.8567936895900455, + "grad_norm": 0.3937751352787018, + "learning_rate": 0.0001, + "loss": 0.914, + "step": 964 + }, + { + "epoch": 0.8576824797244751, + "grad_norm": 0.31849080324172974, + "learning_rate": 0.0001, + "loss": 0.8676, + "step": 965 + }, + { + "epoch": 0.8585712698589045, + "grad_norm": 0.3330874741077423, + "learning_rate": 0.0001, + "loss": 0.8927, + "step": 966 + }, + { + "epoch": 0.8594600599933341, + "grad_norm": 0.3816487789154053, + "learning_rate": 0.0001, + "loss": 0.9715, + "step": 967 + }, + { + "epoch": 0.8603488501277636, + "grad_norm": 0.4189969301223755, + "learning_rate": 0.0001, + "loss": 0.9714, + "step": 968 + }, + { + "epoch": 0.8612376402621931, + "grad_norm": 0.3965904712677002, + "learning_rate": 0.0001, + "loss": 0.9532, + "step": 969 + }, + { + "epoch": 0.8621264303966226, + "grad_norm": 0.3900628089904785, + "learning_rate": 0.0001, + "loss": 0.9577, + "step": 970 + }, + { + "epoch": 0.8630152205310521, + "grad_norm": 0.43104588985443115, + "learning_rate": 0.0001, + "loss": 1.0495, + "step": 971 + }, + { + "epoch": 0.8639040106654816, + "grad_norm": 0.39500030875205994, + "learning_rate": 0.0001, + "loss": 0.9637, + "step": 972 + }, + { + "epoch": 0.8647928007999112, + "grad_norm": 0.3971693813800812, + "learning_rate": 0.0001, + "loss": 1.0178, + "step": 973 + }, + { + "epoch": 0.8656815909343406, + "grad_norm": 0.3862766921520233, + "learning_rate": 0.0001, + "loss": 1.0042, + "step": 974 + }, + { + "epoch": 0.8665703810687702, + "grad_norm": 0.3328218162059784, + "learning_rate": 0.0001, + "loss": 0.9846, + "step": 975 + }, + { + "epoch": 0.8674591712031996, + "grad_norm": 0.36906227469444275, + "learning_rate": 0.0001, + "loss": 0.9593, + "step": 976 + }, + { + "epoch": 0.8683479613376291, + "grad_norm": 0.40187719464302063, + "learning_rate": 0.0001, + "loss": 0.97, + "step": 977 + }, + { + "epoch": 0.8692367514720587, + "grad_norm": 0.3088963031768799, + "learning_rate": 0.0001, + "loss": 0.8898, + "step": 978 + }, + { + "epoch": 0.8701255416064881, + "grad_norm": 0.4896550178527832, + "learning_rate": 0.0001, + "loss": 0.9039, + "step": 979 + }, + { + "epoch": 0.8710143317409177, + "grad_norm": 0.39287492632865906, + "learning_rate": 0.0001, + "loss": 0.8972, + "step": 980 + }, + { + "epoch": 0.8719031218753471, + "grad_norm": 0.3530255854129791, + "learning_rate": 0.0001, + "loss": 0.949, + "step": 981 + }, + { + "epoch": 0.8727919120097767, + "grad_norm": 0.4530492126941681, + "learning_rate": 0.0001, + "loss": 1.0146, + "step": 982 + }, + { + "epoch": 0.8736807021442062, + "grad_norm": 0.3938184082508087, + "learning_rate": 0.0001, + "loss": 0.9926, + "step": 983 + }, + { + "epoch": 0.8745694922786357, + "grad_norm": 0.41301020979881287, + "learning_rate": 0.0001, + "loss": 0.9626, + "step": 984 + }, + { + "epoch": 0.8754582824130652, + "grad_norm": 0.5657033920288086, + "learning_rate": 0.0001, + "loss": 0.9557, + "step": 985 + }, + { + "epoch": 0.8763470725474948, + "grad_norm": 0.39124661684036255, + "learning_rate": 0.0001, + "loss": 0.9334, + "step": 986 + }, + { + "epoch": 0.8772358626819242, + "grad_norm": 0.367252916097641, + "learning_rate": 0.0001, + "loss": 0.9568, + "step": 987 + }, + { + "epoch": 0.8781246528163538, + "grad_norm": 0.45111268758773804, + "learning_rate": 0.0001, + "loss": 0.9269, + "step": 988 + }, + { + "epoch": 0.8790134429507832, + "grad_norm": 0.3949245810508728, + "learning_rate": 0.0001, + "loss": 0.9541, + "step": 989 + }, + { + "epoch": 0.8799022330852128, + "grad_norm": 0.3697701096534729, + "learning_rate": 0.0001, + "loss": 0.858, + "step": 990 + }, + { + "epoch": 0.8807910232196423, + "grad_norm": 0.3903849720954895, + "learning_rate": 0.0001, + "loss": 0.8917, + "step": 991 + }, + { + "epoch": 0.8816798133540718, + "grad_norm": 0.3852884769439697, + "learning_rate": 0.0001, + "loss": 0.9524, + "step": 992 + }, + { + "epoch": 0.8825686034885013, + "grad_norm": 0.425076425075531, + "learning_rate": 0.0001, + "loss": 0.9229, + "step": 993 + }, + { + "epoch": 0.8834573936229307, + "grad_norm": 0.3908687233924866, + "learning_rate": 0.0001, + "loss": 0.8766, + "step": 994 + }, + { + "epoch": 0.8843461837573603, + "grad_norm": 0.3858028054237366, + "learning_rate": 0.0001, + "loss": 0.9996, + "step": 995 + }, + { + "epoch": 0.8852349738917898, + "grad_norm": 0.4030248820781708, + "learning_rate": 0.0001, + "loss": 1.0158, + "step": 996 + }, + { + "epoch": 0.8861237640262193, + "grad_norm": 0.3604998290538788, + "learning_rate": 0.0001, + "loss": 0.8926, + "step": 997 + }, + { + "epoch": 0.8870125541606488, + "grad_norm": 0.38563668727874756, + "learning_rate": 0.0001, + "loss": 0.8513, + "step": 998 + }, + { + "epoch": 0.8879013442950783, + "grad_norm": 0.40275925397872925, + "learning_rate": 0.0001, + "loss": 1.0191, + "step": 999 + }, + { + "epoch": 0.8887901344295078, + "grad_norm": 0.3867271840572357, + "learning_rate": 0.0001, + "loss": 0.8962, + "step": 1000 + }, + { + "epoch": 0.8896789245639374, + "grad_norm": 0.39497697353363037, + "learning_rate": 0.0001, + "loss": 0.983, + "step": 1001 + }, + { + "epoch": 0.8905677146983668, + "grad_norm": 0.3594026267528534, + "learning_rate": 0.0001, + "loss": 0.9519, + "step": 1002 + }, + { + "epoch": 0.8914565048327964, + "grad_norm": 0.3522585332393646, + "learning_rate": 0.0001, + "loss": 0.9274, + "step": 1003 + }, + { + "epoch": 0.8923452949672258, + "grad_norm": 0.41415807604789734, + "learning_rate": 0.0001, + "loss": 0.9674, + "step": 1004 + }, + { + "epoch": 0.8932340851016554, + "grad_norm": 0.46825456619262695, + "learning_rate": 0.0001, + "loss": 0.9694, + "step": 1005 + }, + { + "epoch": 0.8941228752360849, + "grad_norm": 0.3260214328765869, + "learning_rate": 0.0001, + "loss": 0.9656, + "step": 1006 + }, + { + "epoch": 0.8950116653705144, + "grad_norm": 0.36279264092445374, + "learning_rate": 0.0001, + "loss": 0.9663, + "step": 1007 + }, + { + "epoch": 0.8959004555049439, + "grad_norm": 0.4032383859157562, + "learning_rate": 0.0001, + "loss": 0.9276, + "step": 1008 + }, + { + "epoch": 0.8967892456393735, + "grad_norm": 0.4371550679206848, + "learning_rate": 0.0001, + "loss": 1.0294, + "step": 1009 + }, + { + "epoch": 0.8976780357738029, + "grad_norm": 0.36339619755744934, + "learning_rate": 0.0001, + "loss": 0.9453, + "step": 1010 + }, + { + "epoch": 0.8985668259082324, + "grad_norm": 0.38631105422973633, + "learning_rate": 0.0001, + "loss": 0.9966, + "step": 1011 + }, + { + "epoch": 0.8994556160426619, + "grad_norm": 0.34750688076019287, + "learning_rate": 0.0001, + "loss": 0.8757, + "step": 1012 + }, + { + "epoch": 0.9003444061770914, + "grad_norm": 0.3377476632595062, + "learning_rate": 0.0001, + "loss": 0.973, + "step": 1013 + }, + { + "epoch": 0.901233196311521, + "grad_norm": 0.39975231885910034, + "learning_rate": 0.0001, + "loss": 0.9392, + "step": 1014 + }, + { + "epoch": 0.9021219864459504, + "grad_norm": 0.33771857619285583, + "learning_rate": 0.0001, + "loss": 0.9934, + "step": 1015 + }, + { + "epoch": 0.90301077658038, + "grad_norm": 0.33681410551071167, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 1016 + }, + { + "epoch": 0.9038995667148094, + "grad_norm": 0.39895641803741455, + "learning_rate": 0.0001, + "loss": 0.9482, + "step": 1017 + }, + { + "epoch": 0.904788356849239, + "grad_norm": 0.3800975978374481, + "learning_rate": 0.0001, + "loss": 0.9504, + "step": 1018 + }, + { + "epoch": 0.9056771469836685, + "grad_norm": 0.3816840350627899, + "learning_rate": 0.0001, + "loss": 0.9613, + "step": 1019 + }, + { + "epoch": 0.906565937118098, + "grad_norm": 0.3548993766307831, + "learning_rate": 0.0001, + "loss": 0.9234, + "step": 1020 + }, + { + "epoch": 0.9074547272525275, + "grad_norm": 0.38023990392684937, + "learning_rate": 0.0001, + "loss": 0.9697, + "step": 1021 + }, + { + "epoch": 0.908343517386957, + "grad_norm": 0.35391300916671753, + "learning_rate": 0.0001, + "loss": 0.8196, + "step": 1022 + }, + { + "epoch": 0.9092323075213865, + "grad_norm": 0.4617893397808075, + "learning_rate": 0.0001, + "loss": 1.0236, + "step": 1023 + }, + { + "epoch": 0.9101210976558161, + "grad_norm": 0.3808736503124237, + "learning_rate": 0.0001, + "loss": 0.9539, + "step": 1024 + }, + { + "epoch": 0.9110098877902455, + "grad_norm": 0.41231459379196167, + "learning_rate": 0.0001, + "loss": 0.9251, + "step": 1025 + }, + { + "epoch": 0.911898677924675, + "grad_norm": 0.34969326853752136, + "learning_rate": 0.0001, + "loss": 0.9151, + "step": 1026 + }, + { + "epoch": 0.9127874680591045, + "grad_norm": 0.3669031262397766, + "learning_rate": 0.0001, + "loss": 0.8832, + "step": 1027 + }, + { + "epoch": 0.913676258193534, + "grad_norm": 0.3683227598667145, + "learning_rate": 0.0001, + "loss": 0.9324, + "step": 1028 + }, + { + "epoch": 0.9145650483279636, + "grad_norm": 0.3768469989299774, + "learning_rate": 0.0001, + "loss": 0.8713, + "step": 1029 + }, + { + "epoch": 0.915453838462393, + "grad_norm": 0.40044888854026794, + "learning_rate": 0.0001, + "loss": 1.0428, + "step": 1030 + }, + { + "epoch": 0.9163426285968226, + "grad_norm": 0.3713497519493103, + "learning_rate": 0.0001, + "loss": 0.8803, + "step": 1031 + }, + { + "epoch": 0.9172314187312521, + "grad_norm": 0.33332690596580505, + "learning_rate": 0.0001, + "loss": 0.9579, + "step": 1032 + }, + { + "epoch": 0.9181202088656816, + "grad_norm": 0.3899736702442169, + "learning_rate": 0.0001, + "loss": 0.9934, + "step": 1033 + }, + { + "epoch": 0.9190089990001111, + "grad_norm": 0.3549093008041382, + "learning_rate": 0.0001, + "loss": 0.9093, + "step": 1034 + }, + { + "epoch": 0.9198977891345406, + "grad_norm": 0.37367865443229675, + "learning_rate": 0.0001, + "loss": 0.9748, + "step": 1035 + }, + { + "epoch": 0.9207865792689701, + "grad_norm": 0.3680754601955414, + "learning_rate": 0.0001, + "loss": 1.0113, + "step": 1036 + }, + { + "epoch": 0.9216753694033997, + "grad_norm": 0.36346691846847534, + "learning_rate": 0.0001, + "loss": 0.8892, + "step": 1037 + }, + { + "epoch": 0.9225641595378291, + "grad_norm": 0.3745006322860718, + "learning_rate": 0.0001, + "loss": 0.9352, + "step": 1038 + }, + { + "epoch": 0.9234529496722587, + "grad_norm": 0.3873980641365051, + "learning_rate": 0.0001, + "loss": 0.8809, + "step": 1039 + }, + { + "epoch": 0.9243417398066881, + "grad_norm": 0.42303210496902466, + "learning_rate": 0.0001, + "loss": 1.0008, + "step": 1040 + }, + { + "epoch": 0.9252305299411177, + "grad_norm": 0.38883471488952637, + "learning_rate": 0.0001, + "loss": 0.9839, + "step": 1041 + }, + { + "epoch": 0.9261193200755472, + "grad_norm": 0.39919066429138184, + "learning_rate": 0.0001, + "loss": 0.9246, + "step": 1042 + }, + { + "epoch": 0.9270081102099766, + "grad_norm": 0.3721344769001007, + "learning_rate": 0.0001, + "loss": 0.8927, + "step": 1043 + }, + { + "epoch": 0.9278969003444062, + "grad_norm": 0.4106079638004303, + "learning_rate": 0.0001, + "loss": 0.9501, + "step": 1044 + }, + { + "epoch": 0.9287856904788356, + "grad_norm": 0.35994967818260193, + "learning_rate": 0.0001, + "loss": 0.9483, + "step": 1045 + }, + { + "epoch": 0.9296744806132652, + "grad_norm": 0.3131955862045288, + "learning_rate": 0.0001, + "loss": 0.9175, + "step": 1046 + }, + { + "epoch": 0.9305632707476947, + "grad_norm": 0.37486889958381653, + "learning_rate": 0.0001, + "loss": 0.9459, + "step": 1047 + }, + { + "epoch": 0.9314520608821242, + "grad_norm": 0.36107298731803894, + "learning_rate": 0.0001, + "loss": 0.9201, + "step": 1048 + }, + { + "epoch": 0.9323408510165537, + "grad_norm": 0.3575218915939331, + "learning_rate": 0.0001, + "loss": 0.9254, + "step": 1049 + }, + { + "epoch": 0.9332296411509832, + "grad_norm": 0.36310702562332153, + "learning_rate": 0.0001, + "loss": 0.9542, + "step": 1050 + }, + { + "epoch": 0.9341184312854127, + "grad_norm": 0.3741386830806732, + "learning_rate": 0.0001, + "loss": 0.8913, + "step": 1051 + }, + { + "epoch": 0.9350072214198423, + "grad_norm": 0.3636328876018524, + "learning_rate": 0.0001, + "loss": 0.9218, + "step": 1052 + }, + { + "epoch": 0.9358960115542717, + "grad_norm": 0.3283315598964691, + "learning_rate": 0.0001, + "loss": 0.9341, + "step": 1053 + }, + { + "epoch": 0.9367848016887013, + "grad_norm": 0.4130898118019104, + "learning_rate": 0.0001, + "loss": 0.9666, + "step": 1054 + }, + { + "epoch": 0.9376735918231308, + "grad_norm": 0.3228664696216583, + "learning_rate": 0.0001, + "loss": 0.8924, + "step": 1055 + }, + { + "epoch": 0.9385623819575603, + "grad_norm": 0.42613714933395386, + "learning_rate": 0.0001, + "loss": 0.9347, + "step": 1056 + }, + { + "epoch": 0.9394511720919898, + "grad_norm": 0.4059881567955017, + "learning_rate": 0.0001, + "loss": 0.906, + "step": 1057 + }, + { + "epoch": 0.9403399622264192, + "grad_norm": 0.4185310900211334, + "learning_rate": 0.0001, + "loss": 1.0162, + "step": 1058 + }, + { + "epoch": 0.9412287523608488, + "grad_norm": 0.38086622953414917, + "learning_rate": 0.0001, + "loss": 1.0297, + "step": 1059 + }, + { + "epoch": 0.9421175424952783, + "grad_norm": 0.3949964642524719, + "learning_rate": 0.0001, + "loss": 0.9184, + "step": 1060 + }, + { + "epoch": 0.9430063326297078, + "grad_norm": 0.3886110186576843, + "learning_rate": 0.0001, + "loss": 0.9339, + "step": 1061 + }, + { + "epoch": 0.9438951227641373, + "grad_norm": 0.37949997186660767, + "learning_rate": 0.0001, + "loss": 0.9141, + "step": 1062 + }, + { + "epoch": 0.9447839128985668, + "grad_norm": 0.34117576479911804, + "learning_rate": 0.0001, + "loss": 0.9153, + "step": 1063 + }, + { + "epoch": 0.9456727030329963, + "grad_norm": 0.36747774481773376, + "learning_rate": 0.0001, + "loss": 0.9275, + "step": 1064 + }, + { + "epoch": 0.9465614931674259, + "grad_norm": 0.3943864703178406, + "learning_rate": 0.0001, + "loss": 0.8985, + "step": 1065 + }, + { + "epoch": 0.9474502833018553, + "grad_norm": 0.3849271237850189, + "learning_rate": 0.0001, + "loss": 0.956, + "step": 1066 + }, + { + "epoch": 0.9483390734362849, + "grad_norm": 0.3633384108543396, + "learning_rate": 0.0001, + "loss": 0.9836, + "step": 1067 + }, + { + "epoch": 0.9492278635707143, + "grad_norm": 0.3905209004878998, + "learning_rate": 0.0001, + "loss": 0.9753, + "step": 1068 + }, + { + "epoch": 0.9501166537051439, + "grad_norm": 0.36972206830978394, + "learning_rate": 0.0001, + "loss": 0.9108, + "step": 1069 + }, + { + "epoch": 0.9510054438395734, + "grad_norm": 0.38415467739105225, + "learning_rate": 0.0001, + "loss": 0.9539, + "step": 1070 + }, + { + "epoch": 0.9518942339740029, + "grad_norm": 0.3597318232059479, + "learning_rate": 0.0001, + "loss": 0.8953, + "step": 1071 + }, + { + "epoch": 0.9527830241084324, + "grad_norm": 0.374531090259552, + "learning_rate": 0.0001, + "loss": 0.981, + "step": 1072 + }, + { + "epoch": 0.953671814242862, + "grad_norm": 0.3501724302768707, + "learning_rate": 0.0001, + "loss": 0.9589, + "step": 1073 + }, + { + "epoch": 0.9545606043772914, + "grad_norm": 0.3414580821990967, + "learning_rate": 0.0001, + "loss": 0.901, + "step": 1074 + }, + { + "epoch": 0.955449394511721, + "grad_norm": 0.3523072600364685, + "learning_rate": 0.0001, + "loss": 0.901, + "step": 1075 + }, + { + "epoch": 0.9563381846461504, + "grad_norm": 0.38694173097610474, + "learning_rate": 0.0001, + "loss": 0.9884, + "step": 1076 + }, + { + "epoch": 0.9572269747805799, + "grad_norm": 0.41436851024627686, + "learning_rate": 0.0001, + "loss": 0.9123, + "step": 1077 + }, + { + "epoch": 0.9581157649150095, + "grad_norm": 0.34676891565322876, + "learning_rate": 0.0001, + "loss": 0.9113, + "step": 1078 + }, + { + "epoch": 0.9590045550494389, + "grad_norm": 0.3826766908168793, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 1079 + }, + { + "epoch": 0.9598933451838685, + "grad_norm": 0.34997478127479553, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 1080 + }, + { + "epoch": 0.9607821353182979, + "grad_norm": 0.40335193276405334, + "learning_rate": 0.0001, + "loss": 1.0525, + "step": 1081 + }, + { + "epoch": 0.9616709254527275, + "grad_norm": 0.4107684791088104, + "learning_rate": 0.0001, + "loss": 1.0239, + "step": 1082 + }, + { + "epoch": 0.962559715587157, + "grad_norm": 0.3917597830295563, + "learning_rate": 0.0001, + "loss": 0.8393, + "step": 1083 + }, + { + "epoch": 0.9634485057215865, + "grad_norm": 0.3460274040699005, + "learning_rate": 0.0001, + "loss": 0.9266, + "step": 1084 + }, + { + "epoch": 0.964337295856016, + "grad_norm": 0.36212509870529175, + "learning_rate": 0.0001, + "loss": 0.9578, + "step": 1085 + }, + { + "epoch": 0.9652260859904455, + "grad_norm": 0.4016794264316559, + "learning_rate": 0.0001, + "loss": 0.9461, + "step": 1086 + }, + { + "epoch": 0.966114876124875, + "grad_norm": 0.3891691267490387, + "learning_rate": 0.0001, + "loss": 0.9576, + "step": 1087 + }, + { + "epoch": 0.9670036662593046, + "grad_norm": 0.35440635681152344, + "learning_rate": 0.0001, + "loss": 0.9546, + "step": 1088 + }, + { + "epoch": 0.967892456393734, + "grad_norm": 0.3960229158401489, + "learning_rate": 0.0001, + "loss": 1.0514, + "step": 1089 + }, + { + "epoch": 0.9687812465281636, + "grad_norm": 0.38155800104141235, + "learning_rate": 0.0001, + "loss": 0.8884, + "step": 1090 + }, + { + "epoch": 0.969670036662593, + "grad_norm": 0.3513757884502411, + "learning_rate": 0.0001, + "loss": 0.8801, + "step": 1091 + }, + { + "epoch": 0.9705588267970225, + "grad_norm": 0.38408926129341125, + "learning_rate": 0.0001, + "loss": 0.8309, + "step": 1092 + }, + { + "epoch": 0.9714476169314521, + "grad_norm": 0.378099262714386, + "learning_rate": 0.0001, + "loss": 0.9089, + "step": 1093 + }, + { + "epoch": 0.9723364070658815, + "grad_norm": 0.3182670474052429, + "learning_rate": 0.0001, + "loss": 0.9489, + "step": 1094 + }, + { + "epoch": 0.9732251972003111, + "grad_norm": 0.3988575041294098, + "learning_rate": 0.0001, + "loss": 0.9364, + "step": 1095 + }, + { + "epoch": 0.9741139873347406, + "grad_norm": 0.4182455837726593, + "learning_rate": 0.0001, + "loss": 1.0097, + "step": 1096 + }, + { + "epoch": 0.9750027774691701, + "grad_norm": 0.3374396562576294, + "learning_rate": 0.0001, + "loss": 0.9467, + "step": 1097 + }, + { + "epoch": 0.9758915676035996, + "grad_norm": 0.36527591943740845, + "learning_rate": 0.0001, + "loss": 0.9365, + "step": 1098 + }, + { + "epoch": 0.9767803577380291, + "grad_norm": 0.3679245710372925, + "learning_rate": 0.0001, + "loss": 0.9387, + "step": 1099 + }, + { + "epoch": 0.9776691478724586, + "grad_norm": 0.3694630265235901, + "learning_rate": 0.0001, + "loss": 0.8805, + "step": 1100 + }, + { + "epoch": 0.9785579380068882, + "grad_norm": 0.3750841021537781, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 1101 + }, + { + "epoch": 0.9794467281413176, + "grad_norm": 0.4152679145336151, + "learning_rate": 0.0001, + "loss": 1.0473, + "step": 1102 + }, + { + "epoch": 0.9803355182757472, + "grad_norm": 0.3526078164577484, + "learning_rate": 0.0001, + "loss": 0.9227, + "step": 1103 + }, + { + "epoch": 0.9812243084101766, + "grad_norm": 0.3585357069969177, + "learning_rate": 0.0001, + "loss": 0.961, + "step": 1104 + }, + { + "epoch": 0.9821130985446062, + "grad_norm": 0.38510292768478394, + "learning_rate": 0.0001, + "loss": 0.9997, + "step": 1105 + }, + { + "epoch": 0.9830018886790357, + "grad_norm": 0.34511297941207886, + "learning_rate": 0.0001, + "loss": 0.895, + "step": 1106 + }, + { + "epoch": 0.9838906788134651, + "grad_norm": 0.36190247535705566, + "learning_rate": 0.0001, + "loss": 0.9789, + "step": 1107 + }, + { + "epoch": 0.9847794689478947, + "grad_norm": 0.3606034219264984, + "learning_rate": 0.0001, + "loss": 0.9512, + "step": 1108 + }, + { + "epoch": 0.9856682590823241, + "grad_norm": 0.31674253940582275, + "learning_rate": 0.0001, + "loss": 0.9767, + "step": 1109 + }, + { + "epoch": 0.9865570492167537, + "grad_norm": 0.3847641944885254, + "learning_rate": 0.0001, + "loss": 0.9811, + "step": 1110 + }, + { + "epoch": 0.9874458393511832, + "grad_norm": 0.28332382440567017, + "learning_rate": 0.0001, + "loss": 0.9064, + "step": 1111 + }, + { + "epoch": 0.9883346294856127, + "grad_norm": 0.3163788318634033, + "learning_rate": 0.0001, + "loss": 1.0134, + "step": 1112 + }, + { + "epoch": 0.9892234196200422, + "grad_norm": 0.37489238381385803, + "learning_rate": 0.0001, + "loss": 0.9254, + "step": 1113 + }, + { + "epoch": 0.9901122097544717, + "grad_norm": 0.3543959856033325, + "learning_rate": 0.0001, + "loss": 0.9415, + "step": 1114 + }, + { + "epoch": 0.9910009998889012, + "grad_norm": 0.42274850606918335, + "learning_rate": 0.0001, + "loss": 0.9142, + "step": 1115 + }, + { + "epoch": 0.9918897900233308, + "grad_norm": 0.38619187474250793, + "learning_rate": 0.0001, + "loss": 0.9387, + "step": 1116 + }, + { + "epoch": 0.9927785801577602, + "grad_norm": 0.379362553358078, + "learning_rate": 0.0001, + "loss": 0.9879, + "step": 1117 + }, + { + "epoch": 0.9936673702921898, + "grad_norm": 0.4227985441684723, + "learning_rate": 0.0001, + "loss": 1.0083, + "step": 1118 + }, + { + "epoch": 0.9945561604266193, + "grad_norm": 0.4064882695674896, + "learning_rate": 0.0001, + "loss": 0.9726, + "step": 1119 + }, + { + "epoch": 0.9954449505610488, + "grad_norm": 0.35058096051216125, + "learning_rate": 0.0001, + "loss": 0.9862, + "step": 1120 + }, + { + "epoch": 0.9963337406954783, + "grad_norm": 0.3268047869205475, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1121 + }, + { + "epoch": 0.9972225308299077, + "grad_norm": 0.3554556667804718, + "learning_rate": 0.0001, + "loss": 1.0007, + "step": 1122 + }, + { + "epoch": 0.9981113209643373, + "grad_norm": 0.368964284658432, + "learning_rate": 0.0001, + "loss": 0.9934, + "step": 1123 + }, + { + "epoch": 0.9990001110987669, + "grad_norm": 0.39982903003692627, + "learning_rate": 0.0001, + "loss": 0.8933, + "step": 1124 + }, + { + "epoch": 0.9998889012331963, + "grad_norm": 0.3517761528491974, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 1125 + }, + { + "epoch": 1.0007776913676258, + "grad_norm": 0.3954550623893738, + "learning_rate": 0.0001, + "loss": 0.9681, + "step": 1126 + }, + { + "epoch": 1.0016664815020553, + "grad_norm": 0.377174973487854, + "learning_rate": 0.0001, + "loss": 0.8917, + "step": 1127 + }, + { + "epoch": 1.002555271636485, + "grad_norm": 0.4547029435634613, + "learning_rate": 0.0001, + "loss": 0.8837, + "step": 1128 + }, + { + "epoch": 1.0034440617709144, + "grad_norm": 0.4161812961101532, + "learning_rate": 0.0001, + "loss": 1.0238, + "step": 1129 + }, + { + "epoch": 1.0043328519053438, + "grad_norm": 0.398183673620224, + "learning_rate": 0.0001, + "loss": 0.9245, + "step": 1130 + }, + { + "epoch": 1.0052216420397733, + "grad_norm": 0.41381561756134033, + "learning_rate": 0.0001, + "loss": 0.909, + "step": 1131 + }, + { + "epoch": 1.006110432174203, + "grad_norm": 0.37161487340927124, + "learning_rate": 0.0001, + "loss": 0.9355, + "step": 1132 + }, + { + "epoch": 1.0069992223086324, + "grad_norm": 0.378110408782959, + "learning_rate": 0.0001, + "loss": 0.973, + "step": 1133 + }, + { + "epoch": 1.0078880124430618, + "grad_norm": 0.3747970163822174, + "learning_rate": 0.0001, + "loss": 0.8684, + "step": 1134 + }, + { + "epoch": 1.0087768025774915, + "grad_norm": 0.3934114873409271, + "learning_rate": 0.0001, + "loss": 0.9361, + "step": 1135 + }, + { + "epoch": 1.009665592711921, + "grad_norm": 0.41165485978126526, + "learning_rate": 0.0001, + "loss": 0.9385, + "step": 1136 + }, + { + "epoch": 1.0105543828463504, + "grad_norm": 0.6741541624069214, + "learning_rate": 0.0001, + "loss": 0.8731, + "step": 1137 + }, + { + "epoch": 1.01144317298078, + "grad_norm": 0.3466031551361084, + "learning_rate": 0.0001, + "loss": 0.8852, + "step": 1138 + }, + { + "epoch": 1.0123319631152095, + "grad_norm": 0.3946625590324402, + "learning_rate": 0.0001, + "loss": 0.9609, + "step": 1139 + }, + { + "epoch": 1.013220753249639, + "grad_norm": 0.3739851713180542, + "learning_rate": 0.0001, + "loss": 0.7588, + "step": 1140 + }, + { + "epoch": 1.0141095433840683, + "grad_norm": 0.3292269706726074, + "learning_rate": 0.0001, + "loss": 0.9439, + "step": 1141 + }, + { + "epoch": 1.014998333518498, + "grad_norm": 0.4359779953956604, + "learning_rate": 0.0001, + "loss": 0.9791, + "step": 1142 + }, + { + "epoch": 1.0158871236529274, + "grad_norm": 0.392539381980896, + "learning_rate": 0.0001, + "loss": 0.8939, + "step": 1143 + }, + { + "epoch": 1.0167759137873569, + "grad_norm": 0.36536142230033875, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 1144 + }, + { + "epoch": 1.0176647039217865, + "grad_norm": 0.4121522903442383, + "learning_rate": 0.0001, + "loss": 0.951, + "step": 1145 + }, + { + "epoch": 1.018553494056216, + "grad_norm": 0.3721693456172943, + "learning_rate": 0.0001, + "loss": 0.9443, + "step": 1146 + }, + { + "epoch": 1.0194422841906454, + "grad_norm": 0.42354851961135864, + "learning_rate": 0.0001, + "loss": 0.903, + "step": 1147 + }, + { + "epoch": 1.020331074325075, + "grad_norm": 0.3700781464576721, + "learning_rate": 0.0001, + "loss": 0.9729, + "step": 1148 + }, + { + "epoch": 1.0212198644595045, + "grad_norm": 0.37510499358177185, + "learning_rate": 0.0001, + "loss": 0.9149, + "step": 1149 + }, + { + "epoch": 1.022108654593934, + "grad_norm": 0.3892500400543213, + "learning_rate": 0.0001, + "loss": 0.9894, + "step": 1150 + }, + { + "epoch": 1.0229974447283636, + "grad_norm": 0.4017266631126404, + "learning_rate": 0.0001, + "loss": 0.9369, + "step": 1151 + }, + { + "epoch": 1.023886234862793, + "grad_norm": 0.37094780802726746, + "learning_rate": 0.0001, + "loss": 1.0178, + "step": 1152 + }, + { + "epoch": 1.0247750249972225, + "grad_norm": 0.3605554401874542, + "learning_rate": 0.0001, + "loss": 0.8953, + "step": 1153 + }, + { + "epoch": 1.025663815131652, + "grad_norm": 0.3322463035583496, + "learning_rate": 0.0001, + "loss": 0.8531, + "step": 1154 + }, + { + "epoch": 1.0265526052660816, + "grad_norm": 0.35462141036987305, + "learning_rate": 0.0001, + "loss": 0.9225, + "step": 1155 + }, + { + "epoch": 1.027441395400511, + "grad_norm": 0.3974057137966156, + "learning_rate": 0.0001, + "loss": 0.9839, + "step": 1156 + }, + { + "epoch": 1.0283301855349405, + "grad_norm": 0.3556059002876282, + "learning_rate": 0.0001, + "loss": 0.9588, + "step": 1157 + }, + { + "epoch": 1.0292189756693702, + "grad_norm": 0.37689536809921265, + "learning_rate": 0.0001, + "loss": 0.923, + "step": 1158 + }, + { + "epoch": 1.0301077658037996, + "grad_norm": 0.36872097849845886, + "learning_rate": 0.0001, + "loss": 0.9578, + "step": 1159 + }, + { + "epoch": 1.030996555938229, + "grad_norm": 0.44219520688056946, + "learning_rate": 0.0001, + "loss": 1.0076, + "step": 1160 + }, + { + "epoch": 1.0318853460726587, + "grad_norm": 0.3993608355522156, + "learning_rate": 0.0001, + "loss": 0.9316, + "step": 1161 + }, + { + "epoch": 1.0327741362070881, + "grad_norm": 0.3941507041454315, + "learning_rate": 0.0001, + "loss": 0.9119, + "step": 1162 + }, + { + "epoch": 1.0336629263415176, + "grad_norm": 0.43561699986457825, + "learning_rate": 0.0001, + "loss": 0.8983, + "step": 1163 + }, + { + "epoch": 1.034551716475947, + "grad_norm": 0.3520337641239166, + "learning_rate": 0.0001, + "loss": 0.8171, + "step": 1164 + }, + { + "epoch": 1.0354405066103767, + "grad_norm": 0.376632958650589, + "learning_rate": 0.0001, + "loss": 0.8648, + "step": 1165 + }, + { + "epoch": 1.0363292967448061, + "grad_norm": 0.6677507162094116, + "learning_rate": 0.0001, + "loss": 0.9802, + "step": 1166 + }, + { + "epoch": 1.0372180868792356, + "grad_norm": 0.3786594271659851, + "learning_rate": 0.0001, + "loss": 1.0064, + "step": 1167 + }, + { + "epoch": 1.0381068770136652, + "grad_norm": 0.3683249056339264, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 1168 + }, + { + "epoch": 1.0389956671480947, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0001, + "loss": 0.9727, + "step": 1169 + }, + { + "epoch": 1.039884457282524, + "grad_norm": 0.3452812135219574, + "learning_rate": 0.0001, + "loss": 0.8639, + "step": 1170 + }, + { + "epoch": 1.0407732474169538, + "grad_norm": 0.3969337046146393, + "learning_rate": 0.0001, + "loss": 0.9049, + "step": 1171 + }, + { + "epoch": 1.0416620375513832, + "grad_norm": 0.3651241362094879, + "learning_rate": 0.0001, + "loss": 0.8987, + "step": 1172 + }, + { + "epoch": 1.0425508276858126, + "grad_norm": 0.4599679410457611, + "learning_rate": 0.0001, + "loss": 1.0484, + "step": 1173 + }, + { + "epoch": 1.0434396178202423, + "grad_norm": 0.4104447066783905, + "learning_rate": 0.0001, + "loss": 0.9513, + "step": 1174 + }, + { + "epoch": 1.0443284079546717, + "grad_norm": 0.39843717217445374, + "learning_rate": 0.0001, + "loss": 1.0321, + "step": 1175 + }, + { + "epoch": 1.0452171980891012, + "grad_norm": 0.3388933837413788, + "learning_rate": 0.0001, + "loss": 0.8599, + "step": 1176 + }, + { + "epoch": 1.0461059882235306, + "grad_norm": 0.3476930856704712, + "learning_rate": 0.0001, + "loss": 0.8383, + "step": 1177 + }, + { + "epoch": 1.0469947783579603, + "grad_norm": 0.37614160776138306, + "learning_rate": 0.0001, + "loss": 0.9211, + "step": 1178 + }, + { + "epoch": 1.0478835684923897, + "grad_norm": 0.37879347801208496, + "learning_rate": 0.0001, + "loss": 0.9369, + "step": 1179 + }, + { + "epoch": 1.0487723586268192, + "grad_norm": 0.3917993903160095, + "learning_rate": 0.0001, + "loss": 0.9324, + "step": 1180 + }, + { + "epoch": 1.0496611487612488, + "grad_norm": 0.32946088910102844, + "learning_rate": 0.0001, + "loss": 0.913, + "step": 1181 + }, + { + "epoch": 1.0505499388956783, + "grad_norm": 0.362324595451355, + "learning_rate": 0.0001, + "loss": 0.9393, + "step": 1182 + }, + { + "epoch": 1.0514387290301077, + "grad_norm": 0.411262184381485, + "learning_rate": 0.0001, + "loss": 0.9152, + "step": 1183 + }, + { + "epoch": 1.0523275191645374, + "grad_norm": 0.3655692934989929, + "learning_rate": 0.0001, + "loss": 0.8978, + "step": 1184 + }, + { + "epoch": 1.0532163092989668, + "grad_norm": 0.4027964174747467, + "learning_rate": 0.0001, + "loss": 0.954, + "step": 1185 + }, + { + "epoch": 1.0541050994333963, + "grad_norm": 0.362482488155365, + "learning_rate": 0.0001, + "loss": 0.8665, + "step": 1186 + }, + { + "epoch": 1.0549938895678257, + "grad_norm": 0.43920066952705383, + "learning_rate": 0.0001, + "loss": 0.9462, + "step": 1187 + }, + { + "epoch": 1.0558826797022554, + "grad_norm": 0.4220573604106903, + "learning_rate": 0.0001, + "loss": 1.0071, + "step": 1188 + }, + { + "epoch": 1.0567714698366848, + "grad_norm": 0.3542270064353943, + "learning_rate": 0.0001, + "loss": 0.8843, + "step": 1189 + }, + { + "epoch": 1.0576602599711142, + "grad_norm": 0.3698994815349579, + "learning_rate": 0.0001, + "loss": 0.883, + "step": 1190 + }, + { + "epoch": 1.058549050105544, + "grad_norm": 0.39016515016555786, + "learning_rate": 0.0001, + "loss": 1.0091, + "step": 1191 + }, + { + "epoch": 1.0594378402399733, + "grad_norm": 0.377370685338974, + "learning_rate": 0.0001, + "loss": 0.9807, + "step": 1192 + }, + { + "epoch": 1.0603266303744028, + "grad_norm": 0.34474626183509827, + "learning_rate": 0.0001, + "loss": 0.9282, + "step": 1193 + }, + { + "epoch": 1.0612154205088324, + "grad_norm": 0.35169529914855957, + "learning_rate": 0.0001, + "loss": 0.9907, + "step": 1194 + }, + { + "epoch": 1.0621042106432619, + "grad_norm": 0.3528148829936981, + "learning_rate": 0.0001, + "loss": 0.8829, + "step": 1195 + }, + { + "epoch": 1.0629930007776913, + "grad_norm": 0.382842481136322, + "learning_rate": 0.0001, + "loss": 1.0002, + "step": 1196 + }, + { + "epoch": 1.063881790912121, + "grad_norm": 0.35213178396224976, + "learning_rate": 0.0001, + "loss": 0.8423, + "step": 1197 + }, + { + "epoch": 1.0647705810465504, + "grad_norm": 0.3510130047798157, + "learning_rate": 0.0001, + "loss": 0.8727, + "step": 1198 + }, + { + "epoch": 1.0656593711809799, + "grad_norm": 0.3832014501094818, + "learning_rate": 0.0001, + "loss": 0.9299, + "step": 1199 + }, + { + "epoch": 1.0665481613154093, + "grad_norm": 0.3763968050479889, + "learning_rate": 0.0001, + "loss": 0.9403, + "step": 1200 + }, + { + "epoch": 1.067436951449839, + "grad_norm": 0.4154573678970337, + "learning_rate": 0.0001, + "loss": 0.9413, + "step": 1201 + }, + { + "epoch": 1.0683257415842684, + "grad_norm": 0.4002286195755005, + "learning_rate": 0.0001, + "loss": 1.0054, + "step": 1202 + }, + { + "epoch": 1.0692145317186978, + "grad_norm": 0.40614718198776245, + "learning_rate": 0.0001, + "loss": 0.9248, + "step": 1203 + }, + { + "epoch": 1.0701033218531275, + "grad_norm": 0.3200342357158661, + "learning_rate": 0.0001, + "loss": 0.9438, + "step": 1204 + }, + { + "epoch": 1.070992111987557, + "grad_norm": 0.4085826575756073, + "learning_rate": 0.0001, + "loss": 0.9417, + "step": 1205 + }, + { + "epoch": 1.0718809021219864, + "grad_norm": 0.3780025541782379, + "learning_rate": 0.0001, + "loss": 0.9076, + "step": 1206 + }, + { + "epoch": 1.072769692256416, + "grad_norm": 0.42114147543907166, + "learning_rate": 0.0001, + "loss": 0.9071, + "step": 1207 + }, + { + "epoch": 1.0736584823908455, + "grad_norm": 0.41555988788604736, + "learning_rate": 0.0001, + "loss": 0.9986, + "step": 1208 + }, + { + "epoch": 1.074547272525275, + "grad_norm": 0.4004509747028351, + "learning_rate": 0.0001, + "loss": 0.9048, + "step": 1209 + }, + { + "epoch": 1.0754360626597044, + "grad_norm": 1.0510995388031006, + "learning_rate": 0.0001, + "loss": 0.9838, + "step": 1210 + }, + { + "epoch": 1.076324852794134, + "grad_norm": 0.3881635069847107, + "learning_rate": 0.0001, + "loss": 0.978, + "step": 1211 + }, + { + "epoch": 1.0772136429285635, + "grad_norm": 0.4103749096393585, + "learning_rate": 0.0001, + "loss": 0.9817, + "step": 1212 + }, + { + "epoch": 1.078102433062993, + "grad_norm": 0.4124666750431061, + "learning_rate": 0.0001, + "loss": 0.9354, + "step": 1213 + }, + { + "epoch": 1.0789912231974226, + "grad_norm": 0.433636337518692, + "learning_rate": 0.0001, + "loss": 0.9371, + "step": 1214 + }, + { + "epoch": 1.079880013331852, + "grad_norm": 0.41402560472488403, + "learning_rate": 0.0001, + "loss": 0.9656, + "step": 1215 + }, + { + "epoch": 1.0807688034662815, + "grad_norm": 0.36485034227371216, + "learning_rate": 0.0001, + "loss": 0.9697, + "step": 1216 + }, + { + "epoch": 1.0816575936007111, + "grad_norm": 0.41599103808403015, + "learning_rate": 0.0001, + "loss": 0.8941, + "step": 1217 + }, + { + "epoch": 1.0825463837351406, + "grad_norm": 0.38060900568962097, + "learning_rate": 0.0001, + "loss": 0.8354, + "step": 1218 + }, + { + "epoch": 1.08343517386957, + "grad_norm": 0.40148356556892395, + "learning_rate": 0.0001, + "loss": 0.9132, + "step": 1219 + }, + { + "epoch": 1.0843239640039997, + "grad_norm": 0.3883506655693054, + "learning_rate": 0.0001, + "loss": 0.9233, + "step": 1220 + }, + { + "epoch": 1.085212754138429, + "grad_norm": 0.34674960374832153, + "learning_rate": 0.0001, + "loss": 0.8928, + "step": 1221 + }, + { + "epoch": 1.0861015442728585, + "grad_norm": 0.35441455245018005, + "learning_rate": 0.0001, + "loss": 0.8653, + "step": 1222 + }, + { + "epoch": 1.086990334407288, + "grad_norm": 0.37511923909187317, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 1223 + }, + { + "epoch": 1.0878791245417176, + "grad_norm": 0.35857459902763367, + "learning_rate": 0.0001, + "loss": 0.8866, + "step": 1224 + }, + { + "epoch": 1.088767914676147, + "grad_norm": 0.3715518116950989, + "learning_rate": 0.0001, + "loss": 0.9263, + "step": 1225 + }, + { + "epoch": 1.0896567048105765, + "grad_norm": 0.3738372325897217, + "learning_rate": 0.0001, + "loss": 0.9759, + "step": 1226 + }, + { + "epoch": 1.0905454949450062, + "grad_norm": 0.41106364130973816, + "learning_rate": 0.0001, + "loss": 0.9539, + "step": 1227 + }, + { + "epoch": 1.0914342850794356, + "grad_norm": 0.36827585101127625, + "learning_rate": 0.0001, + "loss": 0.949, + "step": 1228 + }, + { + "epoch": 1.092323075213865, + "grad_norm": 0.39710530638694763, + "learning_rate": 0.0001, + "loss": 0.9351, + "step": 1229 + }, + { + "epoch": 1.0932118653482947, + "grad_norm": 0.3501657247543335, + "learning_rate": 0.0001, + "loss": 0.9026, + "step": 1230 + }, + { + "epoch": 1.0941006554827242, + "grad_norm": 0.4335416853427887, + "learning_rate": 0.0001, + "loss": 0.9438, + "step": 1231 + }, + { + "epoch": 1.0949894456171536, + "grad_norm": 0.3835698962211609, + "learning_rate": 0.0001, + "loss": 0.8951, + "step": 1232 + }, + { + "epoch": 1.095878235751583, + "grad_norm": 0.3333055078983307, + "learning_rate": 0.0001, + "loss": 0.8505, + "step": 1233 + }, + { + "epoch": 1.0967670258860127, + "grad_norm": 0.4030056595802307, + "learning_rate": 0.0001, + "loss": 0.9226, + "step": 1234 + }, + { + "epoch": 1.0976558160204422, + "grad_norm": 0.4055632948875427, + "learning_rate": 0.0001, + "loss": 0.9892, + "step": 1235 + }, + { + "epoch": 1.0985446061548716, + "grad_norm": 0.3327998220920563, + "learning_rate": 0.0001, + "loss": 0.8774, + "step": 1236 + }, + { + "epoch": 1.0994333962893013, + "grad_norm": 0.4032350182533264, + "learning_rate": 0.0001, + "loss": 0.8252, + "step": 1237 + }, + { + "epoch": 1.1003221864237307, + "grad_norm": 0.40369337797164917, + "learning_rate": 0.0001, + "loss": 0.9785, + "step": 1238 + }, + { + "epoch": 1.1012109765581601, + "grad_norm": 0.339615136384964, + "learning_rate": 0.0001, + "loss": 0.9099, + "step": 1239 + }, + { + "epoch": 1.1020997666925898, + "grad_norm": 0.4322150945663452, + "learning_rate": 0.0001, + "loss": 0.9437, + "step": 1240 + }, + { + "epoch": 1.1029885568270192, + "grad_norm": 0.4031563103199005, + "learning_rate": 0.0001, + "loss": 0.9856, + "step": 1241 + }, + { + "epoch": 1.1038773469614487, + "grad_norm": 0.36764100193977356, + "learning_rate": 0.0001, + "loss": 0.9202, + "step": 1242 + }, + { + "epoch": 1.1047661370958783, + "grad_norm": 0.35683104395866394, + "learning_rate": 0.0001, + "loss": 0.8976, + "step": 1243 + }, + { + "epoch": 1.1056549272303078, + "grad_norm": 0.3336647152900696, + "learning_rate": 0.0001, + "loss": 0.875, + "step": 1244 + }, + { + "epoch": 1.1065437173647372, + "grad_norm": 0.36215344071388245, + "learning_rate": 0.0001, + "loss": 0.8985, + "step": 1245 + }, + { + "epoch": 1.1074325074991667, + "grad_norm": 0.3436198830604553, + "learning_rate": 0.0001, + "loss": 0.8672, + "step": 1246 + }, + { + "epoch": 1.1083212976335963, + "grad_norm": 0.3478955328464508, + "learning_rate": 0.0001, + "loss": 0.833, + "step": 1247 + }, + { + "epoch": 1.1092100877680258, + "grad_norm": 0.3481768071651459, + "learning_rate": 0.0001, + "loss": 0.891, + "step": 1248 + }, + { + "epoch": 1.1100988779024552, + "grad_norm": 0.37578824162483215, + "learning_rate": 0.0001, + "loss": 0.88, + "step": 1249 + }, + { + "epoch": 1.1109876680368849, + "grad_norm": 0.3523572087287903, + "learning_rate": 0.0001, + "loss": 0.8152, + "step": 1250 + }, + { + "epoch": 1.1118764581713143, + "grad_norm": 0.37987327575683594, + "learning_rate": 0.0001, + "loss": 0.9859, + "step": 1251 + }, + { + "epoch": 1.1127652483057437, + "grad_norm": 0.34919604659080505, + "learning_rate": 0.0001, + "loss": 0.948, + "step": 1252 + }, + { + "epoch": 1.1136540384401734, + "grad_norm": 0.401996910572052, + "learning_rate": 0.0001, + "loss": 0.9123, + "step": 1253 + }, + { + "epoch": 1.1145428285746029, + "grad_norm": 0.3942583501338959, + "learning_rate": 0.0001, + "loss": 0.8524, + "step": 1254 + }, + { + "epoch": 1.1154316187090323, + "grad_norm": 0.3758942484855652, + "learning_rate": 0.0001, + "loss": 0.8813, + "step": 1255 + }, + { + "epoch": 1.1163204088434617, + "grad_norm": 0.3327169716358185, + "learning_rate": 0.0001, + "loss": 0.8493, + "step": 1256 + }, + { + "epoch": 1.1172091989778914, + "grad_norm": 0.34999555349349976, + "learning_rate": 0.0001, + "loss": 0.8986, + "step": 1257 + }, + { + "epoch": 1.1180979891123208, + "grad_norm": 0.3059004843235016, + "learning_rate": 0.0001, + "loss": 0.8299, + "step": 1258 + }, + { + "epoch": 1.1189867792467503, + "grad_norm": 0.40918049216270447, + "learning_rate": 0.0001, + "loss": 1.0235, + "step": 1259 + }, + { + "epoch": 1.11987556938118, + "grad_norm": 0.45679351687431335, + "learning_rate": 0.0001, + "loss": 0.9245, + "step": 1260 + }, + { + "epoch": 1.1207643595156094, + "grad_norm": 0.36050447821617126, + "learning_rate": 0.0001, + "loss": 0.7914, + "step": 1261 + }, + { + "epoch": 1.1216531496500388, + "grad_norm": 0.3530547320842743, + "learning_rate": 0.0001, + "loss": 0.8892, + "step": 1262 + }, + { + "epoch": 1.1225419397844685, + "grad_norm": 0.39871805906295776, + "learning_rate": 0.0001, + "loss": 0.9088, + "step": 1263 + }, + { + "epoch": 1.123430729918898, + "grad_norm": 0.37267911434173584, + "learning_rate": 0.0001, + "loss": 0.9331, + "step": 1264 + }, + { + "epoch": 1.1243195200533274, + "grad_norm": 0.3619190752506256, + "learning_rate": 0.0001, + "loss": 0.8432, + "step": 1265 + }, + { + "epoch": 1.125208310187757, + "grad_norm": 0.3668549656867981, + "learning_rate": 0.0001, + "loss": 0.9503, + "step": 1266 + }, + { + "epoch": 1.1260971003221865, + "grad_norm": 0.34030482172966003, + "learning_rate": 0.0001, + "loss": 0.8852, + "step": 1267 + }, + { + "epoch": 1.126985890456616, + "grad_norm": 0.4020557403564453, + "learning_rate": 0.0001, + "loss": 0.9593, + "step": 1268 + }, + { + "epoch": 1.1278746805910456, + "grad_norm": 0.34024256467819214, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 1269 + }, + { + "epoch": 1.128763470725475, + "grad_norm": 0.4241231083869934, + "learning_rate": 0.0001, + "loss": 0.9793, + "step": 1270 + }, + { + "epoch": 1.1296522608599044, + "grad_norm": 0.4005233645439148, + "learning_rate": 0.0001, + "loss": 0.9619, + "step": 1271 + }, + { + "epoch": 1.1305410509943339, + "grad_norm": 0.38167819380760193, + "learning_rate": 0.0001, + "loss": 0.9449, + "step": 1272 + }, + { + "epoch": 1.1314298411287635, + "grad_norm": 0.34920740127563477, + "learning_rate": 0.0001, + "loss": 0.9811, + "step": 1273 + }, + { + "epoch": 1.132318631263193, + "grad_norm": 0.3310723900794983, + "learning_rate": 0.0001, + "loss": 0.8865, + "step": 1274 + }, + { + "epoch": 1.1332074213976224, + "grad_norm": 0.3370254933834076, + "learning_rate": 0.0001, + "loss": 0.8304, + "step": 1275 + }, + { + "epoch": 1.134096211532052, + "grad_norm": 0.35583510994911194, + "learning_rate": 0.0001, + "loss": 0.9203, + "step": 1276 + }, + { + "epoch": 1.1349850016664815, + "grad_norm": 0.34542956948280334, + "learning_rate": 0.0001, + "loss": 0.9403, + "step": 1277 + }, + { + "epoch": 1.135873791800911, + "grad_norm": 0.3754197657108307, + "learning_rate": 0.0001, + "loss": 0.9866, + "step": 1278 + }, + { + "epoch": 1.1367625819353404, + "grad_norm": 0.3555785119533539, + "learning_rate": 0.0001, + "loss": 0.8689, + "step": 1279 + }, + { + "epoch": 1.13765137206977, + "grad_norm": 0.33429041504859924, + "learning_rate": 0.0001, + "loss": 0.833, + "step": 1280 + }, + { + "epoch": 1.1385401622041995, + "grad_norm": 0.3541424572467804, + "learning_rate": 0.0001, + "loss": 0.8759, + "step": 1281 + }, + { + "epoch": 1.139428952338629, + "grad_norm": 0.36240896582603455, + "learning_rate": 0.0001, + "loss": 0.9841, + "step": 1282 + }, + { + "epoch": 1.1403177424730586, + "grad_norm": 0.3360092043876648, + "learning_rate": 0.0001, + "loss": 0.9414, + "step": 1283 + }, + { + "epoch": 1.141206532607488, + "grad_norm": 0.3737250864505768, + "learning_rate": 0.0001, + "loss": 0.9729, + "step": 1284 + }, + { + "epoch": 1.1420953227419175, + "grad_norm": 0.37672123312950134, + "learning_rate": 0.0001, + "loss": 0.8692, + "step": 1285 + }, + { + "epoch": 1.1429841128763472, + "grad_norm": 0.3979622721672058, + "learning_rate": 0.0001, + "loss": 0.9644, + "step": 1286 + }, + { + "epoch": 1.1438729030107766, + "grad_norm": 0.4209362268447876, + "learning_rate": 0.0001, + "loss": 0.9097, + "step": 1287 + }, + { + "epoch": 1.144761693145206, + "grad_norm": 0.3987758159637451, + "learning_rate": 0.0001, + "loss": 0.8919, + "step": 1288 + }, + { + "epoch": 1.1456504832796357, + "grad_norm": 0.3739655613899231, + "learning_rate": 0.0001, + "loss": 0.9623, + "step": 1289 + }, + { + "epoch": 1.1465392734140651, + "grad_norm": 0.3652169406414032, + "learning_rate": 0.0001, + "loss": 0.9033, + "step": 1290 + }, + { + "epoch": 1.1474280635484946, + "grad_norm": 0.3900451958179474, + "learning_rate": 0.0001, + "loss": 0.9795, + "step": 1291 + }, + { + "epoch": 1.1483168536829242, + "grad_norm": 0.36547282338142395, + "learning_rate": 0.0001, + "loss": 0.9595, + "step": 1292 + }, + { + "epoch": 1.1492056438173537, + "grad_norm": 0.3713114559650421, + "learning_rate": 0.0001, + "loss": 0.8854, + "step": 1293 + }, + { + "epoch": 1.1500944339517831, + "grad_norm": 0.3759624660015106, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 1294 + }, + { + "epoch": 1.1509832240862126, + "grad_norm": 0.3536144196987152, + "learning_rate": 0.0001, + "loss": 1.0054, + "step": 1295 + }, + { + "epoch": 1.1518720142206422, + "grad_norm": 0.36850425601005554, + "learning_rate": 0.0001, + "loss": 0.9134, + "step": 1296 + }, + { + "epoch": 1.1527608043550717, + "grad_norm": 0.3645191192626953, + "learning_rate": 0.0001, + "loss": 1.0067, + "step": 1297 + }, + { + "epoch": 1.153649594489501, + "grad_norm": 0.3773975670337677, + "learning_rate": 0.0001, + "loss": 0.9362, + "step": 1298 + }, + { + "epoch": 1.1545383846239308, + "grad_norm": 0.35518959164619446, + "learning_rate": 0.0001, + "loss": 0.9289, + "step": 1299 + }, + { + "epoch": 1.1554271747583602, + "grad_norm": 0.4167252779006958, + "learning_rate": 0.0001, + "loss": 0.8655, + "step": 1300 + }, + { + "epoch": 1.1563159648927896, + "grad_norm": 0.35073405504226685, + "learning_rate": 0.0001, + "loss": 0.8565, + "step": 1301 + }, + { + "epoch": 1.157204755027219, + "grad_norm": 0.3294246196746826, + "learning_rate": 0.0001, + "loss": 0.8501, + "step": 1302 + }, + { + "epoch": 1.1580935451616488, + "grad_norm": 0.3594052493572235, + "learning_rate": 0.0001, + "loss": 0.96, + "step": 1303 + }, + { + "epoch": 1.1589823352960782, + "grad_norm": 0.40062353014945984, + "learning_rate": 0.0001, + "loss": 0.9908, + "step": 1304 + }, + { + "epoch": 1.1598711254305076, + "grad_norm": 0.38618043065071106, + "learning_rate": 0.0001, + "loss": 0.9185, + "step": 1305 + }, + { + "epoch": 1.1607599155649373, + "grad_norm": 0.36038386821746826, + "learning_rate": 0.0001, + "loss": 0.9733, + "step": 1306 + }, + { + "epoch": 1.1616487056993667, + "grad_norm": 0.3996288776397705, + "learning_rate": 0.0001, + "loss": 0.9534, + "step": 1307 + }, + { + "epoch": 1.1625374958337962, + "grad_norm": 0.3757215440273285, + "learning_rate": 0.0001, + "loss": 0.9219, + "step": 1308 + }, + { + "epoch": 1.1634262859682258, + "grad_norm": 0.3682938814163208, + "learning_rate": 0.0001, + "loss": 0.9141, + "step": 1309 + }, + { + "epoch": 1.1643150761026553, + "grad_norm": 0.36964985728263855, + "learning_rate": 0.0001, + "loss": 0.9956, + "step": 1310 + }, + { + "epoch": 1.1652038662370847, + "grad_norm": 0.39142096042633057, + "learning_rate": 0.0001, + "loss": 0.8502, + "step": 1311 + }, + { + "epoch": 1.1660926563715144, + "grad_norm": 0.31965819001197815, + "learning_rate": 0.0001, + "loss": 0.8842, + "step": 1312 + }, + { + "epoch": 1.1669814465059438, + "grad_norm": 0.40179508924484253, + "learning_rate": 0.0001, + "loss": 0.9441, + "step": 1313 + }, + { + "epoch": 1.1678702366403733, + "grad_norm": 0.3406355679035187, + "learning_rate": 0.0001, + "loss": 0.9468, + "step": 1314 + }, + { + "epoch": 1.168759026774803, + "grad_norm": 0.3638407289981842, + "learning_rate": 0.0001, + "loss": 0.9296, + "step": 1315 + }, + { + "epoch": 1.1696478169092324, + "grad_norm": 0.34185290336608887, + "learning_rate": 0.0001, + "loss": 0.8647, + "step": 1316 + }, + { + "epoch": 1.1705366070436618, + "grad_norm": 0.3356599807739258, + "learning_rate": 0.0001, + "loss": 0.9484, + "step": 1317 + }, + { + "epoch": 1.1714253971780912, + "grad_norm": 0.3953014314174652, + "learning_rate": 0.0001, + "loss": 0.947, + "step": 1318 + }, + { + "epoch": 1.172314187312521, + "grad_norm": 0.4338318109512329, + "learning_rate": 0.0001, + "loss": 0.9389, + "step": 1319 + }, + { + "epoch": 1.1732029774469503, + "grad_norm": 0.38250255584716797, + "learning_rate": 0.0001, + "loss": 0.9113, + "step": 1320 + }, + { + "epoch": 1.1740917675813798, + "grad_norm": 1.4235469102859497, + "learning_rate": 0.0001, + "loss": 0.837, + "step": 1321 + }, + { + "epoch": 1.1749805577158094, + "grad_norm": 0.4314570724964142, + "learning_rate": 0.0001, + "loss": 0.955, + "step": 1322 + }, + { + "epoch": 1.1758693478502389, + "grad_norm": 0.3919623792171478, + "learning_rate": 0.0001, + "loss": 0.9662, + "step": 1323 + }, + { + "epoch": 1.1767581379846683, + "grad_norm": 0.3859492540359497, + "learning_rate": 0.0001, + "loss": 0.8974, + "step": 1324 + }, + { + "epoch": 1.1776469281190978, + "grad_norm": 0.39512211084365845, + "learning_rate": 0.0001, + "loss": 0.9592, + "step": 1325 + }, + { + "epoch": 1.1785357182535274, + "grad_norm": 0.34336525201797485, + "learning_rate": 0.0001, + "loss": 0.8636, + "step": 1326 + }, + { + "epoch": 1.1794245083879569, + "grad_norm": 0.39709898829460144, + "learning_rate": 0.0001, + "loss": 0.8655, + "step": 1327 + }, + { + "epoch": 1.1803132985223863, + "grad_norm": 0.4077267348766327, + "learning_rate": 0.0001, + "loss": 0.9687, + "step": 1328 + }, + { + "epoch": 1.181202088656816, + "grad_norm": 0.3338770866394043, + "learning_rate": 0.0001, + "loss": 0.9293, + "step": 1329 + }, + { + "epoch": 1.1820908787912454, + "grad_norm": 0.3750726580619812, + "learning_rate": 0.0001, + "loss": 0.922, + "step": 1330 + }, + { + "epoch": 1.1829796689256749, + "grad_norm": 0.35356971621513367, + "learning_rate": 0.0001, + "loss": 0.8574, + "step": 1331 + }, + { + "epoch": 1.1838684590601045, + "grad_norm": 0.41766199469566345, + "learning_rate": 0.0001, + "loss": 0.9395, + "step": 1332 + }, + { + "epoch": 1.184757249194534, + "grad_norm": 0.37516269087791443, + "learning_rate": 0.0001, + "loss": 0.9011, + "step": 1333 + }, + { + "epoch": 1.1856460393289634, + "grad_norm": 0.3945169746875763, + "learning_rate": 0.0001, + "loss": 0.9384, + "step": 1334 + }, + { + "epoch": 1.186534829463393, + "grad_norm": 0.33570146560668945, + "learning_rate": 0.0001, + "loss": 0.9534, + "step": 1335 + }, + { + "epoch": 1.1874236195978225, + "grad_norm": 0.35922086238861084, + "learning_rate": 0.0001, + "loss": 0.9344, + "step": 1336 + }, + { + "epoch": 1.188312409732252, + "grad_norm": 0.33384042978286743, + "learning_rate": 0.0001, + "loss": 0.9778, + "step": 1337 + }, + { + "epoch": 1.1892011998666816, + "grad_norm": 0.3461971879005432, + "learning_rate": 0.0001, + "loss": 0.9317, + "step": 1338 + }, + { + "epoch": 1.190089990001111, + "grad_norm": 0.366767019033432, + "learning_rate": 0.0001, + "loss": 0.923, + "step": 1339 + }, + { + "epoch": 1.1909787801355405, + "grad_norm": 0.35670191049575806, + "learning_rate": 0.0001, + "loss": 0.8958, + "step": 1340 + }, + { + "epoch": 1.19186757026997, + "grad_norm": 0.3564472496509552, + "learning_rate": 0.0001, + "loss": 0.8879, + "step": 1341 + }, + { + "epoch": 1.1927563604043996, + "grad_norm": 0.34850582480430603, + "learning_rate": 0.0001, + "loss": 0.9472, + "step": 1342 + }, + { + "epoch": 1.193645150538829, + "grad_norm": 0.3118721842765808, + "learning_rate": 0.0001, + "loss": 0.9182, + "step": 1343 + }, + { + "epoch": 1.1945339406732585, + "grad_norm": 0.4474131464958191, + "learning_rate": 0.0001, + "loss": 0.9986, + "step": 1344 + }, + { + "epoch": 1.1954227308076881, + "grad_norm": 0.4048672616481781, + "learning_rate": 0.0001, + "loss": 0.924, + "step": 1345 + }, + { + "epoch": 1.1963115209421176, + "grad_norm": 0.3493656814098358, + "learning_rate": 0.0001, + "loss": 0.8614, + "step": 1346 + }, + { + "epoch": 1.197200311076547, + "grad_norm": 0.32247281074523926, + "learning_rate": 0.0001, + "loss": 0.8659, + "step": 1347 + }, + { + "epoch": 1.1980891012109764, + "grad_norm": 0.4119790196418762, + "learning_rate": 0.0001, + "loss": 0.9521, + "step": 1348 + }, + { + "epoch": 1.198977891345406, + "grad_norm": 0.40390363335609436, + "learning_rate": 0.0001, + "loss": 0.9203, + "step": 1349 + }, + { + "epoch": 1.1998666814798356, + "grad_norm": 0.40085315704345703, + "learning_rate": 0.0001, + "loss": 1.0135, + "step": 1350 + }, + { + "epoch": 1.200755471614265, + "grad_norm": 0.3355647325515747, + "learning_rate": 0.0001, + "loss": 0.9248, + "step": 1351 + }, + { + "epoch": 1.2016442617486947, + "grad_norm": 0.3319724500179291, + "learning_rate": 0.0001, + "loss": 0.8671, + "step": 1352 + }, + { + "epoch": 1.202533051883124, + "grad_norm": 0.4024242162704468, + "learning_rate": 0.0001, + "loss": 0.9497, + "step": 1353 + }, + { + "epoch": 1.2034218420175535, + "grad_norm": 0.3751072287559509, + "learning_rate": 0.0001, + "loss": 0.9692, + "step": 1354 + }, + { + "epoch": 1.2043106321519832, + "grad_norm": 0.3578762114048004, + "learning_rate": 0.0001, + "loss": 0.9306, + "step": 1355 + }, + { + "epoch": 1.2051994222864126, + "grad_norm": 0.41604581475257874, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 1356 + }, + { + "epoch": 1.206088212420842, + "grad_norm": 0.38614901900291443, + "learning_rate": 0.0001, + "loss": 0.9475, + "step": 1357 + }, + { + "epoch": 1.2069770025552717, + "grad_norm": 0.3261745274066925, + "learning_rate": 0.0001, + "loss": 0.9745, + "step": 1358 + }, + { + "epoch": 1.2078657926897012, + "grad_norm": 0.34567147493362427, + "learning_rate": 0.0001, + "loss": 0.9237, + "step": 1359 + }, + { + "epoch": 1.2087545828241306, + "grad_norm": 0.3168199360370636, + "learning_rate": 0.0001, + "loss": 0.9789, + "step": 1360 + }, + { + "epoch": 1.2096433729585603, + "grad_norm": 0.32394152879714966, + "learning_rate": 0.0001, + "loss": 0.9087, + "step": 1361 + }, + { + "epoch": 1.2105321630929897, + "grad_norm": 0.41741707921028137, + "learning_rate": 0.0001, + "loss": 0.9361, + "step": 1362 + }, + { + "epoch": 1.2114209532274192, + "grad_norm": 0.37223905324935913, + "learning_rate": 0.0001, + "loss": 0.9193, + "step": 1363 + }, + { + "epoch": 1.2123097433618486, + "grad_norm": 0.3394053876399994, + "learning_rate": 0.0001, + "loss": 0.9014, + "step": 1364 + }, + { + "epoch": 1.2131985334962783, + "grad_norm": 0.43821749091148376, + "learning_rate": 0.0001, + "loss": 0.9964, + "step": 1365 + }, + { + "epoch": 1.2140873236307077, + "grad_norm": 0.3783372938632965, + "learning_rate": 0.0001, + "loss": 0.9388, + "step": 1366 + }, + { + "epoch": 1.2149761137651371, + "grad_norm": 0.3495936691761017, + "learning_rate": 0.0001, + "loss": 0.8109, + "step": 1367 + }, + { + "epoch": 1.2158649038995668, + "grad_norm": 0.3929634690284729, + "learning_rate": 0.0001, + "loss": 0.9825, + "step": 1368 + }, + { + "epoch": 1.2167536940339962, + "grad_norm": 0.3742099702358246, + "learning_rate": 0.0001, + "loss": 0.9098, + "step": 1369 + }, + { + "epoch": 1.2176424841684257, + "grad_norm": 0.36973753571510315, + "learning_rate": 0.0001, + "loss": 0.9053, + "step": 1370 + }, + { + "epoch": 1.2185312743028551, + "grad_norm": 0.362594872713089, + "learning_rate": 0.0001, + "loss": 0.9906, + "step": 1371 + }, + { + "epoch": 1.2194200644372848, + "grad_norm": 0.442941278219223, + "learning_rate": 0.0001, + "loss": 1.0028, + "step": 1372 + }, + { + "epoch": 1.2203088545717142, + "grad_norm": 0.41126692295074463, + "learning_rate": 0.0001, + "loss": 0.9192, + "step": 1373 + }, + { + "epoch": 1.2211976447061437, + "grad_norm": 0.3734437823295593, + "learning_rate": 0.0001, + "loss": 0.919, + "step": 1374 + }, + { + "epoch": 1.2220864348405733, + "grad_norm": 0.37153056263923645, + "learning_rate": 0.0001, + "loss": 0.9253, + "step": 1375 + }, + { + "epoch": 1.2229752249750028, + "grad_norm": 0.4021115303039551, + "learning_rate": 0.0001, + "loss": 0.9013, + "step": 1376 + }, + { + "epoch": 1.2238640151094322, + "grad_norm": 0.3954538404941559, + "learning_rate": 0.0001, + "loss": 0.9907, + "step": 1377 + }, + { + "epoch": 1.2247528052438619, + "grad_norm": 0.707315981388092, + "learning_rate": 0.0001, + "loss": 0.9124, + "step": 1378 + }, + { + "epoch": 1.2256415953782913, + "grad_norm": 0.2907516658306122, + "learning_rate": 0.0001, + "loss": 0.8792, + "step": 1379 + }, + { + "epoch": 1.2265303855127208, + "grad_norm": 0.32730191946029663, + "learning_rate": 0.0001, + "loss": 0.8724, + "step": 1380 + }, + { + "epoch": 1.2274191756471504, + "grad_norm": 0.3393136262893677, + "learning_rate": 0.0001, + "loss": 0.9822, + "step": 1381 + }, + { + "epoch": 1.2283079657815799, + "grad_norm": 0.3186226189136505, + "learning_rate": 0.0001, + "loss": 0.9026, + "step": 1382 + }, + { + "epoch": 1.2291967559160093, + "grad_norm": 0.3546365797519684, + "learning_rate": 0.0001, + "loss": 0.9419, + "step": 1383 + }, + { + "epoch": 1.230085546050439, + "grad_norm": 0.33756113052368164, + "learning_rate": 0.0001, + "loss": 0.9292, + "step": 1384 + }, + { + "epoch": 1.2309743361848684, + "grad_norm": 0.357030987739563, + "learning_rate": 0.0001, + "loss": 0.8748, + "step": 1385 + }, + { + "epoch": 1.2318631263192978, + "grad_norm": 0.3271646797657013, + "learning_rate": 0.0001, + "loss": 0.9411, + "step": 1386 + }, + { + "epoch": 1.2327519164537273, + "grad_norm": 0.3888196647167206, + "learning_rate": 0.0001, + "loss": 0.9634, + "step": 1387 + }, + { + "epoch": 1.233640706588157, + "grad_norm": 0.3897395730018616, + "learning_rate": 0.0001, + "loss": 0.9404, + "step": 1388 + }, + { + "epoch": 1.2345294967225864, + "grad_norm": 0.36448606848716736, + "learning_rate": 0.0001, + "loss": 0.9543, + "step": 1389 + }, + { + "epoch": 1.2354182868570158, + "grad_norm": 0.35686194896698, + "learning_rate": 0.0001, + "loss": 0.8772, + "step": 1390 + }, + { + "epoch": 1.2363070769914455, + "grad_norm": 0.3667612373828888, + "learning_rate": 0.0001, + "loss": 0.8449, + "step": 1391 + }, + { + "epoch": 1.237195867125875, + "grad_norm": 0.35356754064559937, + "learning_rate": 0.0001, + "loss": 0.9099, + "step": 1392 + }, + { + "epoch": 1.2380846572603044, + "grad_norm": 0.3918624818325043, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 1393 + }, + { + "epoch": 1.2389734473947338, + "grad_norm": 0.347768634557724, + "learning_rate": 0.0001, + "loss": 0.9423, + "step": 1394 + }, + { + "epoch": 1.2398622375291635, + "grad_norm": 0.3811168372631073, + "learning_rate": 0.0001, + "loss": 1.0301, + "step": 1395 + }, + { + "epoch": 1.240751027663593, + "grad_norm": 0.3333447575569153, + "learning_rate": 0.0001, + "loss": 0.9424, + "step": 1396 + }, + { + "epoch": 1.2416398177980223, + "grad_norm": 0.3668173551559448, + "learning_rate": 0.0001, + "loss": 0.9513, + "step": 1397 + }, + { + "epoch": 1.242528607932452, + "grad_norm": 0.4245815575122833, + "learning_rate": 0.0001, + "loss": 0.8381, + "step": 1398 + }, + { + "epoch": 1.2434173980668815, + "grad_norm": 0.38784492015838623, + "learning_rate": 0.0001, + "loss": 0.9348, + "step": 1399 + }, + { + "epoch": 1.244306188201311, + "grad_norm": 0.3626404404640198, + "learning_rate": 0.0001, + "loss": 0.891, + "step": 1400 + }, + { + "epoch": 1.2451949783357406, + "grad_norm": 0.3952024579048157, + "learning_rate": 0.0001, + "loss": 0.8988, + "step": 1401 + }, + { + "epoch": 1.24608376847017, + "grad_norm": 0.3714921474456787, + "learning_rate": 0.0001, + "loss": 0.9597, + "step": 1402 + }, + { + "epoch": 1.2469725586045994, + "grad_norm": 0.3674130141735077, + "learning_rate": 0.0001, + "loss": 0.8814, + "step": 1403 + }, + { + "epoch": 1.247861348739029, + "grad_norm": 0.332264244556427, + "learning_rate": 0.0001, + "loss": 0.8723, + "step": 1404 + }, + { + "epoch": 1.2487501388734585, + "grad_norm": 0.44990214705467224, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 1405 + }, + { + "epoch": 1.249638929007888, + "grad_norm": 0.3691662847995758, + "learning_rate": 0.0001, + "loss": 0.9808, + "step": 1406 + }, + { + "epoch": 1.2505277191423176, + "grad_norm": 0.4463985562324524, + "learning_rate": 0.0001, + "loss": 1.0113, + "step": 1407 + }, + { + "epoch": 1.251416509276747, + "grad_norm": 0.40863966941833496, + "learning_rate": 0.0001, + "loss": 0.8999, + "step": 1408 + }, + { + "epoch": 1.2523052994111765, + "grad_norm": 0.39324069023132324, + "learning_rate": 0.0001, + "loss": 0.9607, + "step": 1409 + }, + { + "epoch": 1.2531940895456062, + "grad_norm": 0.4144541919231415, + "learning_rate": 0.0001, + "loss": 0.9257, + "step": 1410 + }, + { + "epoch": 1.2540828796800356, + "grad_norm": 0.3502121567726135, + "learning_rate": 0.0001, + "loss": 0.9035, + "step": 1411 + }, + { + "epoch": 1.254971669814465, + "grad_norm": 0.3595639765262604, + "learning_rate": 0.0001, + "loss": 0.958, + "step": 1412 + }, + { + "epoch": 1.2558604599488945, + "grad_norm": 0.31337282061576843, + "learning_rate": 0.0001, + "loss": 0.8338, + "step": 1413 + }, + { + "epoch": 1.256749250083324, + "grad_norm": 0.38238781690597534, + "learning_rate": 0.0001, + "loss": 0.9705, + "step": 1414 + }, + { + "epoch": 1.2576380402177536, + "grad_norm": 0.34189799427986145, + "learning_rate": 0.0001, + "loss": 0.9685, + "step": 1415 + }, + { + "epoch": 1.258526830352183, + "grad_norm": 0.3783426582813263, + "learning_rate": 0.0001, + "loss": 0.8579, + "step": 1416 + }, + { + "epoch": 1.2594156204866125, + "grad_norm": 0.35170289874076843, + "learning_rate": 0.0001, + "loss": 1.0055, + "step": 1417 + }, + { + "epoch": 1.2603044106210421, + "grad_norm": 0.3678469657897949, + "learning_rate": 0.0001, + "loss": 0.9932, + "step": 1418 + }, + { + "epoch": 1.2611932007554716, + "grad_norm": 0.35460302233695984, + "learning_rate": 0.0001, + "loss": 0.8314, + "step": 1419 + }, + { + "epoch": 1.262081990889901, + "grad_norm": 0.3786843419075012, + "learning_rate": 0.0001, + "loss": 0.9391, + "step": 1420 + }, + { + "epoch": 1.2629707810243307, + "grad_norm": 0.37704455852508545, + "learning_rate": 0.0001, + "loss": 0.9314, + "step": 1421 + }, + { + "epoch": 1.2638595711587601, + "grad_norm": 0.3811683654785156, + "learning_rate": 0.0001, + "loss": 0.9184, + "step": 1422 + }, + { + "epoch": 1.2647483612931896, + "grad_norm": 0.3323057293891907, + "learning_rate": 0.0001, + "loss": 0.8478, + "step": 1423 + }, + { + "epoch": 1.2656371514276192, + "grad_norm": 0.33943992853164673, + "learning_rate": 0.0001, + "loss": 0.7872, + "step": 1424 + }, + { + "epoch": 1.2665259415620487, + "grad_norm": 0.370331346988678, + "learning_rate": 0.0001, + "loss": 1.0004, + "step": 1425 + }, + { + "epoch": 1.2674147316964781, + "grad_norm": 0.3509312868118286, + "learning_rate": 0.0001, + "loss": 0.8777, + "step": 1426 + }, + { + "epoch": 1.2683035218309078, + "grad_norm": 0.35262539982795715, + "learning_rate": 0.0001, + "loss": 0.8772, + "step": 1427 + }, + { + "epoch": 1.2691923119653372, + "grad_norm": 0.3514516055583954, + "learning_rate": 0.0001, + "loss": 0.9092, + "step": 1428 + }, + { + "epoch": 1.2700811020997667, + "grad_norm": 0.33917155861854553, + "learning_rate": 0.0001, + "loss": 0.9088, + "step": 1429 + }, + { + "epoch": 1.2709698922341963, + "grad_norm": 0.37379592657089233, + "learning_rate": 0.0001, + "loss": 0.8811, + "step": 1430 + }, + { + "epoch": 1.2718586823686258, + "grad_norm": 0.37309855222702026, + "learning_rate": 0.0001, + "loss": 0.9243, + "step": 1431 + }, + { + "epoch": 1.2727474725030552, + "grad_norm": 0.3677520751953125, + "learning_rate": 0.0001, + "loss": 0.9191, + "step": 1432 + }, + { + "epoch": 1.2736362626374849, + "grad_norm": 0.3768025040626526, + "learning_rate": 0.0001, + "loss": 0.9713, + "step": 1433 + }, + { + "epoch": 1.2745250527719143, + "grad_norm": 0.3794255256652832, + "learning_rate": 0.0001, + "loss": 0.8944, + "step": 1434 + }, + { + "epoch": 1.2754138429063437, + "grad_norm": 0.3847828805446625, + "learning_rate": 0.0001, + "loss": 0.9895, + "step": 1435 + }, + { + "epoch": 1.2763026330407732, + "grad_norm": 0.3901936411857605, + "learning_rate": 0.0001, + "loss": 0.9604, + "step": 1436 + }, + { + "epoch": 1.2771914231752026, + "grad_norm": 0.36127346754074097, + "learning_rate": 0.0001, + "loss": 0.9633, + "step": 1437 + }, + { + "epoch": 1.2780802133096323, + "grad_norm": 0.3554041385650635, + "learning_rate": 0.0001, + "loss": 0.9282, + "step": 1438 + }, + { + "epoch": 1.2789690034440617, + "grad_norm": 0.3366556465625763, + "learning_rate": 0.0001, + "loss": 0.9398, + "step": 1439 + }, + { + "epoch": 1.2798577935784912, + "grad_norm": 0.38084515929222107, + "learning_rate": 0.0001, + "loss": 1.0101, + "step": 1440 + }, + { + "epoch": 1.2807465837129208, + "grad_norm": 0.35212430357933044, + "learning_rate": 0.0001, + "loss": 0.921, + "step": 1441 + }, + { + "epoch": 1.2816353738473503, + "grad_norm": 0.37851765751838684, + "learning_rate": 0.0001, + "loss": 0.8989, + "step": 1442 + }, + { + "epoch": 1.2825241639817797, + "grad_norm": 0.31411126255989075, + "learning_rate": 0.0001, + "loss": 0.8476, + "step": 1443 + }, + { + "epoch": 1.2834129541162094, + "grad_norm": 0.35545769333839417, + "learning_rate": 0.0001, + "loss": 0.9857, + "step": 1444 + }, + { + "epoch": 1.2843017442506388, + "grad_norm": 0.36752983927726746, + "learning_rate": 0.0001, + "loss": 0.8876, + "step": 1445 + }, + { + "epoch": 1.2851905343850683, + "grad_norm": 0.3653068542480469, + "learning_rate": 0.0001, + "loss": 0.8651, + "step": 1446 + }, + { + "epoch": 1.286079324519498, + "grad_norm": 0.3388923704624176, + "learning_rate": 0.0001, + "loss": 0.8922, + "step": 1447 + }, + { + "epoch": 1.2869681146539274, + "grad_norm": 0.3574695587158203, + "learning_rate": 0.0001, + "loss": 0.903, + "step": 1448 + }, + { + "epoch": 1.2878569047883568, + "grad_norm": 0.36422044038772583, + "learning_rate": 0.0001, + "loss": 0.9122, + "step": 1449 + }, + { + "epoch": 1.2887456949227865, + "grad_norm": 0.3757425844669342, + "learning_rate": 0.0001, + "loss": 0.9014, + "step": 1450 + }, + { + "epoch": 1.289634485057216, + "grad_norm": 0.3608701229095459, + "learning_rate": 0.0001, + "loss": 0.9089, + "step": 1451 + }, + { + "epoch": 1.2905232751916453, + "grad_norm": 0.3886134922504425, + "learning_rate": 0.0001, + "loss": 0.817, + "step": 1452 + }, + { + "epoch": 1.291412065326075, + "grad_norm": 0.33738628029823303, + "learning_rate": 0.0001, + "loss": 0.8872, + "step": 1453 + }, + { + "epoch": 1.2923008554605044, + "grad_norm": 0.3938889801502228, + "learning_rate": 0.0001, + "loss": 0.8523, + "step": 1454 + }, + { + "epoch": 1.2931896455949339, + "grad_norm": 0.37151628732681274, + "learning_rate": 0.0001, + "loss": 0.9448, + "step": 1455 + }, + { + "epoch": 1.2940784357293635, + "grad_norm": 0.36321815848350525, + "learning_rate": 0.0001, + "loss": 0.9511, + "step": 1456 + }, + { + "epoch": 1.294967225863793, + "grad_norm": 0.35245969891548157, + "learning_rate": 0.0001, + "loss": 1.0141, + "step": 1457 + }, + { + "epoch": 1.2958560159982224, + "grad_norm": 0.3494570851325989, + "learning_rate": 0.0001, + "loss": 0.9411, + "step": 1458 + }, + { + "epoch": 1.2967448061326519, + "grad_norm": 0.3875609338283539, + "learning_rate": 0.0001, + "loss": 0.9318, + "step": 1459 + }, + { + "epoch": 1.2976335962670813, + "grad_norm": 0.3761698305606842, + "learning_rate": 0.0001, + "loss": 0.8563, + "step": 1460 + }, + { + "epoch": 1.298522386401511, + "grad_norm": 0.3185148239135742, + "learning_rate": 0.0001, + "loss": 0.9486, + "step": 1461 + }, + { + "epoch": 1.2994111765359404, + "grad_norm": 0.3480708599090576, + "learning_rate": 0.0001, + "loss": 0.8733, + "step": 1462 + }, + { + "epoch": 1.3002999666703698, + "grad_norm": 0.3743402659893036, + "learning_rate": 0.0001, + "loss": 0.9778, + "step": 1463 + }, + { + "epoch": 1.3011887568047995, + "grad_norm": 0.4024488031864166, + "learning_rate": 0.0001, + "loss": 0.8907, + "step": 1464 + }, + { + "epoch": 1.302077546939229, + "grad_norm": 0.366472452878952, + "learning_rate": 0.0001, + "loss": 0.9898, + "step": 1465 + }, + { + "epoch": 1.3029663370736584, + "grad_norm": 0.36721619963645935, + "learning_rate": 0.0001, + "loss": 0.8577, + "step": 1466 + }, + { + "epoch": 1.303855127208088, + "grad_norm": 0.36933907866477966, + "learning_rate": 0.0001, + "loss": 0.9365, + "step": 1467 + }, + { + "epoch": 1.3047439173425175, + "grad_norm": 0.36210325360298157, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 1468 + }, + { + "epoch": 1.305632707476947, + "grad_norm": 0.3773205876350403, + "learning_rate": 0.0001, + "loss": 0.9396, + "step": 1469 + }, + { + "epoch": 1.3065214976113766, + "grad_norm": 0.3904916048049927, + "learning_rate": 0.0001, + "loss": 0.9872, + "step": 1470 + }, + { + "epoch": 1.307410287745806, + "grad_norm": 0.38779687881469727, + "learning_rate": 0.0001, + "loss": 0.9682, + "step": 1471 + }, + { + "epoch": 1.3082990778802355, + "grad_norm": 0.3612334728240967, + "learning_rate": 0.0001, + "loss": 0.8867, + "step": 1472 + }, + { + "epoch": 1.3091878680146651, + "grad_norm": 0.4181813597679138, + "learning_rate": 0.0001, + "loss": 0.9666, + "step": 1473 + }, + { + "epoch": 1.3100766581490946, + "grad_norm": 0.4051264524459839, + "learning_rate": 0.0001, + "loss": 0.9088, + "step": 1474 + }, + { + "epoch": 1.310965448283524, + "grad_norm": 0.3271094262599945, + "learning_rate": 0.0001, + "loss": 0.9018, + "step": 1475 + }, + { + "epoch": 1.3118542384179537, + "grad_norm": 0.35540738701820374, + "learning_rate": 0.0001, + "loss": 1.0157, + "step": 1476 + }, + { + "epoch": 1.3127430285523831, + "grad_norm": 0.36050257086753845, + "learning_rate": 0.0001, + "loss": 0.9347, + "step": 1477 + }, + { + "epoch": 1.3136318186868126, + "grad_norm": 0.39505940675735474, + "learning_rate": 0.0001, + "loss": 0.9321, + "step": 1478 + }, + { + "epoch": 1.3145206088212422, + "grad_norm": 0.3984764516353607, + "learning_rate": 0.0001, + "loss": 0.9472, + "step": 1479 + }, + { + "epoch": 1.3154093989556717, + "grad_norm": 0.3784257769584656, + "learning_rate": 0.0001, + "loss": 0.9075, + "step": 1480 + }, + { + "epoch": 1.316298189090101, + "grad_norm": 0.35195082426071167, + "learning_rate": 0.0001, + "loss": 0.9514, + "step": 1481 + }, + { + "epoch": 1.3171869792245305, + "grad_norm": 0.33862170577049255, + "learning_rate": 0.0001, + "loss": 0.9411, + "step": 1482 + }, + { + "epoch": 1.31807576935896, + "grad_norm": 0.3316287696361542, + "learning_rate": 0.0001, + "loss": 0.9368, + "step": 1483 + }, + { + "epoch": 1.3189645594933896, + "grad_norm": 0.3842187225818634, + "learning_rate": 0.0001, + "loss": 0.9396, + "step": 1484 + }, + { + "epoch": 1.319853349627819, + "grad_norm": 0.39101001620292664, + "learning_rate": 0.0001, + "loss": 0.9329, + "step": 1485 + }, + { + "epoch": 1.3207421397622485, + "grad_norm": 0.3348684012889862, + "learning_rate": 0.0001, + "loss": 0.8891, + "step": 1486 + }, + { + "epoch": 1.3216309298966782, + "grad_norm": 0.35855552554130554, + "learning_rate": 0.0001, + "loss": 0.9148, + "step": 1487 + }, + { + "epoch": 1.3225197200311076, + "grad_norm": 0.39019593596458435, + "learning_rate": 0.0001, + "loss": 0.9043, + "step": 1488 + }, + { + "epoch": 1.323408510165537, + "grad_norm": 0.3681492805480957, + "learning_rate": 0.0001, + "loss": 0.9222, + "step": 1489 + }, + { + "epoch": 1.3242973002999667, + "grad_norm": 0.3557593524456024, + "learning_rate": 0.0001, + "loss": 0.8972, + "step": 1490 + }, + { + "epoch": 1.3251860904343962, + "grad_norm": 0.3503437638282776, + "learning_rate": 0.0001, + "loss": 0.9307, + "step": 1491 + }, + { + "epoch": 1.3260748805688256, + "grad_norm": 0.31487593054771423, + "learning_rate": 0.0001, + "loss": 0.899, + "step": 1492 + }, + { + "epoch": 1.3269636707032553, + "grad_norm": 0.3774222433567047, + "learning_rate": 0.0001, + "loss": 0.8246, + "step": 1493 + }, + { + "epoch": 1.3278524608376847, + "grad_norm": 0.3760510981082916, + "learning_rate": 0.0001, + "loss": 1.0001, + "step": 1494 + }, + { + "epoch": 1.3287412509721142, + "grad_norm": 0.3963356912136078, + "learning_rate": 0.0001, + "loss": 0.9093, + "step": 1495 + }, + { + "epoch": 1.3296300411065438, + "grad_norm": 0.3808361887931824, + "learning_rate": 0.0001, + "loss": 0.9258, + "step": 1496 + }, + { + "epoch": 1.3305188312409733, + "grad_norm": 0.39797237515449524, + "learning_rate": 0.0001, + "loss": 0.9168, + "step": 1497 + }, + { + "epoch": 1.3314076213754027, + "grad_norm": 0.36626359820365906, + "learning_rate": 0.0001, + "loss": 0.9289, + "step": 1498 + }, + { + "epoch": 1.3322964115098324, + "grad_norm": 0.38699278235435486, + "learning_rate": 0.0001, + "loss": 0.8827, + "step": 1499 + }, + { + "epoch": 1.3331852016442618, + "grad_norm": 0.37499698996543884, + "learning_rate": 0.0001, + "loss": 0.9254, + "step": 1500 + }, + { + "epoch": 1.3340739917786912, + "grad_norm": 0.34208056330680847, + "learning_rate": 0.0001, + "loss": 0.8792, + "step": 1501 + }, + { + "epoch": 1.334962781913121, + "grad_norm": 0.3909793496131897, + "learning_rate": 0.0001, + "loss": 0.9986, + "step": 1502 + }, + { + "epoch": 1.3358515720475503, + "grad_norm": 0.35674503445625305, + "learning_rate": 0.0001, + "loss": 0.8902, + "step": 1503 + }, + { + "epoch": 1.3367403621819798, + "grad_norm": 0.35735827684402466, + "learning_rate": 0.0001, + "loss": 0.9898, + "step": 1504 + }, + { + "epoch": 1.3376291523164092, + "grad_norm": 0.39872074127197266, + "learning_rate": 0.0001, + "loss": 1.048, + "step": 1505 + }, + { + "epoch": 1.3385179424508387, + "grad_norm": 0.4395754039287567, + "learning_rate": 0.0001, + "loss": 0.8621, + "step": 1506 + }, + { + "epoch": 1.3394067325852683, + "grad_norm": 0.3650192320346832, + "learning_rate": 0.0001, + "loss": 0.8788, + "step": 1507 + }, + { + "epoch": 1.3402955227196978, + "grad_norm": 0.35158872604370117, + "learning_rate": 0.0001, + "loss": 0.8228, + "step": 1508 + }, + { + "epoch": 1.3411843128541272, + "grad_norm": 0.3553421199321747, + "learning_rate": 0.0001, + "loss": 0.9102, + "step": 1509 + }, + { + "epoch": 1.3420731029885569, + "grad_norm": 0.32123276591300964, + "learning_rate": 0.0001, + "loss": 0.8808, + "step": 1510 + }, + { + "epoch": 1.3429618931229863, + "grad_norm": 0.35363566875457764, + "learning_rate": 0.0001, + "loss": 0.8895, + "step": 1511 + }, + { + "epoch": 1.3438506832574157, + "grad_norm": 0.38487884402275085, + "learning_rate": 0.0001, + "loss": 0.8713, + "step": 1512 + }, + { + "epoch": 1.3447394733918454, + "grad_norm": 0.3799397051334381, + "learning_rate": 0.0001, + "loss": 0.8982, + "step": 1513 + }, + { + "epoch": 1.3456282635262748, + "grad_norm": 0.3352080285549164, + "learning_rate": 0.0001, + "loss": 0.914, + "step": 1514 + }, + { + "epoch": 1.3465170536607043, + "grad_norm": 0.6567979454994202, + "learning_rate": 0.0001, + "loss": 0.978, + "step": 1515 + }, + { + "epoch": 1.347405843795134, + "grad_norm": 0.4132377505302429, + "learning_rate": 0.0001, + "loss": 1.0033, + "step": 1516 + }, + { + "epoch": 1.3482946339295634, + "grad_norm": 0.4519813060760498, + "learning_rate": 0.0001, + "loss": 0.9213, + "step": 1517 + }, + { + "epoch": 1.3491834240639928, + "grad_norm": 0.35851332545280457, + "learning_rate": 0.0001, + "loss": 0.9634, + "step": 1518 + }, + { + "epoch": 1.3500722141984225, + "grad_norm": 0.39293810725212097, + "learning_rate": 0.0001, + "loss": 0.9069, + "step": 1519 + }, + { + "epoch": 1.350961004332852, + "grad_norm": 0.36240848898887634, + "learning_rate": 0.0001, + "loss": 0.8621, + "step": 1520 + }, + { + "epoch": 1.3518497944672814, + "grad_norm": 0.3404124975204468, + "learning_rate": 0.0001, + "loss": 0.8964, + "step": 1521 + }, + { + "epoch": 1.352738584601711, + "grad_norm": 0.31596043705940247, + "learning_rate": 0.0001, + "loss": 0.9091, + "step": 1522 + }, + { + "epoch": 1.3536273747361405, + "grad_norm": 0.3596150279045105, + "learning_rate": 0.0001, + "loss": 0.9275, + "step": 1523 + }, + { + "epoch": 1.35451616487057, + "grad_norm": 0.35948145389556885, + "learning_rate": 0.0001, + "loss": 0.8821, + "step": 1524 + }, + { + "epoch": 1.3554049550049996, + "grad_norm": 0.3627282381057739, + "learning_rate": 0.0001, + "loss": 0.9501, + "step": 1525 + }, + { + "epoch": 1.356293745139429, + "grad_norm": 0.38337674736976624, + "learning_rate": 0.0001, + "loss": 0.8677, + "step": 1526 + }, + { + "epoch": 1.3571825352738585, + "grad_norm": 0.38964757323265076, + "learning_rate": 0.0001, + "loss": 0.9327, + "step": 1527 + }, + { + "epoch": 1.358071325408288, + "grad_norm": 0.3563820421695709, + "learning_rate": 0.0001, + "loss": 0.836, + "step": 1528 + }, + { + "epoch": 1.3589601155427173, + "grad_norm": 0.3765336573123932, + "learning_rate": 0.0001, + "loss": 0.9198, + "step": 1529 + }, + { + "epoch": 1.359848905677147, + "grad_norm": 0.3981674611568451, + "learning_rate": 0.0001, + "loss": 0.9539, + "step": 1530 + }, + { + "epoch": 1.3607376958115764, + "grad_norm": 0.4140075147151947, + "learning_rate": 0.0001, + "loss": 0.9104, + "step": 1531 + }, + { + "epoch": 1.3616264859460059, + "grad_norm": 0.4137401282787323, + "learning_rate": 0.0001, + "loss": 1.0583, + "step": 1532 + }, + { + "epoch": 1.3625152760804355, + "grad_norm": 0.3649982511997223, + "learning_rate": 0.0001, + "loss": 0.8334, + "step": 1533 + }, + { + "epoch": 1.363404066214865, + "grad_norm": 0.3175910711288452, + "learning_rate": 0.0001, + "loss": 0.8834, + "step": 1534 + }, + { + "epoch": 1.3642928563492944, + "grad_norm": 0.3742975890636444, + "learning_rate": 0.0001, + "loss": 0.9664, + "step": 1535 + }, + { + "epoch": 1.365181646483724, + "grad_norm": 0.369791179895401, + "learning_rate": 0.0001, + "loss": 0.8947, + "step": 1536 + }, + { + "epoch": 1.3660704366181535, + "grad_norm": 0.33241912722587585, + "learning_rate": 0.0001, + "loss": 0.9392, + "step": 1537 + }, + { + "epoch": 1.366959226752583, + "grad_norm": 0.345084011554718, + "learning_rate": 0.0001, + "loss": 0.8655, + "step": 1538 + }, + { + "epoch": 1.3678480168870126, + "grad_norm": 0.33112481236457825, + "learning_rate": 0.0001, + "loss": 0.9325, + "step": 1539 + }, + { + "epoch": 1.368736807021442, + "grad_norm": 0.3624488413333893, + "learning_rate": 0.0001, + "loss": 0.9253, + "step": 1540 + }, + { + "epoch": 1.3696255971558715, + "grad_norm": 0.3580976724624634, + "learning_rate": 0.0001, + "loss": 0.8796, + "step": 1541 + }, + { + "epoch": 1.3705143872903012, + "grad_norm": 0.30134427547454834, + "learning_rate": 0.0001, + "loss": 0.8049, + "step": 1542 + }, + { + "epoch": 1.3714031774247306, + "grad_norm": 0.3331443965435028, + "learning_rate": 0.0001, + "loss": 0.9014, + "step": 1543 + }, + { + "epoch": 1.37229196755916, + "grad_norm": 0.33350786566734314, + "learning_rate": 0.0001, + "loss": 0.9826, + "step": 1544 + }, + { + "epoch": 1.3731807576935897, + "grad_norm": 0.3651019036769867, + "learning_rate": 0.0001, + "loss": 0.9215, + "step": 1545 + }, + { + "epoch": 1.3740695478280192, + "grad_norm": 0.3742315173149109, + "learning_rate": 0.0001, + "loss": 0.9774, + "step": 1546 + }, + { + "epoch": 1.3749583379624486, + "grad_norm": 0.32286152243614197, + "learning_rate": 0.0001, + "loss": 0.8499, + "step": 1547 + }, + { + "epoch": 1.3758471280968783, + "grad_norm": 0.33028700947761536, + "learning_rate": 0.0001, + "loss": 0.8511, + "step": 1548 + }, + { + "epoch": 1.3767359182313077, + "grad_norm": 0.3529914617538452, + "learning_rate": 0.0001, + "loss": 0.8478, + "step": 1549 + }, + { + "epoch": 1.3776247083657371, + "grad_norm": 0.34889042377471924, + "learning_rate": 0.0001, + "loss": 0.8968, + "step": 1550 + }, + { + "epoch": 1.3785134985001666, + "grad_norm": 0.35926511883735657, + "learning_rate": 0.0001, + "loss": 0.9326, + "step": 1551 + }, + { + "epoch": 1.379402288634596, + "grad_norm": 0.3612864315509796, + "learning_rate": 0.0001, + "loss": 0.9212, + "step": 1552 + }, + { + "epoch": 1.3802910787690257, + "grad_norm": 0.342424601316452, + "learning_rate": 0.0001, + "loss": 0.8462, + "step": 1553 + }, + { + "epoch": 1.3811798689034551, + "grad_norm": 0.36990538239479065, + "learning_rate": 0.0001, + "loss": 0.8569, + "step": 1554 + }, + { + "epoch": 1.3820686590378846, + "grad_norm": 0.3396192491054535, + "learning_rate": 0.0001, + "loss": 0.8649, + "step": 1555 + }, + { + "epoch": 1.3829574491723142, + "grad_norm": 0.3837670087814331, + "learning_rate": 0.0001, + "loss": 0.8626, + "step": 1556 + }, + { + "epoch": 1.3838462393067437, + "grad_norm": 0.3661665618419647, + "learning_rate": 0.0001, + "loss": 0.9781, + "step": 1557 + }, + { + "epoch": 1.384735029441173, + "grad_norm": 0.41726961731910706, + "learning_rate": 0.0001, + "loss": 0.9522, + "step": 1558 + }, + { + "epoch": 1.3856238195756028, + "grad_norm": 0.4061615467071533, + "learning_rate": 0.0001, + "loss": 0.9127, + "step": 1559 + }, + { + "epoch": 1.3865126097100322, + "grad_norm": 0.32869914174079895, + "learning_rate": 0.0001, + "loss": 0.8947, + "step": 1560 + }, + { + "epoch": 1.3874013998444616, + "grad_norm": 0.6287943124771118, + "learning_rate": 0.0001, + "loss": 0.8759, + "step": 1561 + }, + { + "epoch": 1.3882901899788913, + "grad_norm": 0.562345027923584, + "learning_rate": 0.0001, + "loss": 0.9109, + "step": 1562 + }, + { + "epoch": 1.3891789801133207, + "grad_norm": 0.39289891719818115, + "learning_rate": 0.0001, + "loss": 0.9831, + "step": 1563 + }, + { + "epoch": 1.3900677702477502, + "grad_norm": 0.4826609194278717, + "learning_rate": 0.0001, + "loss": 0.8846, + "step": 1564 + }, + { + "epoch": 1.3909565603821799, + "grad_norm": 0.4759630262851715, + "learning_rate": 0.0001, + "loss": 0.7826, + "step": 1565 + }, + { + "epoch": 1.3918453505166093, + "grad_norm": 0.8236848711967468, + "learning_rate": 0.0001, + "loss": 0.9145, + "step": 1566 + }, + { + "epoch": 1.3927341406510387, + "grad_norm": 0.39659222960472107, + "learning_rate": 0.0001, + "loss": 0.9594, + "step": 1567 + }, + { + "epoch": 1.3936229307854684, + "grad_norm": 0.32191386818885803, + "learning_rate": 0.0001, + "loss": 0.8515, + "step": 1568 + }, + { + "epoch": 1.3945117209198978, + "grad_norm": 0.397210955619812, + "learning_rate": 0.0001, + "loss": 0.9207, + "step": 1569 + }, + { + "epoch": 1.3954005110543273, + "grad_norm": 0.36070799827575684, + "learning_rate": 0.0001, + "loss": 0.932, + "step": 1570 + }, + { + "epoch": 1.396289301188757, + "grad_norm": 0.35527974367141724, + "learning_rate": 0.0001, + "loss": 0.9069, + "step": 1571 + }, + { + "epoch": 1.3971780913231864, + "grad_norm": 0.39062851667404175, + "learning_rate": 0.0001, + "loss": 0.9763, + "step": 1572 + }, + { + "epoch": 1.3980668814576158, + "grad_norm": 0.3772708475589752, + "learning_rate": 0.0001, + "loss": 0.9299, + "step": 1573 + }, + { + "epoch": 1.3989556715920453, + "grad_norm": 0.3413572609424591, + "learning_rate": 0.0001, + "loss": 0.9314, + "step": 1574 + }, + { + "epoch": 1.3998444617264747, + "grad_norm": 0.4356358051300049, + "learning_rate": 0.0001, + "loss": 0.8741, + "step": 1575 + }, + { + "epoch": 1.4007332518609044, + "grad_norm": 0.3393174409866333, + "learning_rate": 0.0001, + "loss": 0.8657, + "step": 1576 + }, + { + "epoch": 1.4016220419953338, + "grad_norm": 0.35734716057777405, + "learning_rate": 0.0001, + "loss": 0.9329, + "step": 1577 + }, + { + "epoch": 1.4025108321297632, + "grad_norm": 0.3565090298652649, + "learning_rate": 0.0001, + "loss": 0.9528, + "step": 1578 + }, + { + "epoch": 1.403399622264193, + "grad_norm": 0.39361128211021423, + "learning_rate": 0.0001, + "loss": 0.8599, + "step": 1579 + }, + { + "epoch": 1.4042884123986223, + "grad_norm": 0.4015718102455139, + "learning_rate": 0.0001, + "loss": 0.8092, + "step": 1580 + }, + { + "epoch": 1.4051772025330518, + "grad_norm": 0.35963454842567444, + "learning_rate": 0.0001, + "loss": 0.9064, + "step": 1581 + }, + { + "epoch": 1.4060659926674814, + "grad_norm": 0.3792876601219177, + "learning_rate": 0.0001, + "loss": 0.9415, + "step": 1582 + }, + { + "epoch": 1.4069547828019109, + "grad_norm": 0.45550185441970825, + "learning_rate": 0.0001, + "loss": 0.9823, + "step": 1583 + }, + { + "epoch": 1.4078435729363403, + "grad_norm": 0.3728649318218231, + "learning_rate": 0.0001, + "loss": 0.9192, + "step": 1584 + }, + { + "epoch": 1.40873236307077, + "grad_norm": 0.35824286937713623, + "learning_rate": 0.0001, + "loss": 0.9029, + "step": 1585 + }, + { + "epoch": 1.4096211532051994, + "grad_norm": 0.34785544872283936, + "learning_rate": 0.0001, + "loss": 0.8923, + "step": 1586 + }, + { + "epoch": 1.4105099433396289, + "grad_norm": 0.3750496208667755, + "learning_rate": 0.0001, + "loss": 0.97, + "step": 1587 + }, + { + "epoch": 1.4113987334740585, + "grad_norm": 0.365651398897171, + "learning_rate": 0.0001, + "loss": 0.9155, + "step": 1588 + }, + { + "epoch": 1.412287523608488, + "grad_norm": 0.389883428812027, + "learning_rate": 0.0001, + "loss": 0.8792, + "step": 1589 + }, + { + "epoch": 1.4131763137429174, + "grad_norm": 0.3404034674167633, + "learning_rate": 0.0001, + "loss": 0.8753, + "step": 1590 + }, + { + "epoch": 1.414065103877347, + "grad_norm": 0.3574478328227997, + "learning_rate": 0.0001, + "loss": 0.884, + "step": 1591 + }, + { + "epoch": 1.4149538940117765, + "grad_norm": 0.3543257415294647, + "learning_rate": 0.0001, + "loss": 0.9347, + "step": 1592 + }, + { + "epoch": 1.415842684146206, + "grad_norm": 0.37373191118240356, + "learning_rate": 0.0001, + "loss": 0.9138, + "step": 1593 + }, + { + "epoch": 1.4167314742806356, + "grad_norm": 0.7103110551834106, + "learning_rate": 0.0001, + "loss": 0.918, + "step": 1594 + }, + { + "epoch": 1.417620264415065, + "grad_norm": 0.377218097448349, + "learning_rate": 0.0001, + "loss": 1.0207, + "step": 1595 + }, + { + "epoch": 1.4185090545494945, + "grad_norm": 0.3593274652957916, + "learning_rate": 0.0001, + "loss": 0.8988, + "step": 1596 + }, + { + "epoch": 1.419397844683924, + "grad_norm": 0.3128172755241394, + "learning_rate": 0.0001, + "loss": 0.9314, + "step": 1597 + }, + { + "epoch": 1.4202866348183536, + "grad_norm": 0.39537543058395386, + "learning_rate": 0.0001, + "loss": 0.9583, + "step": 1598 + }, + { + "epoch": 1.421175424952783, + "grad_norm": 0.38807258009910583, + "learning_rate": 0.0001, + "loss": 0.9978, + "step": 1599 + }, + { + "epoch": 1.4220642150872125, + "grad_norm": 0.32216835021972656, + "learning_rate": 0.0001, + "loss": 0.8949, + "step": 1600 + }, + { + "epoch": 1.422953005221642, + "grad_norm": 0.3530212938785553, + "learning_rate": 0.0001, + "loss": 0.9434, + "step": 1601 + }, + { + "epoch": 1.4238417953560716, + "grad_norm": 0.38593819737434387, + "learning_rate": 0.0001, + "loss": 0.8588, + "step": 1602 + }, + { + "epoch": 1.424730585490501, + "grad_norm": 0.319916307926178, + "learning_rate": 0.0001, + "loss": 0.9491, + "step": 1603 + }, + { + "epoch": 1.4256193756249305, + "grad_norm": 0.46837642788887024, + "learning_rate": 0.0001, + "loss": 0.8415, + "step": 1604 + }, + { + "epoch": 1.4265081657593601, + "grad_norm": 0.3597429394721985, + "learning_rate": 0.0001, + "loss": 0.9981, + "step": 1605 + }, + { + "epoch": 1.4273969558937896, + "grad_norm": 0.33240172266960144, + "learning_rate": 0.0001, + "loss": 0.9084, + "step": 1606 + }, + { + "epoch": 1.428285746028219, + "grad_norm": 0.3380354046821594, + "learning_rate": 0.0001, + "loss": 0.9233, + "step": 1607 + }, + { + "epoch": 1.4291745361626487, + "grad_norm": 0.503455400466919, + "learning_rate": 0.0001, + "loss": 0.9825, + "step": 1608 + }, + { + "epoch": 1.430063326297078, + "grad_norm": 0.3610774576663971, + "learning_rate": 0.0001, + "loss": 1.0047, + "step": 1609 + }, + { + "epoch": 1.4309521164315075, + "grad_norm": 0.44202589988708496, + "learning_rate": 0.0001, + "loss": 0.9203, + "step": 1610 + }, + { + "epoch": 1.4318409065659372, + "grad_norm": 0.3535199761390686, + "learning_rate": 0.0001, + "loss": 0.9925, + "step": 1611 + }, + { + "epoch": 1.4327296967003667, + "grad_norm": 0.3379201292991638, + "learning_rate": 0.0001, + "loss": 0.9715, + "step": 1612 + }, + { + "epoch": 1.433618486834796, + "grad_norm": 0.34348997473716736, + "learning_rate": 0.0001, + "loss": 0.9106, + "step": 1613 + }, + { + "epoch": 1.4345072769692258, + "grad_norm": 0.341406911611557, + "learning_rate": 0.0001, + "loss": 0.9345, + "step": 1614 + }, + { + "epoch": 1.4353960671036552, + "grad_norm": 0.2867315411567688, + "learning_rate": 0.0001, + "loss": 0.8473, + "step": 1615 + }, + { + "epoch": 1.4362848572380846, + "grad_norm": 0.3688201308250427, + "learning_rate": 0.0001, + "loss": 0.887, + "step": 1616 + }, + { + "epoch": 1.4371736473725143, + "grad_norm": 0.3555888235569, + "learning_rate": 0.0001, + "loss": 0.8905, + "step": 1617 + }, + { + "epoch": 1.4380624375069437, + "grad_norm": 0.3761601448059082, + "learning_rate": 0.0001, + "loss": 0.8659, + "step": 1618 + }, + { + "epoch": 1.4389512276413732, + "grad_norm": 0.3437979519367218, + "learning_rate": 0.0001, + "loss": 0.9541, + "step": 1619 + }, + { + "epoch": 1.4398400177758026, + "grad_norm": 0.33345940709114075, + "learning_rate": 0.0001, + "loss": 0.9721, + "step": 1620 + }, + { + "epoch": 1.4407288079102323, + "grad_norm": 0.356842577457428, + "learning_rate": 0.0001, + "loss": 0.9055, + "step": 1621 + }, + { + "epoch": 1.4416175980446617, + "grad_norm": 0.35060185194015503, + "learning_rate": 0.0001, + "loss": 0.8517, + "step": 1622 + }, + { + "epoch": 1.4425063881790912, + "grad_norm": 0.3705711364746094, + "learning_rate": 0.0001, + "loss": 1.0321, + "step": 1623 + }, + { + "epoch": 1.4433951783135206, + "grad_norm": 0.38974469900131226, + "learning_rate": 0.0001, + "loss": 0.8974, + "step": 1624 + }, + { + "epoch": 1.4442839684479503, + "grad_norm": 0.40210771560668945, + "learning_rate": 0.0001, + "loss": 0.9183, + "step": 1625 + }, + { + "epoch": 1.4451727585823797, + "grad_norm": 0.3603565990924835, + "learning_rate": 0.0001, + "loss": 0.8622, + "step": 1626 + }, + { + "epoch": 1.4460615487168091, + "grad_norm": 0.3595213294029236, + "learning_rate": 0.0001, + "loss": 0.931, + "step": 1627 + }, + { + "epoch": 1.4469503388512388, + "grad_norm": 0.34834766387939453, + "learning_rate": 0.0001, + "loss": 0.9076, + "step": 1628 + }, + { + "epoch": 1.4478391289856682, + "grad_norm": 0.3451787531375885, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 1629 + }, + { + "epoch": 1.4487279191200977, + "grad_norm": 0.3447902202606201, + "learning_rate": 0.0001, + "loss": 0.8725, + "step": 1630 + }, + { + "epoch": 1.4496167092545273, + "grad_norm": 0.3713982403278351, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 1631 + }, + { + "epoch": 1.4505054993889568, + "grad_norm": 0.41470077633857727, + "learning_rate": 0.0001, + "loss": 0.8618, + "step": 1632 + }, + { + "epoch": 1.4513942895233862, + "grad_norm": 0.35781964659690857, + "learning_rate": 0.0001, + "loss": 0.8232, + "step": 1633 + }, + { + "epoch": 1.452283079657816, + "grad_norm": 0.32825222611427307, + "learning_rate": 0.0001, + "loss": 0.8527, + "step": 1634 + }, + { + "epoch": 1.4531718697922453, + "grad_norm": 0.3815636932849884, + "learning_rate": 0.0001, + "loss": 0.9824, + "step": 1635 + }, + { + "epoch": 1.4540606599266748, + "grad_norm": 0.3856026828289032, + "learning_rate": 0.0001, + "loss": 0.9522, + "step": 1636 + }, + { + "epoch": 1.4549494500611044, + "grad_norm": 0.3923175036907196, + "learning_rate": 0.0001, + "loss": 0.8057, + "step": 1637 + }, + { + "epoch": 1.4558382401955339, + "grad_norm": 0.3747645914554596, + "learning_rate": 0.0001, + "loss": 0.9438, + "step": 1638 + }, + { + "epoch": 1.4567270303299633, + "grad_norm": 0.3561338782310486, + "learning_rate": 0.0001, + "loss": 0.8827, + "step": 1639 + }, + { + "epoch": 1.457615820464393, + "grad_norm": 0.32757651805877686, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 1640 + }, + { + "epoch": 1.4585046105988224, + "grad_norm": 0.378717839717865, + "learning_rate": 0.0001, + "loss": 0.9642, + "step": 1641 + }, + { + "epoch": 1.4593934007332519, + "grad_norm": 0.3806663751602173, + "learning_rate": 0.0001, + "loss": 0.9111, + "step": 1642 + }, + { + "epoch": 1.4602821908676813, + "grad_norm": 0.34055235981941223, + "learning_rate": 0.0001, + "loss": 0.9138, + "step": 1643 + }, + { + "epoch": 1.461170981002111, + "grad_norm": 0.3508027493953705, + "learning_rate": 0.0001, + "loss": 0.8867, + "step": 1644 + }, + { + "epoch": 1.4620597711365404, + "grad_norm": 0.3853304386138916, + "learning_rate": 0.0001, + "loss": 0.895, + "step": 1645 + }, + { + "epoch": 1.4629485612709698, + "grad_norm": 0.39283403754234314, + "learning_rate": 0.0001, + "loss": 0.8977, + "step": 1646 + }, + { + "epoch": 1.4638373514053993, + "grad_norm": 0.3606354892253876, + "learning_rate": 0.0001, + "loss": 0.8564, + "step": 1647 + }, + { + "epoch": 1.464726141539829, + "grad_norm": 0.3763819932937622, + "learning_rate": 0.0001, + "loss": 0.8714, + "step": 1648 + }, + { + "epoch": 1.4656149316742584, + "grad_norm": 0.3701646625995636, + "learning_rate": 0.0001, + "loss": 0.9512, + "step": 1649 + }, + { + "epoch": 1.4665037218086878, + "grad_norm": 0.383543998003006, + "learning_rate": 0.0001, + "loss": 0.9052, + "step": 1650 + }, + { + "epoch": 1.4673925119431175, + "grad_norm": 0.3642030954360962, + "learning_rate": 0.0001, + "loss": 0.8511, + "step": 1651 + }, + { + "epoch": 1.468281302077547, + "grad_norm": 0.38401633501052856, + "learning_rate": 0.0001, + "loss": 0.9076, + "step": 1652 + }, + { + "epoch": 1.4691700922119764, + "grad_norm": 0.3193959891796112, + "learning_rate": 0.0001, + "loss": 0.8841, + "step": 1653 + }, + { + "epoch": 1.470058882346406, + "grad_norm": 0.35109949111938477, + "learning_rate": 0.0001, + "loss": 0.9746, + "step": 1654 + }, + { + "epoch": 1.4709476724808355, + "grad_norm": 0.38246193528175354, + "learning_rate": 0.0001, + "loss": 0.9658, + "step": 1655 + }, + { + "epoch": 1.471836462615265, + "grad_norm": 0.3813583552837372, + "learning_rate": 0.0001, + "loss": 0.9618, + "step": 1656 + }, + { + "epoch": 1.4727252527496946, + "grad_norm": 0.345525860786438, + "learning_rate": 0.0001, + "loss": 1.0142, + "step": 1657 + }, + { + "epoch": 1.473614042884124, + "grad_norm": 0.3886154592037201, + "learning_rate": 0.0001, + "loss": 0.9028, + "step": 1658 + }, + { + "epoch": 1.4745028330185534, + "grad_norm": 0.31922218203544617, + "learning_rate": 0.0001, + "loss": 0.9339, + "step": 1659 + }, + { + "epoch": 1.4753916231529831, + "grad_norm": 0.38563981652259827, + "learning_rate": 0.0001, + "loss": 1.0242, + "step": 1660 + }, + { + "epoch": 1.4762804132874126, + "grad_norm": 0.3879290223121643, + "learning_rate": 0.0001, + "loss": 0.9703, + "step": 1661 + }, + { + "epoch": 1.477169203421842, + "grad_norm": 0.36282819509506226, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 1662 + }, + { + "epoch": 1.4780579935562717, + "grad_norm": 0.39247390627861023, + "learning_rate": 0.0001, + "loss": 0.8528, + "step": 1663 + }, + { + "epoch": 1.478946783690701, + "grad_norm": 0.37190213799476624, + "learning_rate": 0.0001, + "loss": 0.8894, + "step": 1664 + }, + { + "epoch": 1.4798355738251305, + "grad_norm": 0.35375040769577026, + "learning_rate": 0.0001, + "loss": 0.862, + "step": 1665 + }, + { + "epoch": 1.48072436395956, + "grad_norm": 0.38695651292800903, + "learning_rate": 0.0001, + "loss": 0.9118, + "step": 1666 + }, + { + "epoch": 1.4816131540939896, + "grad_norm": 0.34383878111839294, + "learning_rate": 0.0001, + "loss": 0.874, + "step": 1667 + }, + { + "epoch": 1.482501944228419, + "grad_norm": 0.3391941785812378, + "learning_rate": 0.0001, + "loss": 0.9457, + "step": 1668 + }, + { + "epoch": 1.4833907343628485, + "grad_norm": 0.3345963954925537, + "learning_rate": 0.0001, + "loss": 0.8521, + "step": 1669 + }, + { + "epoch": 1.484279524497278, + "grad_norm": 0.3772295117378235, + "learning_rate": 0.0001, + "loss": 0.8987, + "step": 1670 + }, + { + "epoch": 1.4851683146317076, + "grad_norm": 0.3609481751918793, + "learning_rate": 0.0001, + "loss": 0.8841, + "step": 1671 + }, + { + "epoch": 1.486057104766137, + "grad_norm": 0.3498031795024872, + "learning_rate": 0.0001, + "loss": 0.9713, + "step": 1672 + }, + { + "epoch": 1.4869458949005665, + "grad_norm": 0.33337390422821045, + "learning_rate": 0.0001, + "loss": 0.9458, + "step": 1673 + }, + { + "epoch": 1.4878346850349962, + "grad_norm": 0.3364051282405853, + "learning_rate": 0.0001, + "loss": 0.8227, + "step": 1674 + }, + { + "epoch": 1.4887234751694256, + "grad_norm": 0.3339247405529022, + "learning_rate": 0.0001, + "loss": 0.9191, + "step": 1675 + }, + { + "epoch": 1.489612265303855, + "grad_norm": 0.3879411816596985, + "learning_rate": 0.0001, + "loss": 0.9857, + "step": 1676 + }, + { + "epoch": 1.4905010554382847, + "grad_norm": 0.3417137563228607, + "learning_rate": 0.0001, + "loss": 0.8781, + "step": 1677 + }, + { + "epoch": 1.4913898455727141, + "grad_norm": 0.3149627447128296, + "learning_rate": 0.0001, + "loss": 0.9536, + "step": 1678 + }, + { + "epoch": 1.4922786357071436, + "grad_norm": 0.3448125720024109, + "learning_rate": 0.0001, + "loss": 0.8991, + "step": 1679 + }, + { + "epoch": 1.4931674258415732, + "grad_norm": 0.3406059741973877, + "learning_rate": 0.0001, + "loss": 0.9704, + "step": 1680 + }, + { + "epoch": 1.4940562159760027, + "grad_norm": 0.3938051164150238, + "learning_rate": 0.0001, + "loss": 0.9602, + "step": 1681 + }, + { + "epoch": 1.4949450061104321, + "grad_norm": 0.35217610001564026, + "learning_rate": 0.0001, + "loss": 0.8743, + "step": 1682 + }, + { + "epoch": 1.4958337962448618, + "grad_norm": 0.3443836271762848, + "learning_rate": 0.0001, + "loss": 0.8809, + "step": 1683 + }, + { + "epoch": 1.4967225863792912, + "grad_norm": 0.36990824341773987, + "learning_rate": 0.0001, + "loss": 0.9191, + "step": 1684 + }, + { + "epoch": 1.4976113765137207, + "grad_norm": 0.3850703239440918, + "learning_rate": 0.0001, + "loss": 0.9439, + "step": 1685 + }, + { + "epoch": 1.4985001666481503, + "grad_norm": 0.336357444524765, + "learning_rate": 0.0001, + "loss": 0.9015, + "step": 1686 + }, + { + "epoch": 1.4993889567825798, + "grad_norm": 0.33322861790657043, + "learning_rate": 0.0001, + "loss": 0.9134, + "step": 1687 + }, + { + "epoch": 1.5002777469170092, + "grad_norm": 0.3278595507144928, + "learning_rate": 0.0001, + "loss": 0.902, + "step": 1688 + }, + { + "epoch": 1.5011665370514389, + "grad_norm": 0.36542853713035583, + "learning_rate": 0.0001, + "loss": 0.9095, + "step": 1689 + }, + { + "epoch": 1.502055327185868, + "grad_norm": 0.3316444456577301, + "learning_rate": 0.0001, + "loss": 0.8739, + "step": 1690 + }, + { + "epoch": 1.5029441173202978, + "grad_norm": 0.34426775574684143, + "learning_rate": 0.0001, + "loss": 0.8909, + "step": 1691 + }, + { + "epoch": 1.5038329074547274, + "grad_norm": 0.3481121063232422, + "learning_rate": 0.0001, + "loss": 0.9043, + "step": 1692 + }, + { + "epoch": 1.5047216975891566, + "grad_norm": 0.3318212628364563, + "learning_rate": 0.0001, + "loss": 0.9219, + "step": 1693 + }, + { + "epoch": 1.5056104877235863, + "grad_norm": 0.3894112706184387, + "learning_rate": 0.0001, + "loss": 0.9349, + "step": 1694 + }, + { + "epoch": 1.5064992778580157, + "grad_norm": 0.3638782501220703, + "learning_rate": 0.0001, + "loss": 0.8549, + "step": 1695 + }, + { + "epoch": 1.5073880679924452, + "grad_norm": 0.3686305284500122, + "learning_rate": 0.0001, + "loss": 0.8293, + "step": 1696 + }, + { + "epoch": 1.5082768581268748, + "grad_norm": 0.3255722224712372, + "learning_rate": 0.0001, + "loss": 0.9002, + "step": 1697 + }, + { + "epoch": 1.5091656482613043, + "grad_norm": 0.3039862811565399, + "learning_rate": 0.0001, + "loss": 0.838, + "step": 1698 + }, + { + "epoch": 1.5100544383957337, + "grad_norm": 0.372408926486969, + "learning_rate": 0.0001, + "loss": 0.9254, + "step": 1699 + }, + { + "epoch": 1.5109432285301634, + "grad_norm": 0.3547128438949585, + "learning_rate": 0.0001, + "loss": 0.9622, + "step": 1700 + }, + { + "epoch": 1.5118320186645928, + "grad_norm": 0.3619207441806793, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 1701 + }, + { + "epoch": 1.5127208087990223, + "grad_norm": 0.347741961479187, + "learning_rate": 0.0001, + "loss": 0.8835, + "step": 1702 + }, + { + "epoch": 1.513609598933452, + "grad_norm": 0.3651529848575592, + "learning_rate": 0.0001, + "loss": 0.9481, + "step": 1703 + }, + { + "epoch": 1.5144983890678814, + "grad_norm": 0.3714975416660309, + "learning_rate": 0.0001, + "loss": 0.8664, + "step": 1704 + }, + { + "epoch": 1.5153871792023108, + "grad_norm": 0.38629162311553955, + "learning_rate": 0.0001, + "loss": 0.911, + "step": 1705 + }, + { + "epoch": 1.5162759693367405, + "grad_norm": 0.31858816742897034, + "learning_rate": 0.0001, + "loss": 0.8828, + "step": 1706 + }, + { + "epoch": 1.51716475947117, + "grad_norm": 0.34497538208961487, + "learning_rate": 0.0001, + "loss": 0.8967, + "step": 1707 + }, + { + "epoch": 1.5180535496055994, + "grad_norm": 0.34334251284599304, + "learning_rate": 0.0001, + "loss": 0.9161, + "step": 1708 + }, + { + "epoch": 1.518942339740029, + "grad_norm": 0.35556355118751526, + "learning_rate": 0.0001, + "loss": 0.9303, + "step": 1709 + }, + { + "epoch": 1.5198311298744582, + "grad_norm": 0.2841368019580841, + "learning_rate": 0.0001, + "loss": 0.9444, + "step": 1710 + }, + { + "epoch": 1.520719920008888, + "grad_norm": 0.36247727274894714, + "learning_rate": 0.0001, + "loss": 0.9561, + "step": 1711 + }, + { + "epoch": 1.5216087101433176, + "grad_norm": 0.3494194447994232, + "learning_rate": 0.0001, + "loss": 0.9227, + "step": 1712 + }, + { + "epoch": 1.5224975002777468, + "grad_norm": 0.34062543511390686, + "learning_rate": 0.0001, + "loss": 0.9002, + "step": 1713 + }, + { + "epoch": 1.5233862904121764, + "grad_norm": 0.32053127884864807, + "learning_rate": 0.0001, + "loss": 0.8472, + "step": 1714 + }, + { + "epoch": 1.524275080546606, + "grad_norm": 0.34881147742271423, + "learning_rate": 0.0001, + "loss": 0.996, + "step": 1715 + }, + { + "epoch": 1.5251638706810353, + "grad_norm": 0.31298136711120605, + "learning_rate": 0.0001, + "loss": 0.9664, + "step": 1716 + }, + { + "epoch": 1.526052660815465, + "grad_norm": 0.35817843675613403, + "learning_rate": 0.0001, + "loss": 0.8581, + "step": 1717 + }, + { + "epoch": 1.5269414509498944, + "grad_norm": 0.36280620098114014, + "learning_rate": 0.0001, + "loss": 0.9752, + "step": 1718 + }, + { + "epoch": 1.5278302410843239, + "grad_norm": 0.3819250464439392, + "learning_rate": 0.0001, + "loss": 0.8537, + "step": 1719 + }, + { + "epoch": 1.5287190312187535, + "grad_norm": 0.319489061832428, + "learning_rate": 0.0001, + "loss": 0.8939, + "step": 1720 + }, + { + "epoch": 1.529607821353183, + "grad_norm": 0.3549860417842865, + "learning_rate": 0.0001, + "loss": 0.9703, + "step": 1721 + }, + { + "epoch": 1.5304966114876124, + "grad_norm": 0.3653222620487213, + "learning_rate": 0.0001, + "loss": 0.9454, + "step": 1722 + }, + { + "epoch": 1.531385401622042, + "grad_norm": 1.377524495124817, + "learning_rate": 0.0001, + "loss": 0.9496, + "step": 1723 + }, + { + "epoch": 1.5322741917564715, + "grad_norm": 0.37237972021102905, + "learning_rate": 0.0001, + "loss": 0.9882, + "step": 1724 + }, + { + "epoch": 1.533162981890901, + "grad_norm": 0.3513152003288269, + "learning_rate": 0.0001, + "loss": 0.9387, + "step": 1725 + }, + { + "epoch": 1.5340517720253306, + "grad_norm": 1.2794930934906006, + "learning_rate": 0.0001, + "loss": 0.9713, + "step": 1726 + }, + { + "epoch": 1.53494056215976, + "grad_norm": 0.37315309047698975, + "learning_rate": 0.0001, + "loss": 0.9837, + "step": 1727 + }, + { + "epoch": 1.5358293522941895, + "grad_norm": 0.34950658679008484, + "learning_rate": 0.0001, + "loss": 0.9261, + "step": 1728 + }, + { + "epoch": 1.5367181424286191, + "grad_norm": 0.3580825924873352, + "learning_rate": 0.0001, + "loss": 0.9007, + "step": 1729 + }, + { + "epoch": 1.5376069325630486, + "grad_norm": 0.6059324145317078, + "learning_rate": 0.0001, + "loss": 0.9215, + "step": 1730 + }, + { + "epoch": 1.538495722697478, + "grad_norm": 2.688345432281494, + "learning_rate": 0.0001, + "loss": 0.9347, + "step": 1731 + }, + { + "epoch": 1.5393845128319077, + "grad_norm": 1.058393955230713, + "learning_rate": 0.0001, + "loss": 0.8553, + "step": 1732 + }, + { + "epoch": 1.540273302966337, + "grad_norm": 0.5399541854858398, + "learning_rate": 0.0001, + "loss": 0.9347, + "step": 1733 + }, + { + "epoch": 1.5411620931007666, + "grad_norm": 0.38701578974723816, + "learning_rate": 0.0001, + "loss": 0.9532, + "step": 1734 + }, + { + "epoch": 1.5420508832351962, + "grad_norm": 0.3606896698474884, + "learning_rate": 0.0001, + "loss": 0.9679, + "step": 1735 + }, + { + "epoch": 1.5429396733696255, + "grad_norm": 0.33948925137519836, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 1736 + }, + { + "epoch": 1.5438284635040551, + "grad_norm": 0.3091464638710022, + "learning_rate": 0.0001, + "loss": 0.8451, + "step": 1737 + }, + { + "epoch": 1.5447172536384848, + "grad_norm": 0.3580194413661957, + "learning_rate": 0.0001, + "loss": 0.9166, + "step": 1738 + }, + { + "epoch": 1.545606043772914, + "grad_norm": 0.32534483075141907, + "learning_rate": 0.0001, + "loss": 0.8588, + "step": 1739 + }, + { + "epoch": 1.5464948339073437, + "grad_norm": 0.37172192335128784, + "learning_rate": 0.0001, + "loss": 0.8881, + "step": 1740 + }, + { + "epoch": 1.547383624041773, + "grad_norm": 0.3773120045661926, + "learning_rate": 0.0001, + "loss": 0.9553, + "step": 1741 + }, + { + "epoch": 1.5482724141762025, + "grad_norm": 0.37788835167884827, + "learning_rate": 0.0001, + "loss": 0.9444, + "step": 1742 + }, + { + "epoch": 1.5491612043106322, + "grad_norm": 0.33638015389442444, + "learning_rate": 0.0001, + "loss": 0.8482, + "step": 1743 + }, + { + "epoch": 1.5500499944450616, + "grad_norm": 0.38720908761024475, + "learning_rate": 0.0001, + "loss": 0.8876, + "step": 1744 + }, + { + "epoch": 1.550938784579491, + "grad_norm": 0.3406068980693817, + "learning_rate": 0.0001, + "loss": 0.9519, + "step": 1745 + }, + { + "epoch": 1.5518275747139207, + "grad_norm": 0.3615241050720215, + "learning_rate": 0.0001, + "loss": 0.9574, + "step": 1746 + }, + { + "epoch": 1.5527163648483502, + "grad_norm": 0.34514304995536804, + "learning_rate": 0.0001, + "loss": 0.9059, + "step": 1747 + }, + { + "epoch": 1.5536051549827796, + "grad_norm": 0.3243461549282074, + "learning_rate": 0.0001, + "loss": 0.888, + "step": 1748 + }, + { + "epoch": 1.5544939451172093, + "grad_norm": 0.37375468015670776, + "learning_rate": 0.0001, + "loss": 0.7914, + "step": 1749 + }, + { + "epoch": 1.5553827352516387, + "grad_norm": 0.37953221797943115, + "learning_rate": 0.0001, + "loss": 0.8981, + "step": 1750 + }, + { + "epoch": 1.5562715253860682, + "grad_norm": 0.36913594603538513, + "learning_rate": 0.0001, + "loss": 0.9042, + "step": 1751 + }, + { + "epoch": 1.5571603155204978, + "grad_norm": 0.40167394280433655, + "learning_rate": 0.0001, + "loss": 0.9929, + "step": 1752 + }, + { + "epoch": 1.5580491056549273, + "grad_norm": 0.3700322210788727, + "learning_rate": 0.0001, + "loss": 0.9464, + "step": 1753 + }, + { + "epoch": 1.5589378957893567, + "grad_norm": 0.4139519929885864, + "learning_rate": 0.0001, + "loss": 0.8966, + "step": 1754 + }, + { + "epoch": 1.5598266859237864, + "grad_norm": 1.0801198482513428, + "learning_rate": 0.0001, + "loss": 0.8788, + "step": 1755 + }, + { + "epoch": 1.5607154760582156, + "grad_norm": 0.8275560140609741, + "learning_rate": 0.0001, + "loss": 0.8335, + "step": 1756 + }, + { + "epoch": 1.5616042661926453, + "grad_norm": 0.5179623961448669, + "learning_rate": 0.0001, + "loss": 0.9282, + "step": 1757 + }, + { + "epoch": 1.562493056327075, + "grad_norm": 0.3609626889228821, + "learning_rate": 0.0001, + "loss": 0.8669, + "step": 1758 + }, + { + "epoch": 1.5633818464615041, + "grad_norm": 0.35782667994499207, + "learning_rate": 0.0001, + "loss": 0.8826, + "step": 1759 + }, + { + "epoch": 1.5642706365959338, + "grad_norm": 0.5242655873298645, + "learning_rate": 0.0001, + "loss": 0.8888, + "step": 1760 + }, + { + "epoch": 1.5651594267303635, + "grad_norm": 0.3657304346561432, + "learning_rate": 0.0001, + "loss": 0.9309, + "step": 1761 + }, + { + "epoch": 1.5660482168647927, + "grad_norm": 0.5481138825416565, + "learning_rate": 0.0001, + "loss": 0.8967, + "step": 1762 + }, + { + "epoch": 1.5669370069992223, + "grad_norm": 0.39052364230155945, + "learning_rate": 0.0001, + "loss": 0.9363, + "step": 1763 + }, + { + "epoch": 1.5678257971336518, + "grad_norm": 0.5495015978813171, + "learning_rate": 0.0001, + "loss": 0.9439, + "step": 1764 + }, + { + "epoch": 1.5687145872680812, + "grad_norm": 0.3862490653991699, + "learning_rate": 0.0001, + "loss": 0.9504, + "step": 1765 + }, + { + "epoch": 1.5696033774025109, + "grad_norm": 0.4059329330921173, + "learning_rate": 0.0001, + "loss": 0.9468, + "step": 1766 + }, + { + "epoch": 1.5704921675369403, + "grad_norm": 0.3757980465888977, + "learning_rate": 0.0001, + "loss": 0.9091, + "step": 1767 + }, + { + "epoch": 1.5713809576713698, + "grad_norm": 0.32866010069847107, + "learning_rate": 0.0001, + "loss": 0.9177, + "step": 1768 + }, + { + "epoch": 1.5722697478057994, + "grad_norm": 0.3823925852775574, + "learning_rate": 0.0001, + "loss": 0.8151, + "step": 1769 + }, + { + "epoch": 1.5731585379402289, + "grad_norm": 0.3286689519882202, + "learning_rate": 0.0001, + "loss": 0.9327, + "step": 1770 + }, + { + "epoch": 1.5740473280746583, + "grad_norm": 0.3756827712059021, + "learning_rate": 0.0001, + "loss": 0.9451, + "step": 1771 + }, + { + "epoch": 1.574936118209088, + "grad_norm": 0.34540703892707825, + "learning_rate": 0.0001, + "loss": 0.906, + "step": 1772 + }, + { + "epoch": 1.5758249083435174, + "grad_norm": 0.3327772617340088, + "learning_rate": 0.0001, + "loss": 0.8418, + "step": 1773 + }, + { + "epoch": 1.5767136984779468, + "grad_norm": 0.3816230893135071, + "learning_rate": 0.0001, + "loss": 0.9186, + "step": 1774 + }, + { + "epoch": 1.5776024886123765, + "grad_norm": 0.3620496094226837, + "learning_rate": 0.0001, + "loss": 0.9283, + "step": 1775 + }, + { + "epoch": 1.578491278746806, + "grad_norm": 0.3261551558971405, + "learning_rate": 0.0001, + "loss": 0.8566, + "step": 1776 + }, + { + "epoch": 1.5793800688812354, + "grad_norm": 0.3376888334751129, + "learning_rate": 0.0001, + "loss": 0.8816, + "step": 1777 + }, + { + "epoch": 1.580268859015665, + "grad_norm": 0.34106653928756714, + "learning_rate": 0.0001, + "loss": 0.8366, + "step": 1778 + }, + { + "epoch": 1.5811576491500943, + "grad_norm": 0.39439812302589417, + "learning_rate": 0.0001, + "loss": 0.9562, + "step": 1779 + }, + { + "epoch": 1.582046439284524, + "grad_norm": 0.4464170038700104, + "learning_rate": 0.0001, + "loss": 0.9191, + "step": 1780 + }, + { + "epoch": 1.5829352294189536, + "grad_norm": 0.3573848605155945, + "learning_rate": 0.0001, + "loss": 0.8999, + "step": 1781 + }, + { + "epoch": 1.5838240195533828, + "grad_norm": 0.3367520272731781, + "learning_rate": 0.0001, + "loss": 0.8198, + "step": 1782 + }, + { + "epoch": 1.5847128096878125, + "grad_norm": 0.7480552196502686, + "learning_rate": 0.0001, + "loss": 0.9561, + "step": 1783 + }, + { + "epoch": 1.5856015998222421, + "grad_norm": 0.3286367952823639, + "learning_rate": 0.0001, + "loss": 0.9986, + "step": 1784 + }, + { + "epoch": 1.5864903899566714, + "grad_norm": 0.32625967264175415, + "learning_rate": 0.0001, + "loss": 0.9228, + "step": 1785 + }, + { + "epoch": 1.587379180091101, + "grad_norm": 0.3715958893299103, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 1786 + }, + { + "epoch": 1.5882679702255305, + "grad_norm": 0.34337377548217773, + "learning_rate": 0.0001, + "loss": 0.9656, + "step": 1787 + }, + { + "epoch": 1.58915676035996, + "grad_norm": 0.36245790123939514, + "learning_rate": 0.0001, + "loss": 0.9587, + "step": 1788 + }, + { + "epoch": 1.5900455504943896, + "grad_norm": 0.4087197780609131, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 1789 + }, + { + "epoch": 1.590934340628819, + "grad_norm": 0.4008493423461914, + "learning_rate": 0.0001, + "loss": 0.8691, + "step": 1790 + }, + { + "epoch": 1.5918231307632484, + "grad_norm": 0.8266111016273499, + "learning_rate": 0.0001, + "loss": 1.0221, + "step": 1791 + }, + { + "epoch": 1.592711920897678, + "grad_norm": 0.3512238562107086, + "learning_rate": 0.0001, + "loss": 0.8909, + "step": 1792 + }, + { + "epoch": 1.5936007110321075, + "grad_norm": 0.3301697075366974, + "learning_rate": 0.0001, + "loss": 0.9072, + "step": 1793 + }, + { + "epoch": 1.594489501166537, + "grad_norm": 0.3027113974094391, + "learning_rate": 0.0001, + "loss": 0.8956, + "step": 1794 + }, + { + "epoch": 1.5953782913009666, + "grad_norm": 0.34008100628852844, + "learning_rate": 0.0001, + "loss": 0.922, + "step": 1795 + }, + { + "epoch": 1.596267081435396, + "grad_norm": 0.37972933053970337, + "learning_rate": 0.0001, + "loss": 0.8718, + "step": 1796 + }, + { + "epoch": 1.5971558715698255, + "grad_norm": 0.38649454712867737, + "learning_rate": 0.0001, + "loss": 0.8555, + "step": 1797 + }, + { + "epoch": 1.5980446617042552, + "grad_norm": 0.3708219826221466, + "learning_rate": 0.0001, + "loss": 0.8959, + "step": 1798 + }, + { + "epoch": 1.5989334518386846, + "grad_norm": 0.36001938581466675, + "learning_rate": 0.0001, + "loss": 0.8835, + "step": 1799 + }, + { + "epoch": 1.599822241973114, + "grad_norm": 0.36445868015289307, + "learning_rate": 0.0001, + "loss": 0.9237, + "step": 1800 + }, + { + "epoch": 1.6007110321075437, + "grad_norm": 0.41779571771621704, + "learning_rate": 0.0001, + "loss": 0.9475, + "step": 1801 + }, + { + "epoch": 1.601599822241973, + "grad_norm": 0.48116981983184814, + "learning_rate": 0.0001, + "loss": 0.9844, + "step": 1802 + }, + { + "epoch": 1.6024886123764026, + "grad_norm": 0.3373110592365265, + "learning_rate": 0.0001, + "loss": 0.8992, + "step": 1803 + }, + { + "epoch": 1.6033774025108323, + "grad_norm": 0.33223381638526917, + "learning_rate": 0.0001, + "loss": 0.8746, + "step": 1804 + }, + { + "epoch": 1.6042661926452615, + "grad_norm": 0.3680526912212372, + "learning_rate": 0.0001, + "loss": 0.9382, + "step": 1805 + }, + { + "epoch": 1.6051549827796912, + "grad_norm": 0.5222595930099487, + "learning_rate": 0.0001, + "loss": 0.9586, + "step": 1806 + }, + { + "epoch": 1.6060437729141208, + "grad_norm": 0.35774093866348267, + "learning_rate": 0.0001, + "loss": 0.9331, + "step": 1807 + }, + { + "epoch": 1.60693256304855, + "grad_norm": 0.528674840927124, + "learning_rate": 0.0001, + "loss": 0.9237, + "step": 1808 + }, + { + "epoch": 1.6078213531829797, + "grad_norm": 0.41820868849754333, + "learning_rate": 0.0001, + "loss": 0.9443, + "step": 1809 + }, + { + "epoch": 1.6087101433174091, + "grad_norm": 0.3307277262210846, + "learning_rate": 0.0001, + "loss": 0.8068, + "step": 1810 + }, + { + "epoch": 1.6095989334518386, + "grad_norm": 0.4219682812690735, + "learning_rate": 0.0001, + "loss": 0.9318, + "step": 1811 + }, + { + "epoch": 1.6104877235862682, + "grad_norm": 0.42429181933403015, + "learning_rate": 0.0001, + "loss": 0.8943, + "step": 1812 + }, + { + "epoch": 1.6113765137206977, + "grad_norm": 1.4591997861862183, + "learning_rate": 0.0001, + "loss": 0.8419, + "step": 1813 + }, + { + "epoch": 1.6122653038551271, + "grad_norm": 0.49615946412086487, + "learning_rate": 0.0001, + "loss": 0.9466, + "step": 1814 + }, + { + "epoch": 1.6131540939895568, + "grad_norm": 0.5319680571556091, + "learning_rate": 0.0001, + "loss": 0.8491, + "step": 1815 + }, + { + "epoch": 1.6140428841239862, + "grad_norm": 1.1099143028259277, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 1816 + }, + { + "epoch": 1.6149316742584157, + "grad_norm": 0.3784678280353546, + "learning_rate": 0.0001, + "loss": 0.9001, + "step": 1817 + }, + { + "epoch": 1.6158204643928453, + "grad_norm": 0.42704182863235474, + "learning_rate": 0.0001, + "loss": 0.9171, + "step": 1818 + }, + { + "epoch": 1.6167092545272748, + "grad_norm": 0.3222212791442871, + "learning_rate": 0.0001, + "loss": 0.8957, + "step": 1819 + }, + { + "epoch": 1.6175980446617042, + "grad_norm": 0.3985123038291931, + "learning_rate": 0.0001, + "loss": 0.803, + "step": 1820 + }, + { + "epoch": 1.6184868347961339, + "grad_norm": 0.3731878697872162, + "learning_rate": 0.0001, + "loss": 0.9434, + "step": 1821 + }, + { + "epoch": 1.6193756249305633, + "grad_norm": 0.35805362462997437, + "learning_rate": 0.0001, + "loss": 0.9385, + "step": 1822 + }, + { + "epoch": 1.6202644150649927, + "grad_norm": 0.353607177734375, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 1823 + }, + { + "epoch": 1.6211532051994224, + "grad_norm": 0.3861144185066223, + "learning_rate": 0.0001, + "loss": 0.9313, + "step": 1824 + }, + { + "epoch": 1.6220419953338518, + "grad_norm": 0.3477698564529419, + "learning_rate": 0.0001, + "loss": 1.0114, + "step": 1825 + }, + { + "epoch": 1.6229307854682813, + "grad_norm": 0.3571338355541229, + "learning_rate": 0.0001, + "loss": 0.9542, + "step": 1826 + }, + { + "epoch": 1.623819575602711, + "grad_norm": 0.3161649703979492, + "learning_rate": 0.0001, + "loss": 0.8867, + "step": 1827 + }, + { + "epoch": 1.6247083657371402, + "grad_norm": 0.34788355231285095, + "learning_rate": 0.0001, + "loss": 0.9498, + "step": 1828 + }, + { + "epoch": 1.6255971558715698, + "grad_norm": 0.3480173647403717, + "learning_rate": 0.0001, + "loss": 0.9012, + "step": 1829 + }, + { + "epoch": 1.6264859460059995, + "grad_norm": 0.3352920413017273, + "learning_rate": 0.0001, + "loss": 0.9405, + "step": 1830 + }, + { + "epoch": 1.6273747361404287, + "grad_norm": 0.3569203019142151, + "learning_rate": 0.0001, + "loss": 0.8903, + "step": 1831 + }, + { + "epoch": 1.6282635262748584, + "grad_norm": 0.36906954646110535, + "learning_rate": 0.0001, + "loss": 0.9598, + "step": 1832 + }, + { + "epoch": 1.6291523164092878, + "grad_norm": 0.3525664508342743, + "learning_rate": 0.0001, + "loss": 0.9319, + "step": 1833 + }, + { + "epoch": 1.6300411065437173, + "grad_norm": 0.3371136784553528, + "learning_rate": 0.0001, + "loss": 0.9193, + "step": 1834 + }, + { + "epoch": 1.630929896678147, + "grad_norm": 0.3484685719013214, + "learning_rate": 0.0001, + "loss": 0.9035, + "step": 1835 + }, + { + "epoch": 1.6318186868125764, + "grad_norm": 0.3327842354774475, + "learning_rate": 0.0001, + "loss": 0.7831, + "step": 1836 + }, + { + "epoch": 1.6327074769470058, + "grad_norm": 0.290935754776001, + "learning_rate": 0.0001, + "loss": 0.8634, + "step": 1837 + }, + { + "epoch": 1.6335962670814355, + "grad_norm": 0.35313087701797485, + "learning_rate": 0.0001, + "loss": 0.8898, + "step": 1838 + }, + { + "epoch": 1.634485057215865, + "grad_norm": 0.35210633277893066, + "learning_rate": 0.0001, + "loss": 0.9164, + "step": 1839 + }, + { + "epoch": 1.6353738473502943, + "grad_norm": 0.36426299810409546, + "learning_rate": 0.0001, + "loss": 0.8369, + "step": 1840 + }, + { + "epoch": 1.636262637484724, + "grad_norm": 0.36496469378471375, + "learning_rate": 0.0001, + "loss": 0.94, + "step": 1841 + }, + { + "epoch": 1.6371514276191534, + "grad_norm": 0.3476478159427643, + "learning_rate": 0.0001, + "loss": 0.8577, + "step": 1842 + }, + { + "epoch": 1.6380402177535829, + "grad_norm": 0.3203316926956177, + "learning_rate": 0.0001, + "loss": 0.861, + "step": 1843 + }, + { + "epoch": 1.6389290078880125, + "grad_norm": 0.3230277895927429, + "learning_rate": 0.0001, + "loss": 0.8456, + "step": 1844 + }, + { + "epoch": 1.639817798022442, + "grad_norm": 0.3015528917312622, + "learning_rate": 0.0001, + "loss": 0.8744, + "step": 1845 + }, + { + "epoch": 1.6407065881568714, + "grad_norm": 0.35518017411231995, + "learning_rate": 0.0001, + "loss": 0.8698, + "step": 1846 + }, + { + "epoch": 1.641595378291301, + "grad_norm": 0.35839059948921204, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 1847 + }, + { + "epoch": 1.6424841684257305, + "grad_norm": 0.3377850651741028, + "learning_rate": 0.0001, + "loss": 0.9225, + "step": 1848 + }, + { + "epoch": 1.64337295856016, + "grad_norm": 0.3225672245025635, + "learning_rate": 0.0001, + "loss": 0.9355, + "step": 1849 + }, + { + "epoch": 1.6442617486945896, + "grad_norm": 0.3357776999473572, + "learning_rate": 0.0001, + "loss": 0.8404, + "step": 1850 + }, + { + "epoch": 1.6451505388290188, + "grad_norm": 0.3443959653377533, + "learning_rate": 0.0001, + "loss": 0.9009, + "step": 1851 + }, + { + "epoch": 1.6460393289634485, + "grad_norm": 0.3641587495803833, + "learning_rate": 0.0001, + "loss": 0.9419, + "step": 1852 + }, + { + "epoch": 1.6469281190978782, + "grad_norm": 0.3614901304244995, + "learning_rate": 0.0001, + "loss": 0.8922, + "step": 1853 + }, + { + "epoch": 1.6478169092323074, + "grad_norm": 0.3756221532821655, + "learning_rate": 0.0001, + "loss": 0.8531, + "step": 1854 + }, + { + "epoch": 1.648705699366737, + "grad_norm": 0.32705435156822205, + "learning_rate": 0.0001, + "loss": 0.9559, + "step": 1855 + }, + { + "epoch": 1.6495944895011665, + "grad_norm": 0.354168564081192, + "learning_rate": 0.0001, + "loss": 0.8389, + "step": 1856 + }, + { + "epoch": 1.650483279635596, + "grad_norm": 0.3950870633125305, + "learning_rate": 0.0001, + "loss": 0.8642, + "step": 1857 + }, + { + "epoch": 1.6513720697700256, + "grad_norm": 0.3480079174041748, + "learning_rate": 0.0001, + "loss": 0.9008, + "step": 1858 + }, + { + "epoch": 1.652260859904455, + "grad_norm": 0.37953078746795654, + "learning_rate": 0.0001, + "loss": 0.9134, + "step": 1859 + }, + { + "epoch": 1.6531496500388845, + "grad_norm": 0.9250193238258362, + "learning_rate": 0.0001, + "loss": 0.8301, + "step": 1860 + }, + { + "epoch": 1.6540384401733141, + "grad_norm": 0.33579471707344055, + "learning_rate": 0.0001, + "loss": 0.8996, + "step": 1861 + }, + { + "epoch": 1.6549272303077436, + "grad_norm": 0.37209945917129517, + "learning_rate": 0.0001, + "loss": 0.8403, + "step": 1862 + }, + { + "epoch": 1.655816020442173, + "grad_norm": 0.3377666473388672, + "learning_rate": 0.0001, + "loss": 0.871, + "step": 1863 + }, + { + "epoch": 1.6567048105766027, + "grad_norm": 0.37800464034080505, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 1864 + }, + { + "epoch": 1.6575936007110321, + "grad_norm": 0.4948398172855377, + "learning_rate": 0.0001, + "loss": 0.9485, + "step": 1865 + }, + { + "epoch": 1.6584823908454616, + "grad_norm": 0.4751080274581909, + "learning_rate": 0.0001, + "loss": 0.8765, + "step": 1866 + }, + { + "epoch": 1.6593711809798912, + "grad_norm": 0.375337690114975, + "learning_rate": 0.0001, + "loss": 0.9074, + "step": 1867 + }, + { + "epoch": 1.6602599711143207, + "grad_norm": 1.2624320983886719, + "learning_rate": 0.0001, + "loss": 0.9032, + "step": 1868 + }, + { + "epoch": 1.66114876124875, + "grad_norm": 0.7726836800575256, + "learning_rate": 0.0001, + "loss": 0.9992, + "step": 1869 + }, + { + "epoch": 1.6620375513831798, + "grad_norm": 0.38178691267967224, + "learning_rate": 0.0001, + "loss": 0.9621, + "step": 1870 + }, + { + "epoch": 1.6629263415176092, + "grad_norm": 0.39392024278640747, + "learning_rate": 0.0001, + "loss": 0.9873, + "step": 1871 + }, + { + "epoch": 1.6638151316520386, + "grad_norm": 1.0342029333114624, + "learning_rate": 0.0001, + "loss": 0.9192, + "step": 1872 + }, + { + "epoch": 1.6647039217864683, + "grad_norm": 0.3234097361564636, + "learning_rate": 0.0001, + "loss": 0.9132, + "step": 1873 + }, + { + "epoch": 1.6655927119208975, + "grad_norm": 2.331127405166626, + "learning_rate": 0.0001, + "loss": 0.9651, + "step": 1874 + }, + { + "epoch": 1.6664815020553272, + "grad_norm": 0.3696269989013672, + "learning_rate": 0.0001, + "loss": 0.8714, + "step": 1875 + }, + { + "epoch": 1.6673702921897569, + "grad_norm": 0.3183539807796478, + "learning_rate": 0.0001, + "loss": 0.8484, + "step": 1876 + }, + { + "epoch": 1.668259082324186, + "grad_norm": 0.3290097713470459, + "learning_rate": 0.0001, + "loss": 0.9776, + "step": 1877 + }, + { + "epoch": 1.6691478724586157, + "grad_norm": 0.3337092101573944, + "learning_rate": 0.0001, + "loss": 0.9359, + "step": 1878 + }, + { + "epoch": 1.6700366625930452, + "grad_norm": 0.3510020971298218, + "learning_rate": 0.0001, + "loss": 0.9407, + "step": 1879 + }, + { + "epoch": 1.6709254527274746, + "grad_norm": 0.3333737254142761, + "learning_rate": 0.0001, + "loss": 0.8822, + "step": 1880 + }, + { + "epoch": 1.6718142428619043, + "grad_norm": 0.3105640709400177, + "learning_rate": 0.0001, + "loss": 0.8645, + "step": 1881 + }, + { + "epoch": 1.6727030329963337, + "grad_norm": 0.29750367999076843, + "learning_rate": 0.0001, + "loss": 0.9338, + "step": 1882 + }, + { + "epoch": 1.6735918231307632, + "grad_norm": 0.34713029861450195, + "learning_rate": 0.0001, + "loss": 0.9299, + "step": 1883 + }, + { + "epoch": 1.6744806132651928, + "grad_norm": 0.3725203275680542, + "learning_rate": 0.0001, + "loss": 0.9744, + "step": 1884 + }, + { + "epoch": 1.6753694033996223, + "grad_norm": 0.3104175925254822, + "learning_rate": 0.0001, + "loss": 0.9057, + "step": 1885 + }, + { + "epoch": 1.6762581935340517, + "grad_norm": 0.37686604261398315, + "learning_rate": 0.0001, + "loss": 1.0123, + "step": 1886 + }, + { + "epoch": 1.6771469836684814, + "grad_norm": 0.29763513803482056, + "learning_rate": 0.0001, + "loss": 0.8857, + "step": 1887 + }, + { + "epoch": 1.6780357738029108, + "grad_norm": 0.3398146629333496, + "learning_rate": 0.0001, + "loss": 0.8808, + "step": 1888 + }, + { + "epoch": 1.6789245639373402, + "grad_norm": 0.33802369236946106, + "learning_rate": 0.0001, + "loss": 0.8505, + "step": 1889 + }, + { + "epoch": 1.67981335407177, + "grad_norm": 0.28601887822151184, + "learning_rate": 0.0001, + "loss": 0.8208, + "step": 1890 + }, + { + "epoch": 1.6807021442061993, + "grad_norm": 0.31592151522636414, + "learning_rate": 0.0001, + "loss": 0.864, + "step": 1891 + }, + { + "epoch": 1.6815909343406288, + "grad_norm": 0.3350915014743805, + "learning_rate": 0.0001, + "loss": 0.8605, + "step": 1892 + }, + { + "epoch": 1.6824797244750584, + "grad_norm": 0.37870457768440247, + "learning_rate": 0.0001, + "loss": 0.9751, + "step": 1893 + }, + { + "epoch": 1.6833685146094879, + "grad_norm": 0.3566136956214905, + "learning_rate": 0.0001, + "loss": 0.9273, + "step": 1894 + }, + { + "epoch": 1.6842573047439173, + "grad_norm": 0.3385891318321228, + "learning_rate": 0.0001, + "loss": 0.8724, + "step": 1895 + }, + { + "epoch": 1.685146094878347, + "grad_norm": 0.3746001720428467, + "learning_rate": 0.0001, + "loss": 0.9185, + "step": 1896 + }, + { + "epoch": 1.6860348850127762, + "grad_norm": 0.3757399320602417, + "learning_rate": 0.0001, + "loss": 0.8831, + "step": 1897 + }, + { + "epoch": 1.6869236751472059, + "grad_norm": 0.33278876543045044, + "learning_rate": 0.0001, + "loss": 0.8558, + "step": 1898 + }, + { + "epoch": 1.6878124652816355, + "grad_norm": 0.3170175552368164, + "learning_rate": 0.0001, + "loss": 0.8932, + "step": 1899 + }, + { + "epoch": 1.6887012554160648, + "grad_norm": 0.32355326414108276, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 1900 + }, + { + "epoch": 1.6895900455504944, + "grad_norm": 0.31958630681037903, + "learning_rate": 0.0001, + "loss": 0.9199, + "step": 1901 + }, + { + "epoch": 1.6904788356849239, + "grad_norm": 0.3621491491794586, + "learning_rate": 0.0001, + "loss": 0.8973, + "step": 1902 + }, + { + "epoch": 1.6913676258193533, + "grad_norm": 0.48310527205467224, + "learning_rate": 0.0001, + "loss": 0.8819, + "step": 1903 + }, + { + "epoch": 1.692256415953783, + "grad_norm": 0.3930363059043884, + "learning_rate": 0.0001, + "loss": 0.9095, + "step": 1904 + }, + { + "epoch": 1.6931452060882124, + "grad_norm": 0.5452322363853455, + "learning_rate": 0.0001, + "loss": 0.8147, + "step": 1905 + }, + { + "epoch": 1.6940339962226418, + "grad_norm": 0.3676657974720001, + "learning_rate": 0.0001, + "loss": 0.8653, + "step": 1906 + }, + { + "epoch": 1.6949227863570715, + "grad_norm": 0.5217211246490479, + "learning_rate": 0.0001, + "loss": 0.8173, + "step": 1907 + }, + { + "epoch": 1.695811576491501, + "grad_norm": 0.4746188223361969, + "learning_rate": 0.0001, + "loss": 0.9034, + "step": 1908 + }, + { + "epoch": 1.6967003666259304, + "grad_norm": 0.6145462989807129, + "learning_rate": 0.0001, + "loss": 0.9044, + "step": 1909 + }, + { + "epoch": 1.69758915676036, + "grad_norm": 0.43200716376304626, + "learning_rate": 0.0001, + "loss": 0.9742, + "step": 1910 + }, + { + "epoch": 1.6984779468947895, + "grad_norm": 0.3674427568912506, + "learning_rate": 0.0001, + "loss": 0.9574, + "step": 1911 + }, + { + "epoch": 1.699366737029219, + "grad_norm": 0.3205876052379608, + "learning_rate": 0.0001, + "loss": 0.83, + "step": 1912 + }, + { + "epoch": 1.7002555271636486, + "grad_norm": 0.41618865728378296, + "learning_rate": 0.0001, + "loss": 0.9021, + "step": 1913 + }, + { + "epoch": 1.701144317298078, + "grad_norm": 0.40217745304107666, + "learning_rate": 0.0001, + "loss": 0.8975, + "step": 1914 + }, + { + "epoch": 1.7020331074325075, + "grad_norm": 0.37823233008384705, + "learning_rate": 0.0001, + "loss": 0.9554, + "step": 1915 + }, + { + "epoch": 1.7029218975669371, + "grad_norm": 0.37665289640426636, + "learning_rate": 0.0001, + "loss": 0.9769, + "step": 1916 + }, + { + "epoch": 1.7038106877013666, + "grad_norm": 0.3486294746398926, + "learning_rate": 0.0001, + "loss": 0.9428, + "step": 1917 + }, + { + "epoch": 1.704699477835796, + "grad_norm": 0.40671101212501526, + "learning_rate": 0.0001, + "loss": 0.9127, + "step": 1918 + }, + { + "epoch": 1.7055882679702257, + "grad_norm": 0.35006022453308105, + "learning_rate": 0.0001, + "loss": 0.9617, + "step": 1919 + }, + { + "epoch": 1.7064770581046549, + "grad_norm": 0.36815375089645386, + "learning_rate": 0.0001, + "loss": 0.9224, + "step": 1920 + }, + { + "epoch": 1.7073658482390845, + "grad_norm": 0.34530341625213623, + "learning_rate": 0.0001, + "loss": 0.8884, + "step": 1921 + }, + { + "epoch": 1.7082546383735142, + "grad_norm": 0.3507446348667145, + "learning_rate": 0.0001, + "loss": 0.9667, + "step": 1922 + }, + { + "epoch": 1.7091434285079434, + "grad_norm": 0.34675371646881104, + "learning_rate": 0.0001, + "loss": 0.8347, + "step": 1923 + }, + { + "epoch": 1.710032218642373, + "grad_norm": 0.34880420565605164, + "learning_rate": 0.0001, + "loss": 0.8989, + "step": 1924 + }, + { + "epoch": 1.7109210087768025, + "grad_norm": 0.35518354177474976, + "learning_rate": 0.0001, + "loss": 0.8713, + "step": 1925 + }, + { + "epoch": 1.711809798911232, + "grad_norm": 0.34916481375694275, + "learning_rate": 0.0001, + "loss": 0.8733, + "step": 1926 + }, + { + "epoch": 1.7126985890456616, + "grad_norm": 0.3095919191837311, + "learning_rate": 0.0001, + "loss": 0.9293, + "step": 1927 + }, + { + "epoch": 1.713587379180091, + "grad_norm": 0.36070945858955383, + "learning_rate": 0.0001, + "loss": 0.9314, + "step": 1928 + }, + { + "epoch": 1.7144761693145205, + "grad_norm": 0.36071017384529114, + "learning_rate": 0.0001, + "loss": 1.0066, + "step": 1929 + }, + { + "epoch": 1.7153649594489502, + "grad_norm": 0.3244760036468506, + "learning_rate": 0.0001, + "loss": 0.8946, + "step": 1930 + }, + { + "epoch": 1.7162537495833796, + "grad_norm": 0.34101998805999756, + "learning_rate": 0.0001, + "loss": 0.8418, + "step": 1931 + }, + { + "epoch": 1.717142539717809, + "grad_norm": 0.38111263513565063, + "learning_rate": 0.0001, + "loss": 0.855, + "step": 1932 + }, + { + "epoch": 1.7180313298522387, + "grad_norm": 0.3378105163574219, + "learning_rate": 0.0001, + "loss": 0.8144, + "step": 1933 + }, + { + "epoch": 1.7189201199866682, + "grad_norm": 0.3542186915874481, + "learning_rate": 0.0001, + "loss": 0.8838, + "step": 1934 + }, + { + "epoch": 1.7198089101210976, + "grad_norm": 0.3762187957763672, + "learning_rate": 0.0001, + "loss": 0.8893, + "step": 1935 + }, + { + "epoch": 1.7206977002555273, + "grad_norm": 0.3594928979873657, + "learning_rate": 0.0001, + "loss": 0.9449, + "step": 1936 + }, + { + "epoch": 1.7215864903899567, + "grad_norm": 0.3138940632343292, + "learning_rate": 0.0001, + "loss": 0.9581, + "step": 1937 + }, + { + "epoch": 1.7224752805243861, + "grad_norm": 0.3370933532714844, + "learning_rate": 0.0001, + "loss": 0.903, + "step": 1938 + }, + { + "epoch": 1.7233640706588158, + "grad_norm": 0.3399626612663269, + "learning_rate": 0.0001, + "loss": 0.8864, + "step": 1939 + }, + { + "epoch": 1.7242528607932452, + "grad_norm": 0.329193115234375, + "learning_rate": 0.0001, + "loss": 0.9119, + "step": 1940 + }, + { + "epoch": 1.7251416509276747, + "grad_norm": 0.3717508316040039, + "learning_rate": 0.0001, + "loss": 0.8974, + "step": 1941 + }, + { + "epoch": 1.7260304410621043, + "grad_norm": 0.3529798090457916, + "learning_rate": 0.0001, + "loss": 0.9278, + "step": 1942 + }, + { + "epoch": 1.7269192311965336, + "grad_norm": 0.38855284452438354, + "learning_rate": 0.0001, + "loss": 0.8737, + "step": 1943 + }, + { + "epoch": 1.7278080213309632, + "grad_norm": 0.3287023901939392, + "learning_rate": 0.0001, + "loss": 0.9743, + "step": 1944 + }, + { + "epoch": 1.728696811465393, + "grad_norm": 0.3897152841091156, + "learning_rate": 0.0001, + "loss": 0.8451, + "step": 1945 + }, + { + "epoch": 1.729585601599822, + "grad_norm": 0.3124273419380188, + "learning_rate": 0.0001, + "loss": 0.8386, + "step": 1946 + }, + { + "epoch": 1.7304743917342518, + "grad_norm": 0.2802036702632904, + "learning_rate": 0.0001, + "loss": 0.8524, + "step": 1947 + }, + { + "epoch": 1.7313631818686812, + "grad_norm": 0.33942630887031555, + "learning_rate": 0.0001, + "loss": 0.8643, + "step": 1948 + }, + { + "epoch": 1.7322519720031107, + "grad_norm": 0.3702329695224762, + "learning_rate": 0.0001, + "loss": 0.9303, + "step": 1949 + }, + { + "epoch": 1.7331407621375403, + "grad_norm": 0.31775155663490295, + "learning_rate": 0.0001, + "loss": 0.8537, + "step": 1950 + }, + { + "epoch": 1.7340295522719698, + "grad_norm": 0.3505903482437134, + "learning_rate": 0.0001, + "loss": 0.8882, + "step": 1951 + }, + { + "epoch": 1.7349183424063992, + "grad_norm": 0.3102082908153534, + "learning_rate": 0.0001, + "loss": 0.8552, + "step": 1952 + }, + { + "epoch": 1.7358071325408289, + "grad_norm": 0.40489935874938965, + "learning_rate": 0.0001, + "loss": 0.945, + "step": 1953 + }, + { + "epoch": 1.7366959226752583, + "grad_norm": 0.35709646344184875, + "learning_rate": 0.0001, + "loss": 0.8496, + "step": 1954 + }, + { + "epoch": 1.7375847128096877, + "grad_norm": 0.33537212014198303, + "learning_rate": 0.0001, + "loss": 0.9205, + "step": 1955 + }, + { + "epoch": 1.7384735029441174, + "grad_norm": 0.3619324564933777, + "learning_rate": 0.0001, + "loss": 0.9342, + "step": 1956 + }, + { + "epoch": 1.7393622930785468, + "grad_norm": 0.31395861506462097, + "learning_rate": 0.0001, + "loss": 0.8663, + "step": 1957 + }, + { + "epoch": 1.7402510832129763, + "grad_norm": 0.3466954827308655, + "learning_rate": 0.0001, + "loss": 0.8824, + "step": 1958 + }, + { + "epoch": 1.741139873347406, + "grad_norm": 0.34001612663269043, + "learning_rate": 0.0001, + "loss": 0.9166, + "step": 1959 + }, + { + "epoch": 1.7420286634818354, + "grad_norm": 0.31668078899383545, + "learning_rate": 0.0001, + "loss": 0.8834, + "step": 1960 + }, + { + "epoch": 1.7429174536162648, + "grad_norm": 0.33864402770996094, + "learning_rate": 0.0001, + "loss": 0.9826, + "step": 1961 + }, + { + "epoch": 1.7438062437506945, + "grad_norm": 0.34717845916748047, + "learning_rate": 0.0001, + "loss": 0.9625, + "step": 1962 + }, + { + "epoch": 1.744695033885124, + "grad_norm": 0.3487595021724701, + "learning_rate": 0.0001, + "loss": 0.9137, + "step": 1963 + }, + { + "epoch": 1.7455838240195534, + "grad_norm": 0.28423944115638733, + "learning_rate": 0.0001, + "loss": 0.8175, + "step": 1964 + }, + { + "epoch": 1.746472614153983, + "grad_norm": 0.4026448428630829, + "learning_rate": 0.0001, + "loss": 0.9801, + "step": 1965 + }, + { + "epoch": 1.7473614042884122, + "grad_norm": 0.34367966651916504, + "learning_rate": 0.0001, + "loss": 0.908, + "step": 1966 + }, + { + "epoch": 1.748250194422842, + "grad_norm": 0.3341876268386841, + "learning_rate": 0.0001, + "loss": 0.9105, + "step": 1967 + }, + { + "epoch": 1.7491389845572716, + "grad_norm": 0.3579274117946625, + "learning_rate": 0.0001, + "loss": 0.8702, + "step": 1968 + }, + { + "epoch": 1.7500277746917008, + "grad_norm": 0.30886539816856384, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 1969 + }, + { + "epoch": 1.7509165648261305, + "grad_norm": 0.33523109555244446, + "learning_rate": 0.0001, + "loss": 0.8876, + "step": 1970 + }, + { + "epoch": 1.75180535496056, + "grad_norm": 0.32511594891548157, + "learning_rate": 0.0001, + "loss": 0.9099, + "step": 1971 + }, + { + "epoch": 1.7526941450949893, + "grad_norm": 0.3819718658924103, + "learning_rate": 0.0001, + "loss": 0.9776, + "step": 1972 + }, + { + "epoch": 1.753582935229419, + "grad_norm": 0.32984215021133423, + "learning_rate": 0.0001, + "loss": 0.9106, + "step": 1973 + }, + { + "epoch": 1.7544717253638484, + "grad_norm": 0.4052921235561371, + "learning_rate": 0.0001, + "loss": 0.9176, + "step": 1974 + }, + { + "epoch": 1.7553605154982779, + "grad_norm": 0.3490274250507355, + "learning_rate": 0.0001, + "loss": 0.9902, + "step": 1975 + }, + { + "epoch": 1.7562493056327075, + "grad_norm": 0.37016966938972473, + "learning_rate": 0.0001, + "loss": 0.9497, + "step": 1976 + }, + { + "epoch": 1.757138095767137, + "grad_norm": 0.3575468957424164, + "learning_rate": 0.0001, + "loss": 0.864, + "step": 1977 + }, + { + "epoch": 1.7580268859015664, + "grad_norm": 0.34194570779800415, + "learning_rate": 0.0001, + "loss": 0.9862, + "step": 1978 + }, + { + "epoch": 1.758915676035996, + "grad_norm": 0.361395001411438, + "learning_rate": 0.0001, + "loss": 0.8745, + "step": 1979 + }, + { + "epoch": 1.7598044661704255, + "grad_norm": 0.3402308225631714, + "learning_rate": 0.0001, + "loss": 0.9091, + "step": 1980 + }, + { + "epoch": 1.760693256304855, + "grad_norm": 0.3822394013404846, + "learning_rate": 0.0001, + "loss": 0.9479, + "step": 1981 + }, + { + "epoch": 1.7615820464392846, + "grad_norm": 0.3259856700897217, + "learning_rate": 0.0001, + "loss": 0.912, + "step": 1982 + }, + { + "epoch": 1.762470836573714, + "grad_norm": 0.40468284487724304, + "learning_rate": 0.0001, + "loss": 0.8525, + "step": 1983 + }, + { + "epoch": 1.7633596267081435, + "grad_norm": 0.4105701446533203, + "learning_rate": 0.0001, + "loss": 0.8759, + "step": 1984 + }, + { + "epoch": 1.7642484168425732, + "grad_norm": 0.5805673003196716, + "learning_rate": 0.0001, + "loss": 1.0095, + "step": 1985 + }, + { + "epoch": 1.7651372069770026, + "grad_norm": 0.6856684684753418, + "learning_rate": 0.0001, + "loss": 0.9672, + "step": 1986 + }, + { + "epoch": 1.766025997111432, + "grad_norm": 0.3889956474304199, + "learning_rate": 0.0001, + "loss": 0.8923, + "step": 1987 + }, + { + "epoch": 1.7669147872458617, + "grad_norm": 0.37936434149742126, + "learning_rate": 0.0001, + "loss": 0.9341, + "step": 1988 + }, + { + "epoch": 1.767803577380291, + "grad_norm": 0.34393537044525146, + "learning_rate": 0.0001, + "loss": 0.8836, + "step": 1989 + }, + { + "epoch": 1.7686923675147206, + "grad_norm": 0.34456318616867065, + "learning_rate": 0.0001, + "loss": 0.9741, + "step": 1990 + }, + { + "epoch": 1.7695811576491502, + "grad_norm": 0.32842886447906494, + "learning_rate": 0.0001, + "loss": 0.9137, + "step": 1991 + }, + { + "epoch": 1.7704699477835795, + "grad_norm": 0.35680335760116577, + "learning_rate": 0.0001, + "loss": 0.8332, + "step": 1992 + }, + { + "epoch": 1.7713587379180091, + "grad_norm": 0.3687342703342438, + "learning_rate": 0.0001, + "loss": 0.9065, + "step": 1993 + }, + { + "epoch": 1.7722475280524386, + "grad_norm": 0.39165356755256653, + "learning_rate": 0.0001, + "loss": 0.933, + "step": 1994 + }, + { + "epoch": 1.773136318186868, + "grad_norm": 0.36600562930107117, + "learning_rate": 0.0001, + "loss": 0.8694, + "step": 1995 + }, + { + "epoch": 1.7740251083212977, + "grad_norm": 0.3371865749359131, + "learning_rate": 0.0001, + "loss": 0.9622, + "step": 1996 + }, + { + "epoch": 1.7749138984557271, + "grad_norm": 0.3732564449310303, + "learning_rate": 0.0001, + "loss": 1.0046, + "step": 1997 + }, + { + "epoch": 1.7758026885901566, + "grad_norm": 0.33094069361686707, + "learning_rate": 0.0001, + "loss": 0.8466, + "step": 1998 + }, + { + "epoch": 1.7766914787245862, + "grad_norm": 0.3384708762168884, + "learning_rate": 0.0001, + "loss": 0.855, + "step": 1999 + }, + { + "epoch": 1.7775802688590157, + "grad_norm": 0.3246244788169861, + "learning_rate": 0.0001, + "loss": 0.9129, + "step": 2000 + }, + { + "epoch": 1.778469058993445, + "grad_norm": 0.3717402517795563, + "learning_rate": 0.0001, + "loss": 0.9298, + "step": 2001 + }, + { + "epoch": 1.7793578491278748, + "grad_norm": 0.4107438325881958, + "learning_rate": 0.0001, + "loss": 0.9007, + "step": 2002 + }, + { + "epoch": 1.7802466392623042, + "grad_norm": 0.355356901884079, + "learning_rate": 0.0001, + "loss": 0.8573, + "step": 2003 + }, + { + "epoch": 1.7811354293967336, + "grad_norm": 0.34428539872169495, + "learning_rate": 0.0001, + "loss": 0.9481, + "step": 2004 + }, + { + "epoch": 1.7820242195311633, + "grad_norm": 0.3407171666622162, + "learning_rate": 0.0001, + "loss": 0.9211, + "step": 2005 + }, + { + "epoch": 1.7829130096655927, + "grad_norm": 0.3513103425502777, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 2006 + }, + { + "epoch": 1.7838017998000222, + "grad_norm": 0.32477250695228577, + "learning_rate": 0.0001, + "loss": 0.8792, + "step": 2007 + }, + { + "epoch": 1.7846905899344518, + "grad_norm": 0.3490789234638214, + "learning_rate": 0.0001, + "loss": 0.9442, + "step": 2008 + }, + { + "epoch": 1.7855793800688813, + "grad_norm": 0.3724386692047119, + "learning_rate": 0.0001, + "loss": 0.9503, + "step": 2009 + }, + { + "epoch": 1.7864681702033107, + "grad_norm": 0.3316227197647095, + "learning_rate": 0.0001, + "loss": 0.9598, + "step": 2010 + }, + { + "epoch": 1.7873569603377404, + "grad_norm": 0.35122936964035034, + "learning_rate": 0.0001, + "loss": 0.8149, + "step": 2011 + }, + { + "epoch": 1.7882457504721696, + "grad_norm": 0.4029070734977722, + "learning_rate": 0.0001, + "loss": 0.9305, + "step": 2012 + }, + { + "epoch": 1.7891345406065993, + "grad_norm": 0.31067872047424316, + "learning_rate": 0.0001, + "loss": 0.8878, + "step": 2013 + }, + { + "epoch": 1.790023330741029, + "grad_norm": 0.34684422612190247, + "learning_rate": 0.0001, + "loss": 0.9143, + "step": 2014 + }, + { + "epoch": 1.7909121208754581, + "grad_norm": 0.32135358452796936, + "learning_rate": 0.0001, + "loss": 0.926, + "step": 2015 + }, + { + "epoch": 1.7918009110098878, + "grad_norm": 0.32894784212112427, + "learning_rate": 0.0001, + "loss": 0.9249, + "step": 2016 + }, + { + "epoch": 1.7926897011443172, + "grad_norm": 0.35362952947616577, + "learning_rate": 0.0001, + "loss": 0.9004, + "step": 2017 + }, + { + "epoch": 1.7935784912787467, + "grad_norm": 0.3030708432197571, + "learning_rate": 0.0001, + "loss": 1.0155, + "step": 2018 + }, + { + "epoch": 1.7944672814131764, + "grad_norm": 0.3144596219062805, + "learning_rate": 0.0001, + "loss": 0.9367, + "step": 2019 + }, + { + "epoch": 1.7953560715476058, + "grad_norm": 0.34851983189582825, + "learning_rate": 0.0001, + "loss": 0.8684, + "step": 2020 + }, + { + "epoch": 1.7962448616820352, + "grad_norm": 0.32248032093048096, + "learning_rate": 0.0001, + "loss": 0.8977, + "step": 2021 + }, + { + "epoch": 1.797133651816465, + "grad_norm": 0.3193919062614441, + "learning_rate": 0.0001, + "loss": 0.9297, + "step": 2022 + }, + { + "epoch": 1.7980224419508943, + "grad_norm": 0.3607577085494995, + "learning_rate": 0.0001, + "loss": 0.9774, + "step": 2023 + }, + { + "epoch": 1.7989112320853238, + "grad_norm": 0.30439522862434387, + "learning_rate": 0.0001, + "loss": 0.9313, + "step": 2024 + }, + { + "epoch": 1.7998000222197534, + "grad_norm": 0.31033584475517273, + "learning_rate": 0.0001, + "loss": 0.9535, + "step": 2025 + }, + { + "epoch": 1.8006888123541829, + "grad_norm": 0.34924376010894775, + "learning_rate": 0.0001, + "loss": 1.0392, + "step": 2026 + }, + { + "epoch": 1.8015776024886123, + "grad_norm": 0.3580887019634247, + "learning_rate": 0.0001, + "loss": 0.8772, + "step": 2027 + }, + { + "epoch": 1.802466392623042, + "grad_norm": 0.30029070377349854, + "learning_rate": 0.0001, + "loss": 0.904, + "step": 2028 + }, + { + "epoch": 1.8033551827574714, + "grad_norm": 0.36260783672332764, + "learning_rate": 0.0001, + "loss": 0.8598, + "step": 2029 + }, + { + "epoch": 1.8042439728919009, + "grad_norm": 0.3398006558418274, + "learning_rate": 0.0001, + "loss": 0.926, + "step": 2030 + }, + { + "epoch": 1.8051327630263305, + "grad_norm": 0.31382572650909424, + "learning_rate": 0.0001, + "loss": 0.871, + "step": 2031 + }, + { + "epoch": 1.80602155316076, + "grad_norm": 0.33854883909225464, + "learning_rate": 0.0001, + "loss": 0.9493, + "step": 2032 + }, + { + "epoch": 1.8069103432951894, + "grad_norm": 0.334824800491333, + "learning_rate": 0.0001, + "loss": 0.8451, + "step": 2033 + }, + { + "epoch": 1.807799133429619, + "grad_norm": 0.31903523206710815, + "learning_rate": 0.0001, + "loss": 0.8204, + "step": 2034 + }, + { + "epoch": 1.8086879235640483, + "grad_norm": 0.351361483335495, + "learning_rate": 0.0001, + "loss": 0.8484, + "step": 2035 + }, + { + "epoch": 1.809576713698478, + "grad_norm": 0.3341825604438782, + "learning_rate": 0.0001, + "loss": 0.9075, + "step": 2036 + }, + { + "epoch": 1.8104655038329076, + "grad_norm": 0.3398033678531647, + "learning_rate": 0.0001, + "loss": 0.9491, + "step": 2037 + }, + { + "epoch": 1.8113542939673368, + "grad_norm": 0.3391786813735962, + "learning_rate": 0.0001, + "loss": 0.9576, + "step": 2038 + }, + { + "epoch": 1.8122430841017665, + "grad_norm": 0.31477200984954834, + "learning_rate": 0.0001, + "loss": 0.8863, + "step": 2039 + }, + { + "epoch": 1.813131874236196, + "grad_norm": 0.31994014978408813, + "learning_rate": 0.0001, + "loss": 0.903, + "step": 2040 + }, + { + "epoch": 1.8140206643706254, + "grad_norm": 0.38486775755882263, + "learning_rate": 0.0001, + "loss": 0.9182, + "step": 2041 + }, + { + "epoch": 1.814909454505055, + "grad_norm": 0.3258659839630127, + "learning_rate": 0.0001, + "loss": 0.8561, + "step": 2042 + }, + { + "epoch": 1.8157982446394845, + "grad_norm": 0.36081910133361816, + "learning_rate": 0.0001, + "loss": 0.9326, + "step": 2043 + }, + { + "epoch": 1.816687034773914, + "grad_norm": 0.33326515555381775, + "learning_rate": 0.0001, + "loss": 0.941, + "step": 2044 + }, + { + "epoch": 1.8175758249083436, + "grad_norm": 0.3134666383266449, + "learning_rate": 0.0001, + "loss": 0.7828, + "step": 2045 + }, + { + "epoch": 1.818464615042773, + "grad_norm": 0.318588525056839, + "learning_rate": 0.0001, + "loss": 0.974, + "step": 2046 + }, + { + "epoch": 1.8193534051772025, + "grad_norm": 0.3159397840499878, + "learning_rate": 0.0001, + "loss": 0.8277, + "step": 2047 + }, + { + "epoch": 1.8202421953116321, + "grad_norm": 0.3912515938282013, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 2048 + }, + { + "epoch": 1.8211309854460616, + "grad_norm": 0.3366560935974121, + "learning_rate": 0.0001, + "loss": 0.9652, + "step": 2049 + }, + { + "epoch": 1.822019775580491, + "grad_norm": 0.33880704641342163, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 2050 + }, + { + "epoch": 1.8229085657149207, + "grad_norm": 0.3547055721282959, + "learning_rate": 0.0001, + "loss": 0.8965, + "step": 2051 + }, + { + "epoch": 1.82379735584935, + "grad_norm": 0.3428584337234497, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 2052 + }, + { + "epoch": 1.8246861459837795, + "grad_norm": 0.3745492398738861, + "learning_rate": 0.0001, + "loss": 0.9322, + "step": 2053 + }, + { + "epoch": 1.8255749361182092, + "grad_norm": 0.32607874274253845, + "learning_rate": 0.0001, + "loss": 0.8768, + "step": 2054 + }, + { + "epoch": 1.8264637262526386, + "grad_norm": 0.5843383073806763, + "learning_rate": 0.0001, + "loss": 0.9578, + "step": 2055 + }, + { + "epoch": 1.827352516387068, + "grad_norm": 0.3096837103366852, + "learning_rate": 0.0001, + "loss": 0.8705, + "step": 2056 + }, + { + "epoch": 1.8282413065214977, + "grad_norm": 0.32685980200767517, + "learning_rate": 0.0001, + "loss": 0.907, + "step": 2057 + }, + { + "epoch": 1.829130096655927, + "grad_norm": 0.3481847643852234, + "learning_rate": 0.0001, + "loss": 0.9312, + "step": 2058 + }, + { + "epoch": 1.8300188867903566, + "grad_norm": 0.3510027229785919, + "learning_rate": 0.0001, + "loss": 0.9272, + "step": 2059 + }, + { + "epoch": 1.8309076769247863, + "grad_norm": 0.3125600814819336, + "learning_rate": 0.0001, + "loss": 0.8919, + "step": 2060 + }, + { + "epoch": 1.8317964670592155, + "grad_norm": 0.343363881111145, + "learning_rate": 0.0001, + "loss": 0.9294, + "step": 2061 + }, + { + "epoch": 1.8326852571936452, + "grad_norm": 0.30843785405158997, + "learning_rate": 0.0001, + "loss": 0.9, + "step": 2062 + }, + { + "epoch": 1.8335740473280746, + "grad_norm": 0.38246282935142517, + "learning_rate": 0.0001, + "loss": 0.9301, + "step": 2063 + }, + { + "epoch": 1.834462837462504, + "grad_norm": 0.30538472533226013, + "learning_rate": 0.0001, + "loss": 1.0181, + "step": 2064 + }, + { + "epoch": 1.8353516275969337, + "grad_norm": 0.3176038861274719, + "learning_rate": 0.0001, + "loss": 0.8301, + "step": 2065 + }, + { + "epoch": 1.8362404177313632, + "grad_norm": 0.3275960683822632, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 2066 + }, + { + "epoch": 1.8371292078657926, + "grad_norm": 0.3405584394931793, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 2067 + }, + { + "epoch": 1.8380179980002223, + "grad_norm": 0.3240833580493927, + "learning_rate": 0.0001, + "loss": 0.8733, + "step": 2068 + }, + { + "epoch": 1.8389067881346517, + "grad_norm": 0.3166552484035492, + "learning_rate": 0.0001, + "loss": 0.9458, + "step": 2069 + }, + { + "epoch": 1.8397955782690811, + "grad_norm": 0.3474705219268799, + "learning_rate": 0.0001, + "loss": 0.8703, + "step": 2070 + }, + { + "epoch": 1.8406843684035108, + "grad_norm": 0.3441227376461029, + "learning_rate": 0.0001, + "loss": 0.9149, + "step": 2071 + }, + { + "epoch": 1.8415731585379402, + "grad_norm": 0.3015057146549225, + "learning_rate": 0.0001, + "loss": 0.8754, + "step": 2072 + }, + { + "epoch": 1.8424619486723697, + "grad_norm": 0.33986207842826843, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 2073 + }, + { + "epoch": 1.8433507388067993, + "grad_norm": 0.33093711733818054, + "learning_rate": 0.0001, + "loss": 0.9072, + "step": 2074 + }, + { + "epoch": 1.8442395289412288, + "grad_norm": 0.338056743144989, + "learning_rate": 0.0001, + "loss": 0.8816, + "step": 2075 + }, + { + "epoch": 1.8451283190756582, + "grad_norm": 0.32622647285461426, + "learning_rate": 0.0001, + "loss": 0.9017, + "step": 2076 + }, + { + "epoch": 1.8460171092100879, + "grad_norm": 0.33805835247039795, + "learning_rate": 0.0001, + "loss": 0.9135, + "step": 2077 + }, + { + "epoch": 1.8469058993445173, + "grad_norm": 0.3360745906829834, + "learning_rate": 0.0001, + "loss": 0.8238, + "step": 2078 + }, + { + "epoch": 1.8477946894789468, + "grad_norm": 0.3394176959991455, + "learning_rate": 0.0001, + "loss": 0.9539, + "step": 2079 + }, + { + "epoch": 1.8486834796133764, + "grad_norm": 0.3259996175765991, + "learning_rate": 0.0001, + "loss": 0.8874, + "step": 2080 + }, + { + "epoch": 1.8495722697478056, + "grad_norm": 0.30118152499198914, + "learning_rate": 0.0001, + "loss": 0.8875, + "step": 2081 + }, + { + "epoch": 1.8504610598822353, + "grad_norm": 0.3274901211261749, + "learning_rate": 0.0001, + "loss": 0.9579, + "step": 2082 + }, + { + "epoch": 1.851349850016665, + "grad_norm": 0.34811556339263916, + "learning_rate": 0.0001, + "loss": 0.9363, + "step": 2083 + }, + { + "epoch": 1.8522386401510942, + "grad_norm": 0.35961511731147766, + "learning_rate": 0.0001, + "loss": 0.8803, + "step": 2084 + }, + { + "epoch": 1.8531274302855238, + "grad_norm": 0.3167450726032257, + "learning_rate": 0.0001, + "loss": 0.8289, + "step": 2085 + }, + { + "epoch": 1.8540162204199533, + "grad_norm": 0.32763344049453735, + "learning_rate": 0.0001, + "loss": 0.8965, + "step": 2086 + }, + { + "epoch": 1.8549050105543827, + "grad_norm": 0.32265403866767883, + "learning_rate": 0.0001, + "loss": 0.9522, + "step": 2087 + }, + { + "epoch": 1.8557938006888124, + "grad_norm": 0.3623351752758026, + "learning_rate": 0.0001, + "loss": 0.9497, + "step": 2088 + }, + { + "epoch": 1.8566825908232418, + "grad_norm": 0.34022286534309387, + "learning_rate": 0.0001, + "loss": 0.9482, + "step": 2089 + }, + { + "epoch": 1.8575713809576713, + "grad_norm": 0.34971117973327637, + "learning_rate": 0.0001, + "loss": 0.9088, + "step": 2090 + }, + { + "epoch": 1.858460171092101, + "grad_norm": 0.35238999128341675, + "learning_rate": 0.0001, + "loss": 0.893, + "step": 2091 + }, + { + "epoch": 1.8593489612265304, + "grad_norm": 0.3588760197162628, + "learning_rate": 0.0001, + "loss": 0.9416, + "step": 2092 + }, + { + "epoch": 1.8602377513609598, + "grad_norm": 0.3537149131298065, + "learning_rate": 0.0001, + "loss": 0.9008, + "step": 2093 + }, + { + "epoch": 1.8611265414953895, + "grad_norm": 0.35885384678840637, + "learning_rate": 0.0001, + "loss": 0.9227, + "step": 2094 + }, + { + "epoch": 1.862015331629819, + "grad_norm": 0.44651269912719727, + "learning_rate": 0.0001, + "loss": 0.8945, + "step": 2095 + }, + { + "epoch": 1.8629041217642484, + "grad_norm": 0.8166044354438782, + "learning_rate": 0.0001, + "loss": 0.9453, + "step": 2096 + }, + { + "epoch": 1.863792911898678, + "grad_norm": 0.4329817593097687, + "learning_rate": 0.0001, + "loss": 0.9554, + "step": 2097 + }, + { + "epoch": 1.8646817020331075, + "grad_norm": 0.3563162684440613, + "learning_rate": 0.0001, + "loss": 0.9519, + "step": 2098 + }, + { + "epoch": 1.865570492167537, + "grad_norm": 0.3440234065055847, + "learning_rate": 0.0001, + "loss": 0.9086, + "step": 2099 + }, + { + "epoch": 1.8664592823019666, + "grad_norm": 0.5276257991790771, + "learning_rate": 0.0001, + "loss": 0.9137, + "step": 2100 + }, + { + "epoch": 1.867348072436396, + "grad_norm": 0.39348065853118896, + "learning_rate": 0.0001, + "loss": 0.9041, + "step": 2101 + }, + { + "epoch": 1.8682368625708254, + "grad_norm": 0.3502536416053772, + "learning_rate": 0.0001, + "loss": 0.9357, + "step": 2102 + }, + { + "epoch": 1.869125652705255, + "grad_norm": 0.3747837245464325, + "learning_rate": 0.0001, + "loss": 0.9268, + "step": 2103 + }, + { + "epoch": 1.8700144428396843, + "grad_norm": 0.3654274344444275, + "learning_rate": 0.0001, + "loss": 0.9001, + "step": 2104 + }, + { + "epoch": 1.870903232974114, + "grad_norm": 0.34981828927993774, + "learning_rate": 0.0001, + "loss": 0.8879, + "step": 2105 + }, + { + "epoch": 1.8717920231085436, + "grad_norm": 0.34973669052124023, + "learning_rate": 0.0001, + "loss": 0.9614, + "step": 2106 + }, + { + "epoch": 1.8726808132429729, + "grad_norm": 0.35739874839782715, + "learning_rate": 0.0001, + "loss": 0.9628, + "step": 2107 + }, + { + "epoch": 1.8735696033774025, + "grad_norm": 0.33049049973487854, + "learning_rate": 0.0001, + "loss": 0.8869, + "step": 2108 + }, + { + "epoch": 1.874458393511832, + "grad_norm": 0.3104158639907837, + "learning_rate": 0.0001, + "loss": 0.8905, + "step": 2109 + }, + { + "epoch": 1.8753471836462614, + "grad_norm": 0.3537830114364624, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 2110 + }, + { + "epoch": 1.876235973780691, + "grad_norm": 0.36299172043800354, + "learning_rate": 0.0001, + "loss": 0.8937, + "step": 2111 + }, + { + "epoch": 1.8771247639151205, + "grad_norm": 0.30172330141067505, + "learning_rate": 0.0001, + "loss": 0.8999, + "step": 2112 + }, + { + "epoch": 1.87801355404955, + "grad_norm": 0.34734198451042175, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 2113 + }, + { + "epoch": 1.8789023441839796, + "grad_norm": 0.3040066659450531, + "learning_rate": 0.0001, + "loss": 0.8276, + "step": 2114 + }, + { + "epoch": 1.879791134318409, + "grad_norm": 0.38280028104782104, + "learning_rate": 0.0001, + "loss": 0.8841, + "step": 2115 + }, + { + "epoch": 1.8806799244528385, + "grad_norm": 0.3382204473018646, + "learning_rate": 0.0001, + "loss": 0.9274, + "step": 2116 + }, + { + "epoch": 1.8815687145872682, + "grad_norm": 0.3138622045516968, + "learning_rate": 0.0001, + "loss": 0.8872, + "step": 2117 + }, + { + "epoch": 1.8824575047216976, + "grad_norm": 0.37905508279800415, + "learning_rate": 0.0001, + "loss": 0.9759, + "step": 2118 + }, + { + "epoch": 1.883346294856127, + "grad_norm": 0.36490878462791443, + "learning_rate": 0.0001, + "loss": 0.935, + "step": 2119 + }, + { + "epoch": 1.8842350849905567, + "grad_norm": 0.30611562728881836, + "learning_rate": 0.0001, + "loss": 0.8862, + "step": 2120 + }, + { + "epoch": 1.8851238751249861, + "grad_norm": 0.30281272530555725, + "learning_rate": 0.0001, + "loss": 0.9486, + "step": 2121 + }, + { + "epoch": 1.8860126652594156, + "grad_norm": 0.3309618830680847, + "learning_rate": 0.0001, + "loss": 0.9686, + "step": 2122 + }, + { + "epoch": 1.8869014553938452, + "grad_norm": 0.36201414465904236, + "learning_rate": 0.0001, + "loss": 0.8934, + "step": 2123 + }, + { + "epoch": 1.8877902455282747, + "grad_norm": 0.3028503656387329, + "learning_rate": 0.0001, + "loss": 0.916, + "step": 2124 + }, + { + "epoch": 1.8886790356627041, + "grad_norm": 0.3417774736881256, + "learning_rate": 0.0001, + "loss": 0.96, + "step": 2125 + }, + { + "epoch": 1.8895678257971338, + "grad_norm": 0.3560897707939148, + "learning_rate": 0.0001, + "loss": 0.9264, + "step": 2126 + }, + { + "epoch": 1.890456615931563, + "grad_norm": 0.33282026648521423, + "learning_rate": 0.0001, + "loss": 0.9066, + "step": 2127 + }, + { + "epoch": 1.8913454060659927, + "grad_norm": 0.3309515416622162, + "learning_rate": 0.0001, + "loss": 0.8867, + "step": 2128 + }, + { + "epoch": 1.8922341962004223, + "grad_norm": 0.32819992303848267, + "learning_rate": 0.0001, + "loss": 0.9102, + "step": 2129 + }, + { + "epoch": 1.8931229863348515, + "grad_norm": 0.3443058133125305, + "learning_rate": 0.0001, + "loss": 0.9559, + "step": 2130 + }, + { + "epoch": 1.8940117764692812, + "grad_norm": 0.3516992926597595, + "learning_rate": 0.0001, + "loss": 0.917, + "step": 2131 + }, + { + "epoch": 1.8949005666037106, + "grad_norm": 0.32620400190353394, + "learning_rate": 0.0001, + "loss": 0.8944, + "step": 2132 + }, + { + "epoch": 1.89578935673814, + "grad_norm": 0.34699490666389465, + "learning_rate": 0.0001, + "loss": 0.87, + "step": 2133 + }, + { + "epoch": 1.8966781468725697, + "grad_norm": 0.32767391204833984, + "learning_rate": 0.0001, + "loss": 0.9523, + "step": 2134 + }, + { + "epoch": 1.8975669370069992, + "grad_norm": 0.34350037574768066, + "learning_rate": 0.0001, + "loss": 0.9449, + "step": 2135 + }, + { + "epoch": 1.8984557271414286, + "grad_norm": 0.36585548520088196, + "learning_rate": 0.0001, + "loss": 0.9338, + "step": 2136 + }, + { + "epoch": 1.8993445172758583, + "grad_norm": 0.3513524830341339, + "learning_rate": 0.0001, + "loss": 0.9429, + "step": 2137 + }, + { + "epoch": 1.9002333074102877, + "grad_norm": 0.3541501462459564, + "learning_rate": 0.0001, + "loss": 0.8976, + "step": 2138 + }, + { + "epoch": 1.9011220975447172, + "grad_norm": 0.3008817434310913, + "learning_rate": 0.0001, + "loss": 0.8689, + "step": 2139 + }, + { + "epoch": 1.9020108876791468, + "grad_norm": 0.3268721401691437, + "learning_rate": 0.0001, + "loss": 0.8712, + "step": 2140 + }, + { + "epoch": 1.9028996778135763, + "grad_norm": 0.3459062874317169, + "learning_rate": 0.0001, + "loss": 0.9399, + "step": 2141 + }, + { + "epoch": 1.9037884679480057, + "grad_norm": 0.3577088713645935, + "learning_rate": 0.0001, + "loss": 0.9029, + "step": 2142 + }, + { + "epoch": 1.9046772580824354, + "grad_norm": 0.34948304295539856, + "learning_rate": 0.0001, + "loss": 0.8714, + "step": 2143 + }, + { + "epoch": 1.9055660482168648, + "grad_norm": 0.34985479712486267, + "learning_rate": 0.0001, + "loss": 0.902, + "step": 2144 + }, + { + "epoch": 1.9064548383512943, + "grad_norm": 0.32033130526542664, + "learning_rate": 0.0001, + "loss": 0.8456, + "step": 2145 + }, + { + "epoch": 1.907343628485724, + "grad_norm": 0.32239830493927, + "learning_rate": 0.0001, + "loss": 0.9245, + "step": 2146 + }, + { + "epoch": 1.9082324186201534, + "grad_norm": 0.35271769762039185, + "learning_rate": 0.0001, + "loss": 0.9427, + "step": 2147 + }, + { + "epoch": 1.9091212087545828, + "grad_norm": 0.34471169114112854, + "learning_rate": 0.0001, + "loss": 0.9369, + "step": 2148 + }, + { + "epoch": 1.9100099988890125, + "grad_norm": 0.2854446470737457, + "learning_rate": 0.0001, + "loss": 0.8902, + "step": 2149 + }, + { + "epoch": 1.9108987890234417, + "grad_norm": 0.32251161336898804, + "learning_rate": 0.0001, + "loss": 0.8536, + "step": 2150 + }, + { + "epoch": 1.9117875791578713, + "grad_norm": 0.33492353558540344, + "learning_rate": 0.0001, + "loss": 0.8761, + "step": 2151 + }, + { + "epoch": 1.912676369292301, + "grad_norm": 0.3147629201412201, + "learning_rate": 0.0001, + "loss": 0.8802, + "step": 2152 + }, + { + "epoch": 1.9135651594267302, + "grad_norm": 0.32801344990730286, + "learning_rate": 0.0001, + "loss": 0.8123, + "step": 2153 + }, + { + "epoch": 1.9144539495611599, + "grad_norm": 0.2836940586566925, + "learning_rate": 0.0001, + "loss": 0.9012, + "step": 2154 + }, + { + "epoch": 1.9153427396955893, + "grad_norm": 0.3101726770401001, + "learning_rate": 0.0001, + "loss": 0.8274, + "step": 2155 + }, + { + "epoch": 1.9162315298300188, + "grad_norm": 0.3067977726459503, + "learning_rate": 0.0001, + "loss": 0.8986, + "step": 2156 + }, + { + "epoch": 1.9171203199644484, + "grad_norm": 0.27377212047576904, + "learning_rate": 0.0001, + "loss": 0.8844, + "step": 2157 + }, + { + "epoch": 1.9180091100988779, + "grad_norm": 0.36923712491989136, + "learning_rate": 0.0001, + "loss": 0.9264, + "step": 2158 + }, + { + "epoch": 1.9188979002333073, + "grad_norm": 0.3664684295654297, + "learning_rate": 0.0001, + "loss": 0.953, + "step": 2159 + }, + { + "epoch": 1.919786690367737, + "grad_norm": 0.3705950081348419, + "learning_rate": 0.0001, + "loss": 0.873, + "step": 2160 + }, + { + "epoch": 1.9206754805021664, + "grad_norm": 0.33334994316101074, + "learning_rate": 0.0001, + "loss": 0.8896, + "step": 2161 + }, + { + "epoch": 1.9215642706365959, + "grad_norm": 0.32042577862739563, + "learning_rate": 0.0001, + "loss": 0.832, + "step": 2162 + }, + { + "epoch": 1.9224530607710255, + "grad_norm": 0.2969491183757782, + "learning_rate": 0.0001, + "loss": 0.8963, + "step": 2163 + }, + { + "epoch": 1.923341850905455, + "grad_norm": 0.3007069528102875, + "learning_rate": 0.0001, + "loss": 0.8812, + "step": 2164 + }, + { + "epoch": 1.9242306410398844, + "grad_norm": 0.34228307008743286, + "learning_rate": 0.0001, + "loss": 0.9252, + "step": 2165 + }, + { + "epoch": 1.925119431174314, + "grad_norm": 0.34786513447761536, + "learning_rate": 0.0001, + "loss": 0.8791, + "step": 2166 + }, + { + "epoch": 1.9260082213087435, + "grad_norm": 0.30037039518356323, + "learning_rate": 0.0001, + "loss": 0.8397, + "step": 2167 + }, + { + "epoch": 1.926897011443173, + "grad_norm": 0.29265427589416504, + "learning_rate": 0.0001, + "loss": 0.8792, + "step": 2168 + }, + { + "epoch": 1.9277858015776026, + "grad_norm": 0.35532552003860474, + "learning_rate": 0.0001, + "loss": 0.9429, + "step": 2169 + }, + { + "epoch": 1.928674591712032, + "grad_norm": 0.3534153997898102, + "learning_rate": 0.0001, + "loss": 0.8774, + "step": 2170 + }, + { + "epoch": 1.9295633818464615, + "grad_norm": 0.29537761211395264, + "learning_rate": 0.0001, + "loss": 0.9146, + "step": 2171 + }, + { + "epoch": 1.9304521719808911, + "grad_norm": 0.34193140268325806, + "learning_rate": 0.0001, + "loss": 0.9506, + "step": 2172 + }, + { + "epoch": 1.9313409621153204, + "grad_norm": 0.33322104811668396, + "learning_rate": 0.0001, + "loss": 0.965, + "step": 2173 + }, + { + "epoch": 1.93222975224975, + "grad_norm": 0.34304308891296387, + "learning_rate": 0.0001, + "loss": 0.9704, + "step": 2174 + }, + { + "epoch": 1.9331185423841797, + "grad_norm": 0.3284062445163727, + "learning_rate": 0.0001, + "loss": 0.7752, + "step": 2175 + }, + { + "epoch": 1.934007332518609, + "grad_norm": 0.34800034761428833, + "learning_rate": 0.0001, + "loss": 0.9484, + "step": 2176 + }, + { + "epoch": 1.9348961226530386, + "grad_norm": 0.34290215373039246, + "learning_rate": 0.0001, + "loss": 0.9032, + "step": 2177 + }, + { + "epoch": 1.935784912787468, + "grad_norm": 0.34737300872802734, + "learning_rate": 0.0001, + "loss": 0.9158, + "step": 2178 + }, + { + "epoch": 1.9366737029218974, + "grad_norm": 0.34877312183380127, + "learning_rate": 0.0001, + "loss": 0.8934, + "step": 2179 + }, + { + "epoch": 1.937562493056327, + "grad_norm": 0.3398269712924957, + "learning_rate": 0.0001, + "loss": 0.9713, + "step": 2180 + }, + { + "epoch": 1.9384512831907565, + "grad_norm": 0.33079251646995544, + "learning_rate": 0.0001, + "loss": 1.01, + "step": 2181 + }, + { + "epoch": 1.939340073325186, + "grad_norm": 0.3067607581615448, + "learning_rate": 0.0001, + "loss": 0.8631, + "step": 2182 + }, + { + "epoch": 1.9402288634596156, + "grad_norm": 0.34091418981552124, + "learning_rate": 0.0001, + "loss": 0.9544, + "step": 2183 + }, + { + "epoch": 1.941117653594045, + "grad_norm": 0.30287304520606995, + "learning_rate": 0.0001, + "loss": 0.941, + "step": 2184 + }, + { + "epoch": 1.9420064437284745, + "grad_norm": 0.2976100742816925, + "learning_rate": 0.0001, + "loss": 0.9028, + "step": 2185 + }, + { + "epoch": 1.9428952338629042, + "grad_norm": 0.3236287236213684, + "learning_rate": 0.0001, + "loss": 0.9633, + "step": 2186 + }, + { + "epoch": 1.9437840239973336, + "grad_norm": 0.35301899909973145, + "learning_rate": 0.0001, + "loss": 0.9464, + "step": 2187 + }, + { + "epoch": 1.944672814131763, + "grad_norm": 0.35564854741096497, + "learning_rate": 0.0001, + "loss": 1.0202, + "step": 2188 + }, + { + "epoch": 1.9455616042661927, + "grad_norm": 0.31409159302711487, + "learning_rate": 0.0001, + "loss": 0.965, + "step": 2189 + }, + { + "epoch": 1.9464503944006222, + "grad_norm": 0.3617132306098938, + "learning_rate": 0.0001, + "loss": 0.9124, + "step": 2190 + }, + { + "epoch": 1.9473391845350516, + "grad_norm": 0.2927257716655731, + "learning_rate": 0.0001, + "loss": 0.9333, + "step": 2191 + }, + { + "epoch": 1.9482279746694813, + "grad_norm": 0.3173227906227112, + "learning_rate": 0.0001, + "loss": 0.8952, + "step": 2192 + }, + { + "epoch": 1.9491167648039107, + "grad_norm": 0.29953786730766296, + "learning_rate": 0.0001, + "loss": 0.8728, + "step": 2193 + }, + { + "epoch": 1.9500055549383402, + "grad_norm": 0.36522650718688965, + "learning_rate": 0.0001, + "loss": 0.8915, + "step": 2194 + }, + { + "epoch": 1.9508943450727698, + "grad_norm": 0.36083757877349854, + "learning_rate": 0.0001, + "loss": 0.9012, + "step": 2195 + }, + { + "epoch": 1.951783135207199, + "grad_norm": 0.3177006244659424, + "learning_rate": 0.0001, + "loss": 0.8705, + "step": 2196 + }, + { + "epoch": 1.9526719253416287, + "grad_norm": 0.33240652084350586, + "learning_rate": 0.0001, + "loss": 0.9201, + "step": 2197 + }, + { + "epoch": 1.9535607154760584, + "grad_norm": 0.33079612255096436, + "learning_rate": 0.0001, + "loss": 0.8434, + "step": 2198 + }, + { + "epoch": 1.9544495056104876, + "grad_norm": 0.3984503149986267, + "learning_rate": 0.0001, + "loss": 0.9963, + "step": 2199 + }, + { + "epoch": 1.9553382957449172, + "grad_norm": 0.33371084928512573, + "learning_rate": 0.0001, + "loss": 0.9055, + "step": 2200 + }, + { + "epoch": 1.956227085879347, + "grad_norm": 0.3662305772304535, + "learning_rate": 0.0001, + "loss": 0.8911, + "step": 2201 + }, + { + "epoch": 1.9571158760137761, + "grad_norm": 0.3294123411178589, + "learning_rate": 0.0001, + "loss": 0.9329, + "step": 2202 + }, + { + "epoch": 1.9580046661482058, + "grad_norm": 0.32030513882637024, + "learning_rate": 0.0001, + "loss": 0.9697, + "step": 2203 + }, + { + "epoch": 1.9588934562826352, + "grad_norm": 0.3227706551551819, + "learning_rate": 0.0001, + "loss": 0.8552, + "step": 2204 + }, + { + "epoch": 1.9597822464170647, + "grad_norm": 0.3211442828178406, + "learning_rate": 0.0001, + "loss": 0.8235, + "step": 2205 + }, + { + "epoch": 1.9606710365514943, + "grad_norm": 0.3925526440143585, + "learning_rate": 0.0001, + "loss": 0.8938, + "step": 2206 + }, + { + "epoch": 1.9615598266859238, + "grad_norm": 0.2847107946872711, + "learning_rate": 0.0001, + "loss": 0.8749, + "step": 2207 + }, + { + "epoch": 1.9624486168203532, + "grad_norm": 0.35088127851486206, + "learning_rate": 0.0001, + "loss": 0.9328, + "step": 2208 + }, + { + "epoch": 1.9633374069547829, + "grad_norm": 0.3825710713863373, + "learning_rate": 0.0001, + "loss": 0.8982, + "step": 2209 + }, + { + "epoch": 1.9642261970892123, + "grad_norm": 0.3291476368904114, + "learning_rate": 0.0001, + "loss": 0.8782, + "step": 2210 + }, + { + "epoch": 1.9651149872236418, + "grad_norm": 0.3212360441684723, + "learning_rate": 0.0001, + "loss": 0.9278, + "step": 2211 + }, + { + "epoch": 1.9660037773580714, + "grad_norm": 0.3502598702907562, + "learning_rate": 0.0001, + "loss": 0.8873, + "step": 2212 + }, + { + "epoch": 1.9668925674925009, + "grad_norm": 0.3515768051147461, + "learning_rate": 0.0001, + "loss": 0.8633, + "step": 2213 + }, + { + "epoch": 1.9677813576269303, + "grad_norm": 0.340701162815094, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 2214 + }, + { + "epoch": 1.96867014776136, + "grad_norm": 0.3128719925880432, + "learning_rate": 0.0001, + "loss": 0.8933, + "step": 2215 + }, + { + "epoch": 1.9695589378957894, + "grad_norm": 0.34365659952163696, + "learning_rate": 0.0001, + "loss": 0.9546, + "step": 2216 + }, + { + "epoch": 1.9704477280302188, + "grad_norm": 0.31693172454833984, + "learning_rate": 0.0001, + "loss": 0.8905, + "step": 2217 + }, + { + "epoch": 1.9713365181646485, + "grad_norm": 0.36978334188461304, + "learning_rate": 0.0001, + "loss": 0.8712, + "step": 2218 + }, + { + "epoch": 1.9722253082990777, + "grad_norm": 0.31684744358062744, + "learning_rate": 0.0001, + "loss": 0.9069, + "step": 2219 + }, + { + "epoch": 1.9731140984335074, + "grad_norm": 0.3603816330432892, + "learning_rate": 0.0001, + "loss": 0.9562, + "step": 2220 + }, + { + "epoch": 1.974002888567937, + "grad_norm": 0.3792661428451538, + "learning_rate": 0.0001, + "loss": 0.9382, + "step": 2221 + }, + { + "epoch": 1.9748916787023663, + "grad_norm": 0.4025600254535675, + "learning_rate": 0.0001, + "loss": 0.9571, + "step": 2222 + }, + { + "epoch": 1.975780468836796, + "grad_norm": 0.3250136077404022, + "learning_rate": 0.0001, + "loss": 0.849, + "step": 2223 + }, + { + "epoch": 1.9766692589712256, + "grad_norm": 0.3059399425983429, + "learning_rate": 0.0001, + "loss": 0.9364, + "step": 2224 + }, + { + "epoch": 1.9775580491056548, + "grad_norm": 0.2995017468929291, + "learning_rate": 0.0001, + "loss": 0.8323, + "step": 2225 + }, + { + "epoch": 1.9784468392400845, + "grad_norm": 0.33151498436927795, + "learning_rate": 0.0001, + "loss": 0.9296, + "step": 2226 + }, + { + "epoch": 1.979335629374514, + "grad_norm": 0.36364221572875977, + "learning_rate": 0.0001, + "loss": 0.9274, + "step": 2227 + }, + { + "epoch": 1.9802244195089433, + "grad_norm": 0.3598885238170624, + "learning_rate": 0.0001, + "loss": 0.9747, + "step": 2228 + }, + { + "epoch": 1.981113209643373, + "grad_norm": 0.3541521430015564, + "learning_rate": 0.0001, + "loss": 0.9024, + "step": 2229 + }, + { + "epoch": 1.9820019997778024, + "grad_norm": 0.312953382730484, + "learning_rate": 0.0001, + "loss": 0.8793, + "step": 2230 + }, + { + "epoch": 1.9828907899122319, + "grad_norm": 0.3080136477947235, + "learning_rate": 0.0001, + "loss": 0.8732, + "step": 2231 + }, + { + "epoch": 1.9837795800466616, + "grad_norm": 0.32917580008506775, + "learning_rate": 0.0001, + "loss": 0.8911, + "step": 2232 + }, + { + "epoch": 1.984668370181091, + "grad_norm": 0.34910324215888977, + "learning_rate": 0.0001, + "loss": 0.9626, + "step": 2233 + }, + { + "epoch": 1.9855571603155204, + "grad_norm": 0.38250458240509033, + "learning_rate": 0.0001, + "loss": 0.9059, + "step": 2234 + }, + { + "epoch": 1.98644595044995, + "grad_norm": 0.34895434975624084, + "learning_rate": 0.0001, + "loss": 0.9963, + "step": 2235 + }, + { + "epoch": 1.9873347405843795, + "grad_norm": 0.3334400951862335, + "learning_rate": 0.0001, + "loss": 0.8768, + "step": 2236 + }, + { + "epoch": 1.988223530718809, + "grad_norm": 0.33048558235168457, + "learning_rate": 0.0001, + "loss": 0.9229, + "step": 2237 + }, + { + "epoch": 1.9891123208532386, + "grad_norm": 0.3428661823272705, + "learning_rate": 0.0001, + "loss": 0.9175, + "step": 2238 + }, + { + "epoch": 1.990001110987668, + "grad_norm": 0.3587013781070709, + "learning_rate": 0.0001, + "loss": 0.9277, + "step": 2239 + }, + { + "epoch": 1.9908899011220975, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0001, + "loss": 0.8732, + "step": 2240 + }, + { + "epoch": 1.9917786912565272, + "grad_norm": 0.3348812162876129, + "learning_rate": 0.0001, + "loss": 0.903, + "step": 2241 + }, + { + "epoch": 1.9926674813909564, + "grad_norm": 0.3357104957103729, + "learning_rate": 0.0001, + "loss": 0.898, + "step": 2242 + }, + { + "epoch": 1.993556271525386, + "grad_norm": 0.346883088350296, + "learning_rate": 0.0001, + "loss": 0.9087, + "step": 2243 + }, + { + "epoch": 1.9944450616598157, + "grad_norm": 0.30482059717178345, + "learning_rate": 0.0001, + "loss": 0.9312, + "step": 2244 + }, + { + "epoch": 1.995333851794245, + "grad_norm": 0.34484198689460754, + "learning_rate": 0.0001, + "loss": 0.8785, + "step": 2245 + }, + { + "epoch": 1.9962226419286746, + "grad_norm": 0.2776843309402466, + "learning_rate": 0.0001, + "loss": 0.8985, + "step": 2246 + }, + { + "epoch": 1.9971114320631043, + "grad_norm": 0.33060091733932495, + "learning_rate": 0.0001, + "loss": 0.9854, + "step": 2247 + }, + { + "epoch": 1.9980002221975335, + "grad_norm": 0.31561362743377686, + "learning_rate": 0.0001, + "loss": 0.8927, + "step": 2248 + }, + { + "epoch": 1.9988890123319631, + "grad_norm": 0.29034098982810974, + "learning_rate": 0.0001, + "loss": 0.868, + "step": 2249 + }, + { + "epoch": 1.9997778024663926, + "grad_norm": 0.3135145604610443, + "learning_rate": 0.0001, + "loss": 0.7991, + "step": 2250 + }, + { + "epoch": 2.000666592600822, + "grad_norm": 0.36708906292915344, + "learning_rate": 0.0001, + "loss": 0.8307, + "step": 2251 + }, + { + "epoch": 2.0015553827352517, + "grad_norm": 0.3554810881614685, + "learning_rate": 0.0001, + "loss": 0.9073, + "step": 2252 + }, + { + "epoch": 2.0024441728696813, + "grad_norm": 0.3140956461429596, + "learning_rate": 0.0001, + "loss": 0.8779, + "step": 2253 + }, + { + "epoch": 2.0033329630041106, + "grad_norm": 0.3324379622936249, + "learning_rate": 0.0001, + "loss": 0.8612, + "step": 2254 + }, + { + "epoch": 2.0042217531385402, + "grad_norm": 0.3427555561065674, + "learning_rate": 0.0001, + "loss": 0.8843, + "step": 2255 + }, + { + "epoch": 2.00511054327297, + "grad_norm": 0.298017293214798, + "learning_rate": 0.0001, + "loss": 0.8393, + "step": 2256 + }, + { + "epoch": 2.005999333407399, + "grad_norm": 0.3473068177700043, + "learning_rate": 0.0001, + "loss": 0.9666, + "step": 2257 + }, + { + "epoch": 2.0068881235418288, + "grad_norm": 0.32553350925445557, + "learning_rate": 0.0001, + "loss": 0.9294, + "step": 2258 + }, + { + "epoch": 2.007776913676258, + "grad_norm": 0.29851260781288147, + "learning_rate": 0.0001, + "loss": 0.9186, + "step": 2259 + }, + { + "epoch": 2.0086657038106877, + "grad_norm": 0.3403533101081848, + "learning_rate": 0.0001, + "loss": 0.8777, + "step": 2260 + }, + { + "epoch": 2.0095544939451173, + "grad_norm": 0.31564658880233765, + "learning_rate": 0.0001, + "loss": 0.8203, + "step": 2261 + }, + { + "epoch": 2.0104432840795465, + "grad_norm": 0.34301063418388367, + "learning_rate": 0.0001, + "loss": 0.8448, + "step": 2262 + }, + { + "epoch": 2.011332074213976, + "grad_norm": 0.34970736503601074, + "learning_rate": 0.0001, + "loss": 0.8925, + "step": 2263 + }, + { + "epoch": 2.012220864348406, + "grad_norm": 0.3418786823749542, + "learning_rate": 0.0001, + "loss": 0.9322, + "step": 2264 + }, + { + "epoch": 2.013109654482835, + "grad_norm": 0.3406841456890106, + "learning_rate": 0.0001, + "loss": 0.8289, + "step": 2265 + }, + { + "epoch": 2.0139984446172647, + "grad_norm": 0.35538503527641296, + "learning_rate": 0.0001, + "loss": 0.9544, + "step": 2266 + }, + { + "epoch": 2.0148872347516944, + "grad_norm": 0.35400450229644775, + "learning_rate": 0.0001, + "loss": 0.9106, + "step": 2267 + }, + { + "epoch": 2.0157760248861236, + "grad_norm": 0.3589840829372406, + "learning_rate": 0.0001, + "loss": 0.9033, + "step": 2268 + }, + { + "epoch": 2.0166648150205533, + "grad_norm": 0.3830343782901764, + "learning_rate": 0.0001, + "loss": 0.9254, + "step": 2269 + }, + { + "epoch": 2.017553605154983, + "grad_norm": 0.3211663067340851, + "learning_rate": 0.0001, + "loss": 0.8952, + "step": 2270 + }, + { + "epoch": 2.018442395289412, + "grad_norm": 0.37262219190597534, + "learning_rate": 0.0001, + "loss": 0.9151, + "step": 2271 + }, + { + "epoch": 2.019331185423842, + "grad_norm": 0.34682542085647583, + "learning_rate": 0.0001, + "loss": 0.8174, + "step": 2272 + }, + { + "epoch": 2.0202199755582715, + "grad_norm": 0.3054455816745758, + "learning_rate": 0.0001, + "loss": 0.8983, + "step": 2273 + }, + { + "epoch": 2.0211087656927007, + "grad_norm": 0.2940855622291565, + "learning_rate": 0.0001, + "loss": 0.8573, + "step": 2274 + }, + { + "epoch": 2.0219975558271304, + "grad_norm": 0.37542182207107544, + "learning_rate": 0.0001, + "loss": 0.9686, + "step": 2275 + }, + { + "epoch": 2.02288634596156, + "grad_norm": 0.29696211218833923, + "learning_rate": 0.0001, + "loss": 0.8948, + "step": 2276 + }, + { + "epoch": 2.0237751360959892, + "grad_norm": 0.35127562284469604, + "learning_rate": 0.0001, + "loss": 0.9528, + "step": 2277 + }, + { + "epoch": 2.024663926230419, + "grad_norm": 0.3054238259792328, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 2278 + }, + { + "epoch": 2.0255527163648486, + "grad_norm": 0.33190521597862244, + "learning_rate": 0.0001, + "loss": 0.8616, + "step": 2279 + }, + { + "epoch": 2.026441506499278, + "grad_norm": 0.3622536361217499, + "learning_rate": 0.0001, + "loss": 0.9324, + "step": 2280 + }, + { + "epoch": 2.0273302966337075, + "grad_norm": 0.32910165190696716, + "learning_rate": 0.0001, + "loss": 0.8391, + "step": 2281 + }, + { + "epoch": 2.0282190867681367, + "grad_norm": 0.31704333424568176, + "learning_rate": 0.0001, + "loss": 0.8393, + "step": 2282 + }, + { + "epoch": 2.0291078769025663, + "grad_norm": 0.39209648966789246, + "learning_rate": 0.0001, + "loss": 0.8637, + "step": 2283 + }, + { + "epoch": 2.029996667036996, + "grad_norm": 0.3536166846752167, + "learning_rate": 0.0001, + "loss": 0.8957, + "step": 2284 + }, + { + "epoch": 2.030885457171425, + "grad_norm": 0.33245381712913513, + "learning_rate": 0.0001, + "loss": 0.8915, + "step": 2285 + }, + { + "epoch": 2.031774247305855, + "grad_norm": 0.3466053307056427, + "learning_rate": 0.0001, + "loss": 0.8671, + "step": 2286 + }, + { + "epoch": 2.0326630374402845, + "grad_norm": 0.32629281282424927, + "learning_rate": 0.0001, + "loss": 0.93, + "step": 2287 + }, + { + "epoch": 2.0335518275747138, + "grad_norm": 0.321918785572052, + "learning_rate": 0.0001, + "loss": 0.8497, + "step": 2288 + }, + { + "epoch": 2.0344406177091434, + "grad_norm": 0.3338482081890106, + "learning_rate": 0.0001, + "loss": 0.8984, + "step": 2289 + }, + { + "epoch": 2.035329407843573, + "grad_norm": 0.32754746079444885, + "learning_rate": 0.0001, + "loss": 0.8787, + "step": 2290 + }, + { + "epoch": 2.0362181979780023, + "grad_norm": 0.34894460439682007, + "learning_rate": 0.0001, + "loss": 0.8761, + "step": 2291 + }, + { + "epoch": 2.037106988112432, + "grad_norm": 0.34283894300460815, + "learning_rate": 0.0001, + "loss": 0.9411, + "step": 2292 + }, + { + "epoch": 2.0379957782468616, + "grad_norm": 0.3812194764614105, + "learning_rate": 0.0001, + "loss": 0.9111, + "step": 2293 + }, + { + "epoch": 2.038884568381291, + "grad_norm": 0.323310911655426, + "learning_rate": 0.0001, + "loss": 0.8984, + "step": 2294 + }, + { + "epoch": 2.0397733585157205, + "grad_norm": 0.376645565032959, + "learning_rate": 0.0001, + "loss": 0.8604, + "step": 2295 + }, + { + "epoch": 2.04066214865015, + "grad_norm": 0.3399569094181061, + "learning_rate": 0.0001, + "loss": 0.8495, + "step": 2296 + }, + { + "epoch": 2.0415509387845794, + "grad_norm": 0.3920535147190094, + "learning_rate": 0.0001, + "loss": 0.8972, + "step": 2297 + }, + { + "epoch": 2.042439728919009, + "grad_norm": 0.3221544623374939, + "learning_rate": 0.0001, + "loss": 0.9215, + "step": 2298 + }, + { + "epoch": 2.0433285190534387, + "grad_norm": 0.32029077410697937, + "learning_rate": 0.0001, + "loss": 0.8555, + "step": 2299 + }, + { + "epoch": 2.044217309187868, + "grad_norm": 0.3429443836212158, + "learning_rate": 0.0001, + "loss": 0.8572, + "step": 2300 + }, + { + "epoch": 2.0451060993222976, + "grad_norm": 0.32114291191101074, + "learning_rate": 0.0001, + "loss": 0.8466, + "step": 2301 + }, + { + "epoch": 2.0459948894567273, + "grad_norm": 0.35281145572662354, + "learning_rate": 0.0001, + "loss": 0.9114, + "step": 2302 + }, + { + "epoch": 2.0468836795911565, + "grad_norm": 0.32281407713890076, + "learning_rate": 0.0001, + "loss": 0.9293, + "step": 2303 + }, + { + "epoch": 2.047772469725586, + "grad_norm": 0.3905143439769745, + "learning_rate": 0.0001, + "loss": 0.9392, + "step": 2304 + }, + { + "epoch": 2.0486612598600153, + "grad_norm": 0.34570732712745667, + "learning_rate": 0.0001, + "loss": 0.8764, + "step": 2305 + }, + { + "epoch": 2.049550049994445, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0001, + "loss": 0.9282, + "step": 2306 + }, + { + "epoch": 2.0504388401288747, + "grad_norm": 0.3642049729824066, + "learning_rate": 0.0001, + "loss": 0.8468, + "step": 2307 + }, + { + "epoch": 2.051327630263304, + "grad_norm": 0.3626710772514343, + "learning_rate": 0.0001, + "loss": 0.8742, + "step": 2308 + }, + { + "epoch": 2.0522164203977336, + "grad_norm": 0.3760134279727936, + "learning_rate": 0.0001, + "loss": 0.8199, + "step": 2309 + }, + { + "epoch": 2.053105210532163, + "grad_norm": 0.3483445346355438, + "learning_rate": 0.0001, + "loss": 0.9071, + "step": 2310 + }, + { + "epoch": 2.0539940006665924, + "grad_norm": 0.3375813663005829, + "learning_rate": 0.0001, + "loss": 0.8971, + "step": 2311 + }, + { + "epoch": 2.054882790801022, + "grad_norm": 0.3339730501174927, + "learning_rate": 0.0001, + "loss": 0.8204, + "step": 2312 + }, + { + "epoch": 2.0557715809354518, + "grad_norm": 0.36098751425743103, + "learning_rate": 0.0001, + "loss": 0.8845, + "step": 2313 + }, + { + "epoch": 2.056660371069881, + "grad_norm": 0.34348028898239136, + "learning_rate": 0.0001, + "loss": 0.8818, + "step": 2314 + }, + { + "epoch": 2.0575491612043106, + "grad_norm": 0.36092281341552734, + "learning_rate": 0.0001, + "loss": 0.9322, + "step": 2315 + }, + { + "epoch": 2.0584379513387403, + "grad_norm": 0.4044939875602722, + "learning_rate": 0.0001, + "loss": 0.868, + "step": 2316 + }, + { + "epoch": 2.0593267414731695, + "grad_norm": 0.3279127776622772, + "learning_rate": 0.0001, + "loss": 0.886, + "step": 2317 + }, + { + "epoch": 2.060215531607599, + "grad_norm": 0.3734513819217682, + "learning_rate": 0.0001, + "loss": 0.8934, + "step": 2318 + }, + { + "epoch": 2.061104321742029, + "grad_norm": 0.3154412508010864, + "learning_rate": 0.0001, + "loss": 0.8556, + "step": 2319 + }, + { + "epoch": 2.061993111876458, + "grad_norm": 0.35183727741241455, + "learning_rate": 0.0001, + "loss": 0.9645, + "step": 2320 + }, + { + "epoch": 2.0628819020108877, + "grad_norm": 0.35789453983306885, + "learning_rate": 0.0001, + "loss": 0.8836, + "step": 2321 + }, + { + "epoch": 2.0637706921453174, + "grad_norm": 0.3455105423927307, + "learning_rate": 0.0001, + "loss": 0.9554, + "step": 2322 + }, + { + "epoch": 2.0646594822797466, + "grad_norm": 0.35150641202926636, + "learning_rate": 0.0001, + "loss": 0.8408, + "step": 2323 + }, + { + "epoch": 2.0655482724141763, + "grad_norm": 0.3445546627044678, + "learning_rate": 0.0001, + "loss": 0.9087, + "step": 2324 + }, + { + "epoch": 2.066437062548606, + "grad_norm": 0.3782254755496979, + "learning_rate": 0.0001, + "loss": 0.9708, + "step": 2325 + }, + { + "epoch": 2.067325852683035, + "grad_norm": 0.30935153365135193, + "learning_rate": 0.0001, + "loss": 0.8883, + "step": 2326 + }, + { + "epoch": 2.068214642817465, + "grad_norm": 0.34302398562431335, + "learning_rate": 0.0001, + "loss": 0.9016, + "step": 2327 + }, + { + "epoch": 2.069103432951894, + "grad_norm": 0.38530564308166504, + "learning_rate": 0.0001, + "loss": 0.8869, + "step": 2328 + }, + { + "epoch": 2.0699922230863237, + "grad_norm": 0.34200990200042725, + "learning_rate": 0.0001, + "loss": 0.8758, + "step": 2329 + }, + { + "epoch": 2.0708810132207534, + "grad_norm": 0.36103734374046326, + "learning_rate": 0.0001, + "loss": 0.9055, + "step": 2330 + }, + { + "epoch": 2.0717698033551826, + "grad_norm": 0.36420273780822754, + "learning_rate": 0.0001, + "loss": 0.9257, + "step": 2331 + }, + { + "epoch": 2.0726585934896122, + "grad_norm": 0.3854941725730896, + "learning_rate": 0.0001, + "loss": 0.9891, + "step": 2332 + }, + { + "epoch": 2.073547383624042, + "grad_norm": 0.33720192313194275, + "learning_rate": 0.0001, + "loss": 0.8197, + "step": 2333 + }, + { + "epoch": 2.074436173758471, + "grad_norm": 0.3435489237308502, + "learning_rate": 0.0001, + "loss": 0.8943, + "step": 2334 + }, + { + "epoch": 2.0753249638929008, + "grad_norm": 0.33030179142951965, + "learning_rate": 0.0001, + "loss": 0.8799, + "step": 2335 + }, + { + "epoch": 2.0762137540273304, + "grad_norm": 0.48584482073783875, + "learning_rate": 0.0001, + "loss": 0.8402, + "step": 2336 + }, + { + "epoch": 2.0771025441617597, + "grad_norm": 0.39869070053100586, + "learning_rate": 0.0001, + "loss": 0.8668, + "step": 2337 + }, + { + "epoch": 2.0779913342961893, + "grad_norm": 0.3875821530818939, + "learning_rate": 0.0001, + "loss": 0.9544, + "step": 2338 + }, + { + "epoch": 2.078880124430619, + "grad_norm": 0.3594411611557007, + "learning_rate": 0.0001, + "loss": 0.9415, + "step": 2339 + }, + { + "epoch": 2.079768914565048, + "grad_norm": 0.34104394912719727, + "learning_rate": 0.0001, + "loss": 0.8698, + "step": 2340 + }, + { + "epoch": 2.080657704699478, + "grad_norm": 0.3437696695327759, + "learning_rate": 0.0001, + "loss": 0.8795, + "step": 2341 + }, + { + "epoch": 2.0815464948339075, + "grad_norm": 0.3021574318408966, + "learning_rate": 0.0001, + "loss": 0.8896, + "step": 2342 + }, + { + "epoch": 2.0824352849683367, + "grad_norm": 0.306111216545105, + "learning_rate": 0.0001, + "loss": 0.8579, + "step": 2343 + }, + { + "epoch": 2.0833240751027664, + "grad_norm": 0.3199199140071869, + "learning_rate": 0.0001, + "loss": 0.8107, + "step": 2344 + }, + { + "epoch": 2.084212865237196, + "grad_norm": 0.34308746457099915, + "learning_rate": 0.0001, + "loss": 0.9077, + "step": 2345 + }, + { + "epoch": 2.0851016553716253, + "grad_norm": 0.366335928440094, + "learning_rate": 0.0001, + "loss": 0.9323, + "step": 2346 + }, + { + "epoch": 2.085990445506055, + "grad_norm": 0.3313388228416443, + "learning_rate": 0.0001, + "loss": 0.8682, + "step": 2347 + }, + { + "epoch": 2.0868792356404846, + "grad_norm": 0.31360548734664917, + "learning_rate": 0.0001, + "loss": 0.8854, + "step": 2348 + }, + { + "epoch": 2.087768025774914, + "grad_norm": 0.3268912434577942, + "learning_rate": 0.0001, + "loss": 0.867, + "step": 2349 + }, + { + "epoch": 2.0886568159093435, + "grad_norm": 0.33978188037872314, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 2350 + }, + { + "epoch": 2.0895456060437727, + "grad_norm": 0.3470538258552551, + "learning_rate": 0.0001, + "loss": 0.9443, + "step": 2351 + }, + { + "epoch": 2.0904343961782024, + "grad_norm": 0.35913875699043274, + "learning_rate": 0.0001, + "loss": 0.943, + "step": 2352 + }, + { + "epoch": 2.091323186312632, + "grad_norm": 0.37776345014572144, + "learning_rate": 0.0001, + "loss": 0.8927, + "step": 2353 + }, + { + "epoch": 2.0922119764470613, + "grad_norm": 0.39539778232574463, + "learning_rate": 0.0001, + "loss": 0.8981, + "step": 2354 + }, + { + "epoch": 2.093100766581491, + "grad_norm": 0.352658748626709, + "learning_rate": 0.0001, + "loss": 0.8996, + "step": 2355 + }, + { + "epoch": 2.0939895567159206, + "grad_norm": 0.31923985481262207, + "learning_rate": 0.0001, + "loss": 0.9008, + "step": 2356 + }, + { + "epoch": 2.09487834685035, + "grad_norm": 0.36446240544319153, + "learning_rate": 0.0001, + "loss": 0.9393, + "step": 2357 + }, + { + "epoch": 2.0957671369847795, + "grad_norm": 0.3254162073135376, + "learning_rate": 0.0001, + "loss": 0.7878, + "step": 2358 + }, + { + "epoch": 2.096655927119209, + "grad_norm": 0.3216734826564789, + "learning_rate": 0.0001, + "loss": 0.8606, + "step": 2359 + }, + { + "epoch": 2.0975447172536383, + "grad_norm": 0.3242090344429016, + "learning_rate": 0.0001, + "loss": 0.8695, + "step": 2360 + }, + { + "epoch": 2.098433507388068, + "grad_norm": 0.29055359959602356, + "learning_rate": 0.0001, + "loss": 0.8997, + "step": 2361 + }, + { + "epoch": 2.0993222975224977, + "grad_norm": 0.3655925691127777, + "learning_rate": 0.0001, + "loss": 0.9162, + "step": 2362 + }, + { + "epoch": 2.100211087656927, + "grad_norm": 0.3490038514137268, + "learning_rate": 0.0001, + "loss": 0.9202, + "step": 2363 + }, + { + "epoch": 2.1010998777913565, + "grad_norm": 0.3455352485179901, + "learning_rate": 0.0001, + "loss": 0.8759, + "step": 2364 + }, + { + "epoch": 2.101988667925786, + "grad_norm": 0.2887704074382782, + "learning_rate": 0.0001, + "loss": 0.8869, + "step": 2365 + }, + { + "epoch": 2.1028774580602154, + "grad_norm": 0.32117751240730286, + "learning_rate": 0.0001, + "loss": 0.8057, + "step": 2366 + }, + { + "epoch": 2.103766248194645, + "grad_norm": 0.33989858627319336, + "learning_rate": 0.0001, + "loss": 1.0061, + "step": 2367 + }, + { + "epoch": 2.1046550383290747, + "grad_norm": 0.3154880106449127, + "learning_rate": 0.0001, + "loss": 0.8518, + "step": 2368 + }, + { + "epoch": 2.105543828463504, + "grad_norm": 0.2925361692905426, + "learning_rate": 0.0001, + "loss": 0.8935, + "step": 2369 + }, + { + "epoch": 2.1064326185979336, + "grad_norm": 0.3639398217201233, + "learning_rate": 0.0001, + "loss": 0.8115, + "step": 2370 + }, + { + "epoch": 2.1073214087323633, + "grad_norm": 0.3522791266441345, + "learning_rate": 0.0001, + "loss": 0.8857, + "step": 2371 + }, + { + "epoch": 2.1082101988667925, + "grad_norm": 0.3359151780605316, + "learning_rate": 0.0001, + "loss": 0.8931, + "step": 2372 + }, + { + "epoch": 2.109098989001222, + "grad_norm": 0.33028051257133484, + "learning_rate": 0.0001, + "loss": 0.8142, + "step": 2373 + }, + { + "epoch": 2.1099877791356514, + "grad_norm": 0.3378293514251709, + "learning_rate": 0.0001, + "loss": 0.9244, + "step": 2374 + }, + { + "epoch": 2.110876569270081, + "grad_norm": 0.36423346400260925, + "learning_rate": 0.0001, + "loss": 0.8713, + "step": 2375 + }, + { + "epoch": 2.1117653594045107, + "grad_norm": 0.32155025005340576, + "learning_rate": 0.0001, + "loss": 0.8168, + "step": 2376 + }, + { + "epoch": 2.11265414953894, + "grad_norm": 0.627085268497467, + "learning_rate": 0.0001, + "loss": 0.8542, + "step": 2377 + }, + { + "epoch": 2.1135429396733696, + "grad_norm": 0.32278314232826233, + "learning_rate": 0.0001, + "loss": 0.8938, + "step": 2378 + }, + { + "epoch": 2.1144317298077993, + "grad_norm": 0.3442274332046509, + "learning_rate": 0.0001, + "loss": 0.9269, + "step": 2379 + }, + { + "epoch": 2.1153205199422285, + "grad_norm": 0.3448584973812103, + "learning_rate": 0.0001, + "loss": 0.8243, + "step": 2380 + }, + { + "epoch": 2.116209310076658, + "grad_norm": 0.3396780490875244, + "learning_rate": 0.0001, + "loss": 0.8122, + "step": 2381 + }, + { + "epoch": 2.117098100211088, + "grad_norm": 0.3797731101512909, + "learning_rate": 0.0001, + "loss": 0.9614, + "step": 2382 + }, + { + "epoch": 2.117986890345517, + "grad_norm": 0.41133296489715576, + "learning_rate": 0.0001, + "loss": 0.9597, + "step": 2383 + }, + { + "epoch": 2.1188756804799467, + "grad_norm": 0.4031218886375427, + "learning_rate": 0.0001, + "loss": 0.9289, + "step": 2384 + }, + { + "epoch": 2.1197644706143763, + "grad_norm": 0.3527853488922119, + "learning_rate": 0.0001, + "loss": 0.9029, + "step": 2385 + }, + { + "epoch": 2.1206532607488056, + "grad_norm": 0.3235529363155365, + "learning_rate": 0.0001, + "loss": 0.9195, + "step": 2386 + }, + { + "epoch": 2.121542050883235, + "grad_norm": 0.31495201587677, + "learning_rate": 0.0001, + "loss": 0.8524, + "step": 2387 + }, + { + "epoch": 2.122430841017665, + "grad_norm": 0.378337562084198, + "learning_rate": 0.0001, + "loss": 0.8896, + "step": 2388 + }, + { + "epoch": 2.123319631152094, + "grad_norm": 0.34486186504364014, + "learning_rate": 0.0001, + "loss": 0.9178, + "step": 2389 + }, + { + "epoch": 2.1242084212865238, + "grad_norm": 0.36222806572914124, + "learning_rate": 0.0001, + "loss": 0.8327, + "step": 2390 + }, + { + "epoch": 2.1250972114209534, + "grad_norm": 0.3327309191226959, + "learning_rate": 0.0001, + "loss": 0.8055, + "step": 2391 + }, + { + "epoch": 2.1259860015553826, + "grad_norm": 0.3839544951915741, + "learning_rate": 0.0001, + "loss": 0.9453, + "step": 2392 + }, + { + "epoch": 2.1268747916898123, + "grad_norm": 0.33836036920547485, + "learning_rate": 0.0001, + "loss": 0.8824, + "step": 2393 + }, + { + "epoch": 2.127763581824242, + "grad_norm": 0.3374810218811035, + "learning_rate": 0.0001, + "loss": 0.8261, + "step": 2394 + }, + { + "epoch": 2.128652371958671, + "grad_norm": 0.3791443109512329, + "learning_rate": 0.0001, + "loss": 0.9758, + "step": 2395 + }, + { + "epoch": 2.129541162093101, + "grad_norm": 0.3412923216819763, + "learning_rate": 0.0001, + "loss": 0.8655, + "step": 2396 + }, + { + "epoch": 2.13042995222753, + "grad_norm": 0.3597002625465393, + "learning_rate": 0.0001, + "loss": 0.8728, + "step": 2397 + }, + { + "epoch": 2.1313187423619597, + "grad_norm": 0.35584625601768494, + "learning_rate": 0.0001, + "loss": 0.8583, + "step": 2398 + }, + { + "epoch": 2.1322075324963894, + "grad_norm": 0.3240017890930176, + "learning_rate": 0.0001, + "loss": 0.9065, + "step": 2399 + }, + { + "epoch": 2.1330963226308186, + "grad_norm": 0.31349146366119385, + "learning_rate": 0.0001, + "loss": 0.8752, + "step": 2400 + }, + { + "epoch": 2.1339851127652483, + "grad_norm": 0.3268539011478424, + "learning_rate": 0.0001, + "loss": 0.867, + "step": 2401 + }, + { + "epoch": 2.134873902899678, + "grad_norm": 1.1880651712417603, + "learning_rate": 0.0001, + "loss": 0.9108, + "step": 2402 + }, + { + "epoch": 2.135762693034107, + "grad_norm": 0.32068613171577454, + "learning_rate": 0.0001, + "loss": 0.865, + "step": 2403 + }, + { + "epoch": 2.136651483168537, + "grad_norm": 0.3671189248561859, + "learning_rate": 0.0001, + "loss": 0.922, + "step": 2404 + }, + { + "epoch": 2.1375402733029665, + "grad_norm": 0.3628632724285126, + "learning_rate": 0.0001, + "loss": 0.8714, + "step": 2405 + }, + { + "epoch": 2.1384290634373957, + "grad_norm": 0.38792744278907776, + "learning_rate": 0.0001, + "loss": 0.9345, + "step": 2406 + }, + { + "epoch": 2.1393178535718254, + "grad_norm": 0.38119903206825256, + "learning_rate": 0.0001, + "loss": 0.8692, + "step": 2407 + }, + { + "epoch": 2.140206643706255, + "grad_norm": 0.431945264339447, + "learning_rate": 0.0001, + "loss": 0.9405, + "step": 2408 + }, + { + "epoch": 2.1410954338406842, + "grad_norm": 0.3509438633918762, + "learning_rate": 0.0001, + "loss": 0.9128, + "step": 2409 + }, + { + "epoch": 2.141984223975114, + "grad_norm": 0.35793623328208923, + "learning_rate": 0.0001, + "loss": 0.8824, + "step": 2410 + }, + { + "epoch": 2.1428730141095436, + "grad_norm": 0.6159213185310364, + "learning_rate": 0.0001, + "loss": 0.8774, + "step": 2411 + }, + { + "epoch": 2.143761804243973, + "grad_norm": 0.36963677406311035, + "learning_rate": 0.0001, + "loss": 0.8521, + "step": 2412 + }, + { + "epoch": 2.1446505943784024, + "grad_norm": 0.36160334944725037, + "learning_rate": 0.0001, + "loss": 0.8255, + "step": 2413 + }, + { + "epoch": 2.145539384512832, + "grad_norm": 0.35096341371536255, + "learning_rate": 0.0001, + "loss": 0.955, + "step": 2414 + }, + { + "epoch": 2.1464281746472613, + "grad_norm": 0.5263632535934448, + "learning_rate": 0.0001, + "loss": 0.9105, + "step": 2415 + }, + { + "epoch": 2.147316964781691, + "grad_norm": 0.37115469574928284, + "learning_rate": 0.0001, + "loss": 0.8369, + "step": 2416 + }, + { + "epoch": 2.1482057549161206, + "grad_norm": 0.3329596519470215, + "learning_rate": 0.0001, + "loss": 0.8283, + "step": 2417 + }, + { + "epoch": 2.14909454505055, + "grad_norm": 0.5003162026405334, + "learning_rate": 0.0001, + "loss": 0.9056, + "step": 2418 + }, + { + "epoch": 2.1499833351849795, + "grad_norm": 0.3137914538383484, + "learning_rate": 0.0001, + "loss": 0.8761, + "step": 2419 + }, + { + "epoch": 2.1508721253194087, + "grad_norm": 0.35285452008247375, + "learning_rate": 0.0001, + "loss": 0.8635, + "step": 2420 + }, + { + "epoch": 2.1517609154538384, + "grad_norm": 0.35029417276382446, + "learning_rate": 0.0001, + "loss": 0.8181, + "step": 2421 + }, + { + "epoch": 2.152649705588268, + "grad_norm": 0.36478570103645325, + "learning_rate": 0.0001, + "loss": 0.8536, + "step": 2422 + }, + { + "epoch": 2.1535384957226973, + "grad_norm": 0.43607574701309204, + "learning_rate": 0.0001, + "loss": 0.8965, + "step": 2423 + }, + { + "epoch": 2.154427285857127, + "grad_norm": 0.36862248182296753, + "learning_rate": 0.0001, + "loss": 0.8031, + "step": 2424 + }, + { + "epoch": 2.1553160759915566, + "grad_norm": 0.49188151955604553, + "learning_rate": 0.0001, + "loss": 0.9334, + "step": 2425 + }, + { + "epoch": 2.156204866125986, + "grad_norm": 0.7494956254959106, + "learning_rate": 0.0001, + "loss": 0.9039, + "step": 2426 + }, + { + "epoch": 2.1570936562604155, + "grad_norm": 0.9286481738090515, + "learning_rate": 0.0001, + "loss": 0.9374, + "step": 2427 + }, + { + "epoch": 2.157982446394845, + "grad_norm": 0.4706733822822571, + "learning_rate": 0.0001, + "loss": 0.9537, + "step": 2428 + }, + { + "epoch": 2.1588712365292744, + "grad_norm": 0.4074293076992035, + "learning_rate": 0.0001, + "loss": 0.8819, + "step": 2429 + }, + { + "epoch": 2.159760026663704, + "grad_norm": 0.3723183870315552, + "learning_rate": 0.0001, + "loss": 0.8569, + "step": 2430 + }, + { + "epoch": 2.1606488167981337, + "grad_norm": 0.5987865924835205, + "learning_rate": 0.0001, + "loss": 0.8495, + "step": 2431 + }, + { + "epoch": 2.161537606932563, + "grad_norm": 0.3815048038959503, + "learning_rate": 0.0001, + "loss": 0.8312, + "step": 2432 + }, + { + "epoch": 2.1624263970669926, + "grad_norm": 0.33688196539878845, + "learning_rate": 0.0001, + "loss": 0.8509, + "step": 2433 + }, + { + "epoch": 2.1633151872014222, + "grad_norm": 0.3331957161426544, + "learning_rate": 0.0001, + "loss": 0.8446, + "step": 2434 + }, + { + "epoch": 2.1642039773358515, + "grad_norm": 0.8897263407707214, + "learning_rate": 0.0001, + "loss": 0.7964, + "step": 2435 + }, + { + "epoch": 2.165092767470281, + "grad_norm": 0.3500574231147766, + "learning_rate": 0.0001, + "loss": 0.8289, + "step": 2436 + }, + { + "epoch": 2.165981557604711, + "grad_norm": 0.45759961009025574, + "learning_rate": 0.0001, + "loss": 0.8902, + "step": 2437 + }, + { + "epoch": 2.16687034773914, + "grad_norm": 0.34242385625839233, + "learning_rate": 0.0001, + "loss": 0.8415, + "step": 2438 + }, + { + "epoch": 2.1677591378735697, + "grad_norm": 0.4210834205150604, + "learning_rate": 0.0001, + "loss": 0.9238, + "step": 2439 + }, + { + "epoch": 2.1686479280079993, + "grad_norm": 0.6454192399978638, + "learning_rate": 0.0001, + "loss": 0.8155, + "step": 2440 + }, + { + "epoch": 2.1695367181424285, + "grad_norm": 0.34665006399154663, + "learning_rate": 0.0001, + "loss": 0.8507, + "step": 2441 + }, + { + "epoch": 2.170425508276858, + "grad_norm": 0.3612930178642273, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 2442 + }, + { + "epoch": 2.1713142984112874, + "grad_norm": 0.36009481549263, + "learning_rate": 0.0001, + "loss": 0.8985, + "step": 2443 + }, + { + "epoch": 2.172203088545717, + "grad_norm": 0.392411470413208, + "learning_rate": 0.0001, + "loss": 0.947, + "step": 2444 + }, + { + "epoch": 2.1730918786801467, + "grad_norm": 0.346246600151062, + "learning_rate": 0.0001, + "loss": 0.8688, + "step": 2445 + }, + { + "epoch": 2.173980668814576, + "grad_norm": 0.3527586758136749, + "learning_rate": 0.0001, + "loss": 0.8966, + "step": 2446 + }, + { + "epoch": 2.1748694589490056, + "grad_norm": 0.3666391968727112, + "learning_rate": 0.0001, + "loss": 0.945, + "step": 2447 + }, + { + "epoch": 2.1757582490834353, + "grad_norm": 0.36609384417533875, + "learning_rate": 0.0001, + "loss": 0.844, + "step": 2448 + }, + { + "epoch": 2.1766470392178645, + "grad_norm": 0.33666878938674927, + "learning_rate": 0.0001, + "loss": 0.7822, + "step": 2449 + }, + { + "epoch": 2.177535829352294, + "grad_norm": 0.41050276160240173, + "learning_rate": 0.0001, + "loss": 0.9306, + "step": 2450 + }, + { + "epoch": 2.178424619486724, + "grad_norm": 0.33751270174980164, + "learning_rate": 0.0001, + "loss": 0.8642, + "step": 2451 + }, + { + "epoch": 2.179313409621153, + "grad_norm": 0.32639411091804504, + "learning_rate": 0.0001, + "loss": 0.8057, + "step": 2452 + }, + { + "epoch": 2.1802021997555827, + "grad_norm": 0.34592291712760925, + "learning_rate": 0.0001, + "loss": 0.9396, + "step": 2453 + }, + { + "epoch": 2.1810909898900124, + "grad_norm": 0.32606473565101624, + "learning_rate": 0.0001, + "loss": 0.8235, + "step": 2454 + }, + { + "epoch": 2.1819797800244416, + "grad_norm": 0.33683574199676514, + "learning_rate": 0.0001, + "loss": 0.8805, + "step": 2455 + }, + { + "epoch": 2.1828685701588713, + "grad_norm": 0.36452654004096985, + "learning_rate": 0.0001, + "loss": 0.8843, + "step": 2456 + }, + { + "epoch": 2.183757360293301, + "grad_norm": 0.3444773256778717, + "learning_rate": 0.0001, + "loss": 0.9527, + "step": 2457 + }, + { + "epoch": 2.18464615042773, + "grad_norm": 0.32133087515830994, + "learning_rate": 0.0001, + "loss": 0.8259, + "step": 2458 + }, + { + "epoch": 2.18553494056216, + "grad_norm": 0.3569784462451935, + "learning_rate": 0.0001, + "loss": 0.9475, + "step": 2459 + }, + { + "epoch": 2.1864237306965895, + "grad_norm": 0.3464709222316742, + "learning_rate": 0.0001, + "loss": 0.8372, + "step": 2460 + }, + { + "epoch": 2.1873125208310187, + "grad_norm": 0.30103108286857605, + "learning_rate": 0.0001, + "loss": 0.843, + "step": 2461 + }, + { + "epoch": 2.1882013109654483, + "grad_norm": 0.31629863381385803, + "learning_rate": 0.0001, + "loss": 0.8342, + "step": 2462 + }, + { + "epoch": 2.189090101099878, + "grad_norm": 0.31571292877197266, + "learning_rate": 0.0001, + "loss": 0.8531, + "step": 2463 + }, + { + "epoch": 2.1899788912343072, + "grad_norm": 0.36305055022239685, + "learning_rate": 0.0001, + "loss": 0.8899, + "step": 2464 + }, + { + "epoch": 2.190867681368737, + "grad_norm": 0.31181925535202026, + "learning_rate": 0.0001, + "loss": 0.8869, + "step": 2465 + }, + { + "epoch": 2.191756471503166, + "grad_norm": 0.33491456508636475, + "learning_rate": 0.0001, + "loss": 0.911, + "step": 2466 + }, + { + "epoch": 2.1926452616375958, + "grad_norm": 0.3482362926006317, + "learning_rate": 0.0001, + "loss": 0.8501, + "step": 2467 + }, + { + "epoch": 2.1935340517720254, + "grad_norm": 0.3489706516265869, + "learning_rate": 0.0001, + "loss": 0.854, + "step": 2468 + }, + { + "epoch": 2.1944228419064546, + "grad_norm": 0.33438587188720703, + "learning_rate": 0.0001, + "loss": 0.8383, + "step": 2469 + }, + { + "epoch": 2.1953116320408843, + "grad_norm": 0.3124372363090515, + "learning_rate": 0.0001, + "loss": 0.8203, + "step": 2470 + }, + { + "epoch": 2.196200422175314, + "grad_norm": 0.349856972694397, + "learning_rate": 0.0001, + "loss": 0.9404, + "step": 2471 + }, + { + "epoch": 2.197089212309743, + "grad_norm": 0.3826901316642761, + "learning_rate": 0.0001, + "loss": 0.9455, + "step": 2472 + }, + { + "epoch": 2.197978002444173, + "grad_norm": 0.34957224130630493, + "learning_rate": 0.0001, + "loss": 0.938, + "step": 2473 + }, + { + "epoch": 2.1988667925786025, + "grad_norm": 0.28016459941864014, + "learning_rate": 0.0001, + "loss": 0.9001, + "step": 2474 + }, + { + "epoch": 2.1997555827130317, + "grad_norm": 0.39419126510620117, + "learning_rate": 0.0001, + "loss": 0.9849, + "step": 2475 + }, + { + "epoch": 2.2006443728474614, + "grad_norm": 0.3508966267108917, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 2476 + }, + { + "epoch": 2.201533162981891, + "grad_norm": 0.32400640845298767, + "learning_rate": 0.0001, + "loss": 0.8503, + "step": 2477 + }, + { + "epoch": 2.2024219531163203, + "grad_norm": 0.31029975414276123, + "learning_rate": 0.0001, + "loss": 0.9593, + "step": 2478 + }, + { + "epoch": 2.20331074325075, + "grad_norm": 0.34035494923591614, + "learning_rate": 0.0001, + "loss": 1.0073, + "step": 2479 + }, + { + "epoch": 2.2041995333851796, + "grad_norm": 0.34316977858543396, + "learning_rate": 0.0001, + "loss": 0.8212, + "step": 2480 + }, + { + "epoch": 2.205088323519609, + "grad_norm": 0.3366788625717163, + "learning_rate": 0.0001, + "loss": 0.8808, + "step": 2481 + }, + { + "epoch": 2.2059771136540385, + "grad_norm": 0.36008214950561523, + "learning_rate": 0.0001, + "loss": 0.8823, + "step": 2482 + }, + { + "epoch": 2.206865903788468, + "grad_norm": 0.3107230067253113, + "learning_rate": 0.0001, + "loss": 0.9265, + "step": 2483 + }, + { + "epoch": 2.2077546939228974, + "grad_norm": 0.328205406665802, + "learning_rate": 0.0001, + "loss": 0.8796, + "step": 2484 + }, + { + "epoch": 2.208643484057327, + "grad_norm": 0.33115193247795105, + "learning_rate": 0.0001, + "loss": 0.9549, + "step": 2485 + }, + { + "epoch": 2.2095322741917567, + "grad_norm": 0.31935736536979675, + "learning_rate": 0.0001, + "loss": 0.9173, + "step": 2486 + }, + { + "epoch": 2.210421064326186, + "grad_norm": 0.3087455630302429, + "learning_rate": 0.0001, + "loss": 0.8084, + "step": 2487 + }, + { + "epoch": 2.2113098544606156, + "grad_norm": 0.38220590353012085, + "learning_rate": 0.0001, + "loss": 1.0343, + "step": 2488 + }, + { + "epoch": 2.212198644595045, + "grad_norm": 0.3595533072948456, + "learning_rate": 0.0001, + "loss": 0.9296, + "step": 2489 + }, + { + "epoch": 2.2130874347294744, + "grad_norm": 0.35236823558807373, + "learning_rate": 0.0001, + "loss": 0.9115, + "step": 2490 + }, + { + "epoch": 2.213976224863904, + "grad_norm": 0.2661668360233307, + "learning_rate": 0.0001, + "loss": 0.829, + "step": 2491 + }, + { + "epoch": 2.2148650149983333, + "grad_norm": 0.3272991478443146, + "learning_rate": 0.0001, + "loss": 0.8619, + "step": 2492 + }, + { + "epoch": 2.215753805132763, + "grad_norm": 0.3380112648010254, + "learning_rate": 0.0001, + "loss": 0.8567, + "step": 2493 + }, + { + "epoch": 2.2166425952671927, + "grad_norm": 0.36716368794441223, + "learning_rate": 0.0001, + "loss": 0.9482, + "step": 2494 + }, + { + "epoch": 2.217531385401622, + "grad_norm": 0.32290229201316833, + "learning_rate": 0.0001, + "loss": 0.8206, + "step": 2495 + }, + { + "epoch": 2.2184201755360515, + "grad_norm": 0.3434554934501648, + "learning_rate": 0.0001, + "loss": 0.829, + "step": 2496 + }, + { + "epoch": 2.219308965670481, + "grad_norm": 0.3272440433502197, + "learning_rate": 0.0001, + "loss": 0.9398, + "step": 2497 + }, + { + "epoch": 2.2201977558049104, + "grad_norm": 0.35004308819770813, + "learning_rate": 0.0001, + "loss": 0.9388, + "step": 2498 + }, + { + "epoch": 2.22108654593934, + "grad_norm": 0.38155871629714966, + "learning_rate": 0.0001, + "loss": 0.8243, + "step": 2499 + }, + { + "epoch": 2.2219753360737697, + "grad_norm": 0.36860787868499756, + "learning_rate": 0.0001, + "loss": 0.9036, + "step": 2500 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 1.3367420220730245e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}