diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27309 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.98961937716263, + "eval_steps": 500, + "global_step": 3897, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002306805074971165, + "grad_norm": 0.0, + "learning_rate": 1.7094017094017097e-07, + "loss": 8.4508, + "step": 1 + }, + { + "epoch": 0.00461361014994233, + "grad_norm": 0.0, + "learning_rate": 3.4188034188034194e-07, + "loss": 8.0474, + "step": 2 + }, + { + "epoch": 0.006920415224913495, + "grad_norm": 0.0, + "learning_rate": 5.128205128205128e-07, + "loss": 8.3708, + "step": 3 + }, + { + "epoch": 0.00922722029988466, + "grad_norm": 0.0, + "learning_rate": 6.837606837606839e-07, + "loss": 8.4639, + "step": 4 + }, + { + "epoch": 0.011534025374855825, + "grad_norm": 0.0, + "learning_rate": 8.547008547008548e-07, + "loss": 8.0678, + "step": 5 + }, + { + "epoch": 0.01384083044982699, + "grad_norm": 0.0, + "learning_rate": 1.0256410256410257e-06, + "loss": 7.6616, + "step": 6 + }, + { + "epoch": 0.016147635524798153, + "grad_norm": 0.0, + "learning_rate": 1.1965811965811968e-06, + "loss": 8.9771, + "step": 7 + }, + { + "epoch": 0.01845444059976932, + "grad_norm": 0.0, + "learning_rate": 1.3675213675213678e-06, + "loss": 8.5794, + "step": 8 + }, + { + "epoch": 0.020761245674740483, + "grad_norm": 0.0, + "learning_rate": 1.5384615384615387e-06, + "loss": 8.0316, + "step": 9 + }, + { + "epoch": 0.02306805074971165, + "grad_norm": 0.0, + "learning_rate": 1.7094017094017097e-06, + "loss": 8.0619, + "step": 10 + }, + { + "epoch": 0.025374855824682813, + "grad_norm": 0.0, + "learning_rate": 1.8803418803418804e-06, + "loss": 7.922, + "step": 11 + }, + { + "epoch": 0.02768166089965398, + "grad_norm": 0.0, + "learning_rate": 2.0512820512820513e-06, + "loss": 8.1813, + "step": 12 + }, + { + "epoch": 0.029988465974625143, + "grad_norm": 0.0, + "learning_rate": 2.222222222222222e-06, + "loss": 7.2392, + "step": 13 + }, + { + "epoch": 0.03229527104959631, + "grad_norm": 0.0, + "learning_rate": 2.3931623931623937e-06, + "loss": 6.1259, + "step": 14 + }, + { + "epoch": 0.03460207612456748, + "grad_norm": 0.0, + "learning_rate": 2.564102564102564e-06, + "loss": 6.2235, + "step": 15 + }, + { + "epoch": 0.03690888119953864, + "grad_norm": 0.0, + "learning_rate": 2.7350427350427355e-06, + "loss": 5.5032, + "step": 16 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 0.0, + "learning_rate": 2.9059829059829063e-06, + "loss": 5.3191, + "step": 17 + }, + { + "epoch": 0.04152249134948097, + "grad_norm": 0.0, + "learning_rate": 3.0769230769230774e-06, + "loss": 5.3464, + "step": 18 + }, + { + "epoch": 0.04382929642445214, + "grad_norm": 0.0, + "learning_rate": 3.247863247863248e-06, + "loss": 4.8332, + "step": 19 + }, + { + "epoch": 0.0461361014994233, + "grad_norm": 0.0, + "learning_rate": 3.4188034188034193e-06, + "loss": 4.3945, + "step": 20 + }, + { + "epoch": 0.04844290657439446, + "grad_norm": 0.0, + "learning_rate": 3.58974358974359e-06, + "loss": 4.1624, + "step": 21 + }, + { + "epoch": 0.05074971164936563, + "grad_norm": 0.0, + "learning_rate": 3.760683760683761e-06, + "loss": 2.8312, + "step": 22 + }, + { + "epoch": 0.0530565167243368, + "grad_norm": 0.0, + "learning_rate": 3.9316239316239315e-06, + "loss": 2.5994, + "step": 23 + }, + { + "epoch": 0.05536332179930796, + "grad_norm": 0.0, + "learning_rate": 4.102564102564103e-06, + "loss": 2.7753, + "step": 24 + }, + { + "epoch": 0.05767012687427912, + "grad_norm": 0.0, + "learning_rate": 4.273504273504274e-06, + "loss": 2.7568, + "step": 25 + }, + { + "epoch": 0.05997693194925029, + "grad_norm": 0.0, + "learning_rate": 4.444444444444444e-06, + "loss": 2.5088, + "step": 26 + }, + { + "epoch": 0.06228373702422145, + "grad_norm": 0.0, + "learning_rate": 4.615384615384616e-06, + "loss": 2.3828, + "step": 27 + }, + { + "epoch": 0.06459054209919261, + "grad_norm": 0.0, + "learning_rate": 4.786324786324787e-06, + "loss": 2.2915, + "step": 28 + }, + { + "epoch": 0.06689734717416378, + "grad_norm": 0.0, + "learning_rate": 4.957264957264958e-06, + "loss": 2.2262, + "step": 29 + }, + { + "epoch": 0.06920415224913495, + "grad_norm": 0.0, + "learning_rate": 5.128205128205128e-06, + "loss": 3.11, + "step": 30 + }, + { + "epoch": 0.07151095732410612, + "grad_norm": 0.0, + "learning_rate": 5.2991452991453e-06, + "loss": 2.3123, + "step": 31 + }, + { + "epoch": 0.07381776239907728, + "grad_norm": 0.0, + "learning_rate": 5.470085470085471e-06, + "loss": 2.3687, + "step": 32 + }, + { + "epoch": 0.07612456747404844, + "grad_norm": 0.0, + "learning_rate": 5.641025641025641e-06, + "loss": 2.1863, + "step": 33 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.0, + "learning_rate": 5.8119658119658126e-06, + "loss": 2.8675, + "step": 34 + }, + { + "epoch": 0.08073817762399077, + "grad_norm": 0.0, + "learning_rate": 5.982905982905983e-06, + "loss": 1.8582, + "step": 35 + }, + { + "epoch": 0.08304498269896193, + "grad_norm": 0.0, + "learning_rate": 6.153846153846155e-06, + "loss": 2.0706, + "step": 36 + }, + { + "epoch": 0.0853517877739331, + "grad_norm": 0.0, + "learning_rate": 6.324786324786325e-06, + "loss": 2.4735, + "step": 37 + }, + { + "epoch": 0.08765859284890427, + "grad_norm": 0.0, + "learning_rate": 6.495726495726496e-06, + "loss": 2.0614, + "step": 38 + }, + { + "epoch": 0.08996539792387544, + "grad_norm": 0.0, + "learning_rate": 6.666666666666667e-06, + "loss": 2.5187, + "step": 39 + }, + { + "epoch": 0.0922722029988466, + "grad_norm": 0.0, + "learning_rate": 6.837606837606839e-06, + "loss": 1.9505, + "step": 40 + }, + { + "epoch": 0.09457900807381776, + "grad_norm": 0.0, + "learning_rate": 7.008547008547009e-06, + "loss": 1.9036, + "step": 41 + }, + { + "epoch": 0.09688581314878893, + "grad_norm": 0.0, + "learning_rate": 7.17948717948718e-06, + "loss": 1.8737, + "step": 42 + }, + { + "epoch": 0.09919261822376009, + "grad_norm": 0.0, + "learning_rate": 7.350427350427351e-06, + "loss": 2.0507, + "step": 43 + }, + { + "epoch": 0.10149942329873125, + "grad_norm": 0.0, + "learning_rate": 7.521367521367522e-06, + "loss": 1.3861, + "step": 44 + }, + { + "epoch": 0.10380622837370242, + "grad_norm": 0.0, + "learning_rate": 7.692307692307694e-06, + "loss": 1.7961, + "step": 45 + }, + { + "epoch": 0.1061130334486736, + "grad_norm": 0.0, + "learning_rate": 7.863247863247863e-06, + "loss": 1.8537, + "step": 46 + }, + { + "epoch": 0.10841983852364476, + "grad_norm": 0.0, + "learning_rate": 8.034188034188036e-06, + "loss": 2.1222, + "step": 47 + }, + { + "epoch": 0.11072664359861592, + "grad_norm": 0.0, + "learning_rate": 8.205128205128205e-06, + "loss": 1.7843, + "step": 48 + }, + { + "epoch": 0.11303344867358708, + "grad_norm": 0.0, + "learning_rate": 8.376068376068377e-06, + "loss": 1.607, + "step": 49 + }, + { + "epoch": 0.11534025374855825, + "grad_norm": 0.0, + "learning_rate": 8.547008547008548e-06, + "loss": 2.1408, + "step": 50 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.0, + "learning_rate": 8.717948717948719e-06, + "loss": 1.9771, + "step": 51 + }, + { + "epoch": 0.11995386389850057, + "grad_norm": 0.0, + "learning_rate": 8.888888888888888e-06, + "loss": 1.6614, + "step": 52 + }, + { + "epoch": 0.12226066897347174, + "grad_norm": 0.0, + "learning_rate": 9.059829059829061e-06, + "loss": 1.3775, + "step": 53 + }, + { + "epoch": 0.1245674740484429, + "grad_norm": 0.0, + "learning_rate": 9.230769230769232e-06, + "loss": 1.7402, + "step": 54 + }, + { + "epoch": 0.12687427912341406, + "grad_norm": 0.0, + "learning_rate": 9.401709401709402e-06, + "loss": 1.6679, + "step": 55 + }, + { + "epoch": 0.12918108419838523, + "grad_norm": 0.0, + "learning_rate": 9.572649572649575e-06, + "loss": 1.6464, + "step": 56 + }, + { + "epoch": 0.1314878892733564, + "grad_norm": 0.0, + "learning_rate": 9.743589743589744e-06, + "loss": 1.4238, + "step": 57 + }, + { + "epoch": 0.13379469434832755, + "grad_norm": 0.0, + "learning_rate": 9.914529914529915e-06, + "loss": 1.7369, + "step": 58 + }, + { + "epoch": 0.13610149942329874, + "grad_norm": 0.0, + "learning_rate": 1.0085470085470086e-05, + "loss": 1.0986, + "step": 59 + }, + { + "epoch": 0.1384083044982699, + "grad_norm": 0.0, + "learning_rate": 1.0256410256410256e-05, + "loss": 1.5508, + "step": 60 + }, + { + "epoch": 0.14071510957324107, + "grad_norm": 0.0, + "learning_rate": 1.0427350427350429e-05, + "loss": 1.1049, + "step": 61 + }, + { + "epoch": 0.14302191464821223, + "grad_norm": 0.0, + "learning_rate": 1.05982905982906e-05, + "loss": 1.6256, + "step": 62 + }, + { + "epoch": 0.1453287197231834, + "grad_norm": 0.0, + "learning_rate": 1.076923076923077e-05, + "loss": 1.4749, + "step": 63 + }, + { + "epoch": 0.14763552479815456, + "grad_norm": 0.0, + "learning_rate": 1.0940170940170942e-05, + "loss": 1.6895, + "step": 64 + }, + { + "epoch": 0.14994232987312572, + "grad_norm": 0.0, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.8539, + "step": 65 + }, + { + "epoch": 0.1522491349480969, + "grad_norm": 0.0, + "learning_rate": 1.1282051282051283e-05, + "loss": 1.5428, + "step": 66 + }, + { + "epoch": 0.15455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.1452991452991454e-05, + "loss": 1.1057, + "step": 67 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.0, + "learning_rate": 1.1623931623931625e-05, + "loss": 1.4131, + "step": 68 + }, + { + "epoch": 0.15916955017301038, + "grad_norm": 0.0, + "learning_rate": 1.1794871794871796e-05, + "loss": 1.6288, + "step": 69 + }, + { + "epoch": 0.16147635524798154, + "grad_norm": 0.0, + "learning_rate": 1.1965811965811966e-05, + "loss": 1.6981, + "step": 70 + }, + { + "epoch": 0.1637831603229527, + "grad_norm": 0.0, + "learning_rate": 1.2136752136752137e-05, + "loss": 1.5066, + "step": 71 + }, + { + "epoch": 0.16608996539792387, + "grad_norm": 0.0, + "learning_rate": 1.230769230769231e-05, + "loss": 2.0456, + "step": 72 + }, + { + "epoch": 0.16839677047289503, + "grad_norm": 0.0, + "learning_rate": 1.247863247863248e-05, + "loss": 1.2475, + "step": 73 + }, + { + "epoch": 0.1707035755478662, + "grad_norm": 0.0, + "learning_rate": 1.264957264957265e-05, + "loss": 1.7418, + "step": 74 + }, + { + "epoch": 0.17301038062283736, + "grad_norm": 0.0, + "learning_rate": 1.2820512820512823e-05, + "loss": 1.444, + "step": 75 + }, + { + "epoch": 0.17531718569780855, + "grad_norm": 0.0, + "learning_rate": 1.2991452991452993e-05, + "loss": 1.688, + "step": 76 + }, + { + "epoch": 0.1776239907727797, + "grad_norm": 0.0, + "learning_rate": 1.3162393162393164e-05, + "loss": 1.4214, + "step": 77 + }, + { + "epoch": 0.17993079584775087, + "grad_norm": 0.0, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.9668, + "step": 78 + }, + { + "epoch": 0.18223760092272204, + "grad_norm": 0.0, + "learning_rate": 1.3504273504273506e-05, + "loss": 1.748, + "step": 79 + }, + { + "epoch": 0.1845444059976932, + "grad_norm": 0.0, + "learning_rate": 1.3675213675213677e-05, + "loss": 1.4736, + "step": 80 + }, + { + "epoch": 0.18685121107266436, + "grad_norm": 0.0, + "learning_rate": 1.3846153846153847e-05, + "loss": 1.6399, + "step": 81 + }, + { + "epoch": 0.18915801614763553, + "grad_norm": 0.0, + "learning_rate": 1.4017094017094018e-05, + "loss": 1.1332, + "step": 82 + }, + { + "epoch": 0.1914648212226067, + "grad_norm": 0.0, + "learning_rate": 1.4188034188034189e-05, + "loss": 1.4893, + "step": 83 + }, + { + "epoch": 0.19377162629757785, + "grad_norm": 0.0, + "learning_rate": 1.435897435897436e-05, + "loss": 1.3734, + "step": 84 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.0, + "learning_rate": 1.4529914529914531e-05, + "loss": 1.3934, + "step": 85 + }, + { + "epoch": 0.19838523644752018, + "grad_norm": 0.0, + "learning_rate": 1.4700854700854703e-05, + "loss": 1.3689, + "step": 86 + }, + { + "epoch": 0.20069204152249134, + "grad_norm": 0.0, + "learning_rate": 1.4871794871794874e-05, + "loss": 1.0738, + "step": 87 + }, + { + "epoch": 0.2029988465974625, + "grad_norm": 0.0, + "learning_rate": 1.5042735042735043e-05, + "loss": 1.2578, + "step": 88 + }, + { + "epoch": 0.20530565167243367, + "grad_norm": 0.0, + "learning_rate": 1.5213675213675214e-05, + "loss": 1.6239, + "step": 89 + }, + { + "epoch": 0.20761245674740483, + "grad_norm": 0.0, + "learning_rate": 1.5384615384615387e-05, + "loss": 1.7462, + "step": 90 + }, + { + "epoch": 0.209919261822376, + "grad_norm": 0.0, + "learning_rate": 1.555555555555556e-05, + "loss": 1.2939, + "step": 91 + }, + { + "epoch": 0.2122260668973472, + "grad_norm": 0.0, + "learning_rate": 1.5726495726495726e-05, + "loss": 1.2444, + "step": 92 + }, + { + "epoch": 0.21453287197231835, + "grad_norm": 0.0, + "learning_rate": 1.5897435897435897e-05, + "loss": 1.4019, + "step": 93 + }, + { + "epoch": 0.21683967704728951, + "grad_norm": 0.0, + "learning_rate": 1.6068376068376072e-05, + "loss": 1.3926, + "step": 94 + }, + { + "epoch": 0.21914648212226068, + "grad_norm": 0.0, + "learning_rate": 1.623931623931624e-05, + "loss": 1.4956, + "step": 95 + }, + { + "epoch": 0.22145328719723184, + "grad_norm": 0.0, + "learning_rate": 1.641025641025641e-05, + "loss": 1.4025, + "step": 96 + }, + { + "epoch": 0.223760092272203, + "grad_norm": 0.0, + "learning_rate": 1.6581196581196585e-05, + "loss": 2.0986, + "step": 97 + }, + { + "epoch": 0.22606689734717417, + "grad_norm": 0.0, + "learning_rate": 1.6752136752136753e-05, + "loss": 1.3982, + "step": 98 + }, + { + "epoch": 0.22837370242214533, + "grad_norm": 0.0, + "learning_rate": 1.6923076923076924e-05, + "loss": 1.6585, + "step": 99 + }, + { + "epoch": 0.2306805074971165, + "grad_norm": 0.0, + "learning_rate": 1.7094017094017095e-05, + "loss": 1.5725, + "step": 100 + }, + { + "epoch": 0.23298731257208766, + "grad_norm": 0.0, + "learning_rate": 1.7264957264957267e-05, + "loss": 1.3001, + "step": 101 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.0, + "learning_rate": 1.7435897435897438e-05, + "loss": 1.4395, + "step": 102 + }, + { + "epoch": 0.23760092272202998, + "grad_norm": 0.0, + "learning_rate": 1.760683760683761e-05, + "loss": 1.7317, + "step": 103 + }, + { + "epoch": 0.23990772779700115, + "grad_norm": 0.0, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.5907, + "step": 104 + }, + { + "epoch": 0.2422145328719723, + "grad_norm": 0.0, + "learning_rate": 1.794871794871795e-05, + "loss": 1.7229, + "step": 105 + }, + { + "epoch": 0.24452133794694347, + "grad_norm": 0.0, + "learning_rate": 1.8119658119658122e-05, + "loss": 1.7672, + "step": 106 + }, + { + "epoch": 0.24682814302191464, + "grad_norm": 0.0, + "learning_rate": 1.829059829059829e-05, + "loss": 1.511, + "step": 107 + }, + { + "epoch": 0.2491349480968858, + "grad_norm": 0.0, + "learning_rate": 1.8461538461538465e-05, + "loss": 1.6003, + "step": 108 + }, + { + "epoch": 0.25144175317185696, + "grad_norm": 0.0, + "learning_rate": 1.8632478632478636e-05, + "loss": 1.2508, + "step": 109 + }, + { + "epoch": 0.2537485582468281, + "grad_norm": 0.0, + "learning_rate": 1.8803418803418804e-05, + "loss": 1.2233, + "step": 110 + }, + { + "epoch": 0.2560553633217993, + "grad_norm": 0.0, + "learning_rate": 1.8974358974358975e-05, + "loss": 1.4052, + "step": 111 + }, + { + "epoch": 0.25836216839677045, + "grad_norm": 0.0, + "learning_rate": 1.914529914529915e-05, + "loss": 1.1971, + "step": 112 + }, + { + "epoch": 0.2606689734717416, + "grad_norm": 0.0, + "learning_rate": 1.9316239316239317e-05, + "loss": 1.4098, + "step": 113 + }, + { + "epoch": 0.2629757785467128, + "grad_norm": 0.0, + "learning_rate": 1.9487179487179488e-05, + "loss": 1.4005, + "step": 114 + }, + { + "epoch": 0.26528258362168394, + "grad_norm": 0.0, + "learning_rate": 1.965811965811966e-05, + "loss": 1.5905, + "step": 115 + }, + { + "epoch": 0.2675893886966551, + "grad_norm": 0.0, + "learning_rate": 1.982905982905983e-05, + "loss": 1.2455, + "step": 116 + }, + { + "epoch": 0.2698961937716263, + "grad_norm": 0.0, + "learning_rate": 2e-05, + "loss": 0.967, + "step": 117 + }, + { + "epoch": 0.2722029988465975, + "grad_norm": 0.0, + "learning_rate": 1.9999996546287957e-05, + "loss": 2.0659, + "step": 118 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.0, + "learning_rate": 1.9999986185154213e-05, + "loss": 1.5323, + "step": 119 + }, + { + "epoch": 0.2768166089965398, + "grad_norm": 0.0, + "learning_rate": 1.999996891660592e-05, + "loss": 1.1827, + "step": 120 + }, + { + "epoch": 0.279123414071511, + "grad_norm": 0.0, + "learning_rate": 1.9999944740655016e-05, + "loss": 1.4273, + "step": 121 + }, + { + "epoch": 0.28143021914648214, + "grad_norm": 0.0, + "learning_rate": 1.999991365731819e-05, + "loss": 1.4416, + "step": 122 + }, + { + "epoch": 0.2837370242214533, + "grad_norm": 0.0, + "learning_rate": 1.9999875666616918e-05, + "loss": 1.9135, + "step": 123 + }, + { + "epoch": 0.28604382929642447, + "grad_norm": 0.0, + "learning_rate": 1.9999830768577445e-05, + "loss": 1.7693, + "step": 124 + }, + { + "epoch": 0.28835063437139563, + "grad_norm": 0.0, + "learning_rate": 1.9999778963230775e-05, + "loss": 1.6087, + "step": 125 + }, + { + "epoch": 0.2906574394463668, + "grad_norm": 0.0, + "learning_rate": 1.99997202506127e-05, + "loss": 1.473, + "step": 126 + }, + { + "epoch": 0.29296424452133796, + "grad_norm": 0.0, + "learning_rate": 1.999965463076377e-05, + "loss": 1.4247, + "step": 127 + }, + { + "epoch": 0.2952710495963091, + "grad_norm": 0.0, + "learning_rate": 1.9999582103729316e-05, + "loss": 1.2409, + "step": 128 + }, + { + "epoch": 0.2975778546712803, + "grad_norm": 0.0, + "learning_rate": 1.9999502669559432e-05, + "loss": 1.5903, + "step": 129 + }, + { + "epoch": 0.29988465974625145, + "grad_norm": 0.0, + "learning_rate": 1.999941632830899e-05, + "loss": 1.2961, + "step": 130 + }, + { + "epoch": 0.3021914648212226, + "grad_norm": 0.0, + "learning_rate": 1.9999323080037623e-05, + "loss": 1.3102, + "step": 131 + }, + { + "epoch": 0.3044982698961938, + "grad_norm": 0.0, + "learning_rate": 1.999922292480975e-05, + "loss": 1.5295, + "step": 132 + }, + { + "epoch": 0.30680507497116494, + "grad_norm": 0.0, + "learning_rate": 1.9999115862694547e-05, + "loss": 1.2908, + "step": 133 + }, + { + "epoch": 0.3091118800461361, + "grad_norm": 0.0, + "learning_rate": 1.999900189376597e-05, + "loss": 1.0529, + "step": 134 + }, + { + "epoch": 0.31141868512110726, + "grad_norm": 0.0, + "learning_rate": 1.9998881018102735e-05, + "loss": 1.6819, + "step": 135 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.0, + "learning_rate": 1.9998753235788345e-05, + "loss": 1.2558, + "step": 136 + }, + { + "epoch": 0.3160322952710496, + "grad_norm": 0.0, + "learning_rate": 1.999861854691106e-05, + "loss": 1.5775, + "step": 137 + }, + { + "epoch": 0.31833910034602075, + "grad_norm": 0.0, + "learning_rate": 1.9998476951563914e-05, + "loss": 1.4106, + "step": 138 + }, + { + "epoch": 0.3206459054209919, + "grad_norm": 0.0, + "learning_rate": 1.9998328449844715e-05, + "loss": 1.4993, + "step": 139 + }, + { + "epoch": 0.3229527104959631, + "grad_norm": 0.0, + "learning_rate": 1.9998173041856042e-05, + "loss": 1.4359, + "step": 140 + }, + { + "epoch": 0.32525951557093424, + "grad_norm": 0.0, + "learning_rate": 1.9998010727705237e-05, + "loss": 1.2593, + "step": 141 + }, + { + "epoch": 0.3275663206459054, + "grad_norm": 0.0, + "learning_rate": 1.999784150750442e-05, + "loss": 1.1497, + "step": 142 + }, + { + "epoch": 0.32987312572087657, + "grad_norm": 0.0, + "learning_rate": 1.9997665381370477e-05, + "loss": 1.279, + "step": 143 + }, + { + "epoch": 0.33217993079584773, + "grad_norm": 0.0, + "learning_rate": 1.999748234942507e-05, + "loss": 1.1369, + "step": 144 + }, + { + "epoch": 0.3344867358708189, + "grad_norm": 0.0, + "learning_rate": 1.999729241179462e-05, + "loss": 1.5372, + "step": 145 + }, + { + "epoch": 0.33679354094579006, + "grad_norm": 0.0, + "learning_rate": 1.9997095568610326e-05, + "loss": 0.9758, + "step": 146 + }, + { + "epoch": 0.3391003460207612, + "grad_norm": 0.0, + "learning_rate": 1.9996891820008165e-05, + "loss": 1.4776, + "step": 147 + }, + { + "epoch": 0.3414071510957324, + "grad_norm": 0.0, + "learning_rate": 1.9996681166128862e-05, + "loss": 1.3023, + "step": 148 + }, + { + "epoch": 0.34371395617070355, + "grad_norm": 0.0, + "learning_rate": 1.999646360711794e-05, + "loss": 1.3596, + "step": 149 + }, + { + "epoch": 0.3460207612456747, + "grad_norm": 0.0, + "learning_rate": 1.999623914312566e-05, + "loss": 1.426, + "step": 150 + }, + { + "epoch": 0.34832756632064593, + "grad_norm": 0.0, + "learning_rate": 1.9996007774307077e-05, + "loss": 1.3939, + "step": 151 + }, + { + "epoch": 0.3506343713956171, + "grad_norm": 0.0, + "learning_rate": 1.9995769500822007e-05, + "loss": 0.88, + "step": 152 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.0, + "learning_rate": 1.9995524322835035e-05, + "loss": 1.4004, + "step": 153 + }, + { + "epoch": 0.3552479815455594, + "grad_norm": 0.0, + "learning_rate": 1.9995272240515515e-05, + "loss": 1.2727, + "step": 154 + }, + { + "epoch": 0.3575547866205306, + "grad_norm": 0.0, + "learning_rate": 1.9995013254037574e-05, + "loss": 1.2857, + "step": 155 + }, + { + "epoch": 0.35986159169550175, + "grad_norm": 0.0, + "learning_rate": 1.99947473635801e-05, + "loss": 1.0745, + "step": 156 + }, + { + "epoch": 0.3621683967704729, + "grad_norm": 0.0, + "learning_rate": 1.999447456932676e-05, + "loss": 1.232, + "step": 157 + }, + { + "epoch": 0.3644752018454441, + "grad_norm": 0.0, + "learning_rate": 1.9994194871465978e-05, + "loss": 1.6852, + "step": 158 + }, + { + "epoch": 0.36678200692041524, + "grad_norm": 0.0, + "learning_rate": 1.999390827019096e-05, + "loss": 1.257, + "step": 159 + }, + { + "epoch": 0.3690888119953864, + "grad_norm": 0.0, + "learning_rate": 1.999361476569967e-05, + "loss": 1.0449, + "step": 160 + }, + { + "epoch": 0.37139561707035756, + "grad_norm": 0.0, + "learning_rate": 1.9993314358194843e-05, + "loss": 1.0744, + "step": 161 + }, + { + "epoch": 0.3737024221453287, + "grad_norm": 0.0, + "learning_rate": 1.9993007047883988e-05, + "loss": 1.5147, + "step": 162 + }, + { + "epoch": 0.3760092272202999, + "grad_norm": 0.0, + "learning_rate": 1.999269283497937e-05, + "loss": 1.5317, + "step": 163 + }, + { + "epoch": 0.37831603229527105, + "grad_norm": 0.0, + "learning_rate": 1.999237171969804e-05, + "loss": 1.0542, + "step": 164 + }, + { + "epoch": 0.3806228373702422, + "grad_norm": 0.0, + "learning_rate": 1.9992043702261795e-05, + "loss": 1.4167, + "step": 165 + }, + { + "epoch": 0.3829296424452134, + "grad_norm": 0.0, + "learning_rate": 1.9991708782897214e-05, + "loss": 1.3442, + "step": 166 + }, + { + "epoch": 0.38523644752018454, + "grad_norm": 0.0, + "learning_rate": 1.9991366961835643e-05, + "loss": 1.2548, + "step": 167 + }, + { + "epoch": 0.3875432525951557, + "grad_norm": 0.0, + "learning_rate": 1.999101823931319e-05, + "loss": 1.3884, + "step": 168 + }, + { + "epoch": 0.38985005767012687, + "grad_norm": 0.0, + "learning_rate": 1.999066261557073e-05, + "loss": 1.3718, + "step": 169 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.0, + "learning_rate": 1.9990300090853917e-05, + "loss": 1.7129, + "step": 170 + }, + { + "epoch": 0.3944636678200692, + "grad_norm": 0.0, + "learning_rate": 1.9989930665413148e-05, + "loss": 1.31, + "step": 171 + }, + { + "epoch": 0.39677047289504036, + "grad_norm": 0.0, + "learning_rate": 1.9989554339503612e-05, + "loss": 1.1476, + "step": 172 + }, + { + "epoch": 0.3990772779700115, + "grad_norm": 0.0, + "learning_rate": 1.998917111338525e-05, + "loss": 1.0301, + "step": 173 + }, + { + "epoch": 0.4013840830449827, + "grad_norm": 0.0, + "learning_rate": 1.998878098732277e-05, + "loss": 1.1074, + "step": 174 + }, + { + "epoch": 0.40369088811995385, + "grad_norm": 0.0, + "learning_rate": 1.9988383961585646e-05, + "loss": 1.1239, + "step": 175 + }, + { + "epoch": 0.405997693194925, + "grad_norm": 0.0, + "learning_rate": 1.998798003644813e-05, + "loss": 1.6276, + "step": 176 + }, + { + "epoch": 0.4083044982698962, + "grad_norm": 0.0, + "learning_rate": 1.9987569212189224e-05, + "loss": 1.4676, + "step": 177 + }, + { + "epoch": 0.41061130334486734, + "grad_norm": 0.0, + "learning_rate": 1.9987151489092707e-05, + "loss": 1.2155, + "step": 178 + }, + { + "epoch": 0.4129181084198385, + "grad_norm": 0.0, + "learning_rate": 1.998672686744711e-05, + "loss": 1.6428, + "step": 179 + }, + { + "epoch": 0.41522491349480967, + "grad_norm": 0.0, + "learning_rate": 1.9986295347545738e-05, + "loss": 1.1825, + "step": 180 + }, + { + "epoch": 0.41753171856978083, + "grad_norm": 0.0, + "learning_rate": 1.998585692968667e-05, + "loss": 1.2577, + "step": 181 + }, + { + "epoch": 0.419838523644752, + "grad_norm": 0.0, + "learning_rate": 1.9985411614172728e-05, + "loss": 1.1877, + "step": 182 + }, + { + "epoch": 0.42214532871972316, + "grad_norm": 0.0, + "learning_rate": 1.998495940131152e-05, + "loss": 1.3028, + "step": 183 + }, + { + "epoch": 0.4244521337946944, + "grad_norm": 0.0, + "learning_rate": 1.9984500291415402e-05, + "loss": 1.0642, + "step": 184 + }, + { + "epoch": 0.42675893886966554, + "grad_norm": 0.0, + "learning_rate": 1.99840342848015e-05, + "loss": 1.7197, + "step": 185 + }, + { + "epoch": 0.4290657439446367, + "grad_norm": 0.0, + "learning_rate": 1.998356138179171e-05, + "loss": 1.1801, + "step": 186 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.0, + "learning_rate": 1.9983081582712684e-05, + "loss": 1.3857, + "step": 187 + }, + { + "epoch": 0.43367935409457903, + "grad_norm": 0.0, + "learning_rate": 1.9982594887895837e-05, + "loss": 1.4002, + "step": 188 + }, + { + "epoch": 0.4359861591695502, + "grad_norm": 0.0, + "learning_rate": 1.998210129767735e-05, + "loss": 1.0701, + "step": 189 + }, + { + "epoch": 0.43829296424452135, + "grad_norm": 0.0, + "learning_rate": 1.9981600812398175e-05, + "loss": 1.2737, + "step": 190 + }, + { + "epoch": 0.4405997693194925, + "grad_norm": 0.0, + "learning_rate": 1.9981093432404006e-05, + "loss": 1.0084, + "step": 191 + }, + { + "epoch": 0.4429065743944637, + "grad_norm": 0.0, + "learning_rate": 1.9980579158045322e-05, + "loss": 1.3017, + "step": 192 + }, + { + "epoch": 0.44521337946943484, + "grad_norm": 0.0, + "learning_rate": 1.9980057989677345e-05, + "loss": 1.0023, + "step": 193 + }, + { + "epoch": 0.447520184544406, + "grad_norm": 0.0, + "learning_rate": 1.9979529927660076e-05, + "loss": 1.226, + "step": 194 + }, + { + "epoch": 0.44982698961937717, + "grad_norm": 0.0, + "learning_rate": 1.9978994972358265e-05, + "loss": 0.7007, + "step": 195 + }, + { + "epoch": 0.45213379469434833, + "grad_norm": 0.0, + "learning_rate": 1.997845312414143e-05, + "loss": 1.0334, + "step": 196 + }, + { + "epoch": 0.4544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.997790438338385e-05, + "loss": 1.2425, + "step": 197 + }, + { + "epoch": 0.45674740484429066, + "grad_norm": 0.0, + "learning_rate": 1.997734875046456e-05, + "loss": 1.3561, + "step": 198 + }, + { + "epoch": 0.4590542099192618, + "grad_norm": 0.0, + "learning_rate": 1.9976786225767365e-05, + "loss": 1.2625, + "step": 199 + }, + { + "epoch": 0.461361014994233, + "grad_norm": 0.0, + "learning_rate": 1.997621680968082e-05, + "loss": 1.1393, + "step": 200 + }, + { + "epoch": 0.46366782006920415, + "grad_norm": 0.0, + "learning_rate": 1.9975640502598243e-05, + "loss": 1.1219, + "step": 201 + }, + { + "epoch": 0.4659746251441753, + "grad_norm": 0.0, + "learning_rate": 1.997505730491772e-05, + "loss": 1.2967, + "step": 202 + }, + { + "epoch": 0.4682814302191465, + "grad_norm": 0.0, + "learning_rate": 1.9974467217042086e-05, + "loss": 1.2347, + "step": 203 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.0, + "learning_rate": 1.9973870239378938e-05, + "loss": 1.2762, + "step": 204 + }, + { + "epoch": 0.4728950403690888, + "grad_norm": 0.0, + "learning_rate": 1.9973266372340638e-05, + "loss": 0.8454, + "step": 205 + }, + { + "epoch": 0.47520184544405997, + "grad_norm": 0.0, + "learning_rate": 1.9972655616344303e-05, + "loss": 1.4203, + "step": 206 + }, + { + "epoch": 0.47750865051903113, + "grad_norm": 0.0, + "learning_rate": 1.9972037971811802e-05, + "loss": 1.4224, + "step": 207 + }, + { + "epoch": 0.4798154555940023, + "grad_norm": 0.0, + "learning_rate": 1.9971413439169777e-05, + "loss": 1.4759, + "step": 208 + }, + { + "epoch": 0.48212226066897346, + "grad_norm": 0.0, + "learning_rate": 1.997078201884961e-05, + "loss": 1.3751, + "step": 209 + }, + { + "epoch": 0.4844290657439446, + "grad_norm": 0.0, + "learning_rate": 1.997014371128746e-05, + "loss": 1.4652, + "step": 210 + }, + { + "epoch": 0.4867358708189158, + "grad_norm": 0.0, + "learning_rate": 1.996949851692422e-05, + "loss": 0.9827, + "step": 211 + }, + { + "epoch": 0.48904267589388695, + "grad_norm": 0.0, + "learning_rate": 1.9968846436205566e-05, + "loss": 1.4189, + "step": 212 + }, + { + "epoch": 0.4913494809688581, + "grad_norm": 0.0, + "learning_rate": 1.996818746958191e-05, + "loss": 1.1802, + "step": 213 + }, + { + "epoch": 0.4936562860438293, + "grad_norm": 0.0, + "learning_rate": 1.996752161750843e-05, + "loss": 0.8799, + "step": 214 + }, + { + "epoch": 0.49596309111880044, + "grad_norm": 0.0, + "learning_rate": 1.996684888044506e-05, + "loss": 1.6296, + "step": 215 + }, + { + "epoch": 0.4982698961937716, + "grad_norm": 0.0, + "learning_rate": 1.9966169258856488e-05, + "loss": 0.8003, + "step": 216 + }, + { + "epoch": 0.5005767012687428, + "grad_norm": 0.0, + "learning_rate": 1.9965482753212154e-05, + "loss": 1.4662, + "step": 217 + }, + { + "epoch": 0.5028835063437139, + "grad_norm": 0.0, + "learning_rate": 1.9964789363986262e-05, + "loss": 1.6058, + "step": 218 + }, + { + "epoch": 0.5051903114186851, + "grad_norm": 0.0, + "learning_rate": 1.996408909165776e-05, + "loss": 1.3763, + "step": 219 + }, + { + "epoch": 0.5074971164936563, + "grad_norm": 0.0, + "learning_rate": 1.996338193671036e-05, + "loss": 1.097, + "step": 220 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.0, + "learning_rate": 1.996266789963252e-05, + "loss": 0.9124, + "step": 221 + }, + { + "epoch": 0.5121107266435986, + "grad_norm": 0.0, + "learning_rate": 1.9961946980917457e-05, + "loss": 1.6222, + "step": 222 + }, + { + "epoch": 0.5144175317185697, + "grad_norm": 0.0, + "learning_rate": 1.996121918106314e-05, + "loss": 1.1872, + "step": 223 + }, + { + "epoch": 0.5167243367935409, + "grad_norm": 0.0, + "learning_rate": 1.9960484500572293e-05, + "loss": 0.9698, + "step": 224 + }, + { + "epoch": 0.5190311418685121, + "grad_norm": 0.0, + "learning_rate": 1.9959742939952393e-05, + "loss": 1.5913, + "step": 225 + }, + { + "epoch": 0.5213379469434832, + "grad_norm": 0.0, + "learning_rate": 1.995899449971566e-05, + "loss": 1.1774, + "step": 226 + }, + { + "epoch": 0.5236447520184544, + "grad_norm": 0.0, + "learning_rate": 1.995823918037908e-05, + "loss": 0.9025, + "step": 227 + }, + { + "epoch": 0.5259515570934256, + "grad_norm": 0.0, + "learning_rate": 1.9957476982464382e-05, + "loss": 0.9416, + "step": 228 + }, + { + "epoch": 0.5282583621683967, + "grad_norm": 0.0, + "learning_rate": 1.9956707906498046e-05, + "loss": 1.2601, + "step": 229 + }, + { + "epoch": 0.5305651672433679, + "grad_norm": 0.0, + "learning_rate": 1.995593195301131e-05, + "loss": 1.2503, + "step": 230 + }, + { + "epoch": 0.532871972318339, + "grad_norm": 0.0, + "learning_rate": 1.995514912254015e-05, + "loss": 0.93, + "step": 231 + }, + { + "epoch": 0.5351787773933102, + "grad_norm": 0.0, + "learning_rate": 1.9954359415625313e-05, + "loss": 1.0196, + "step": 232 + }, + { + "epoch": 0.5374855824682814, + "grad_norm": 0.0, + "learning_rate": 1.995356283281227e-05, + "loss": 1.5443, + "step": 233 + }, + { + "epoch": 0.5397923875432526, + "grad_norm": 0.0, + "learning_rate": 1.9952759374651266e-05, + "loss": 0.9655, + "step": 234 + }, + { + "epoch": 0.5420991926182238, + "grad_norm": 0.0, + "learning_rate": 1.9951949041697272e-05, + "loss": 0.8188, + "step": 235 + }, + { + "epoch": 0.544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.9951131834510034e-05, + "loss": 0.9194, + "step": 236 + }, + { + "epoch": 0.5467128027681661, + "grad_norm": 0.0, + "learning_rate": 1.9950307753654016e-05, + "loss": 1.3431, + "step": 237 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.0, + "learning_rate": 1.9949476799698453e-05, + "loss": 1.4364, + "step": 238 + }, + { + "epoch": 0.5513264129181085, + "grad_norm": 0.0, + "learning_rate": 1.9948638973217324e-05, + "loss": 0.7999, + "step": 239 + }, + { + "epoch": 0.5536332179930796, + "grad_norm": 0.0, + "learning_rate": 1.994779427478934e-05, + "loss": 0.947, + "step": 240 + }, + { + "epoch": 0.5559400230680508, + "grad_norm": 0.0, + "learning_rate": 1.9946942704997982e-05, + "loss": 0.8431, + "step": 241 + }, + { + "epoch": 0.558246828143022, + "grad_norm": 0.0, + "learning_rate": 1.994608426443146e-05, + "loss": 1.3369, + "step": 242 + }, + { + "epoch": 0.5605536332179931, + "grad_norm": 0.0, + "learning_rate": 1.9945218953682736e-05, + "loss": 1.0996, + "step": 243 + }, + { + "epoch": 0.5628604382929643, + "grad_norm": 0.0, + "learning_rate": 1.9944346773349515e-05, + "loss": 1.5973, + "step": 244 + }, + { + "epoch": 0.5651672433679354, + "grad_norm": 0.0, + "learning_rate": 1.9943467724034252e-05, + "loss": 1.2742, + "step": 245 + }, + { + "epoch": 0.5674740484429066, + "grad_norm": 0.0, + "learning_rate": 1.994258180634414e-05, + "loss": 0.9267, + "step": 246 + }, + { + "epoch": 0.5697808535178778, + "grad_norm": 0.0, + "learning_rate": 1.994168902089112e-05, + "loss": 1.3129, + "step": 247 + }, + { + "epoch": 0.5720876585928489, + "grad_norm": 0.0, + "learning_rate": 1.9940789368291888e-05, + "loss": 1.3963, + "step": 248 + }, + { + "epoch": 0.5743944636678201, + "grad_norm": 0.0, + "learning_rate": 1.9939882849167853e-05, + "loss": 1.3136, + "step": 249 + }, + { + "epoch": 0.5767012687427913, + "grad_norm": 0.0, + "learning_rate": 1.99389694641452e-05, + "loss": 1.5613, + "step": 250 + }, + { + "epoch": 0.5790080738177624, + "grad_norm": 0.0, + "learning_rate": 1.993804921385484e-05, + "loss": 1.54, + "step": 251 + }, + { + "epoch": 0.5813148788927336, + "grad_norm": 0.0, + "learning_rate": 1.9937122098932428e-05, + "loss": 1.3285, + "step": 252 + }, + { + "epoch": 0.5836216839677048, + "grad_norm": 0.0, + "learning_rate": 1.993618812001836e-05, + "loss": 1.0518, + "step": 253 + }, + { + "epoch": 0.5859284890426759, + "grad_norm": 0.0, + "learning_rate": 1.9935247277757777e-05, + "loss": 1.4438, + "step": 254 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.0, + "learning_rate": 1.9934299572800556e-05, + "loss": 1.1655, + "step": 255 + }, + { + "epoch": 0.5905420991926182, + "grad_norm": 0.0, + "learning_rate": 1.9933345005801323e-05, + "loss": 1.3974, + "step": 256 + }, + { + "epoch": 0.5928489042675894, + "grad_norm": 0.0, + "learning_rate": 1.9932383577419432e-05, + "loss": 1.3321, + "step": 257 + }, + { + "epoch": 0.5951557093425606, + "grad_norm": 0.0, + "learning_rate": 1.9931415288318985e-05, + "loss": 1.5007, + "step": 258 + }, + { + "epoch": 0.5974625144175317, + "grad_norm": 0.0, + "learning_rate": 1.993044013916882e-05, + "loss": 1.536, + "step": 259 + }, + { + "epoch": 0.5997693194925029, + "grad_norm": 0.0, + "learning_rate": 1.992945813064251e-05, + "loss": 0.936, + "step": 260 + }, + { + "epoch": 0.6020761245674741, + "grad_norm": 0.0, + "learning_rate": 1.9928469263418376e-05, + "loss": 1.1215, + "step": 261 + }, + { + "epoch": 0.6043829296424452, + "grad_norm": 0.0, + "learning_rate": 1.9927473538179467e-05, + "loss": 1.1639, + "step": 262 + }, + { + "epoch": 0.6066897347174164, + "grad_norm": 0.0, + "learning_rate": 1.9926470955613573e-05, + "loss": 1.0756, + "step": 263 + }, + { + "epoch": 0.6089965397923875, + "grad_norm": 0.0, + "learning_rate": 1.9925461516413224e-05, + "loss": 1.0336, + "step": 264 + }, + { + "epoch": 0.6113033448673587, + "grad_norm": 0.0, + "learning_rate": 1.9924445221275673e-05, + "loss": 0.8772, + "step": 265 + }, + { + "epoch": 0.6136101499423299, + "grad_norm": 0.0, + "learning_rate": 1.9923422070902932e-05, + "loss": 1.2074, + "step": 266 + }, + { + "epoch": 0.615916955017301, + "grad_norm": 0.0, + "learning_rate": 1.9922392066001724e-05, + "loss": 1.0987, + "step": 267 + }, + { + "epoch": 0.6182237600922722, + "grad_norm": 0.0, + "learning_rate": 1.992135520728352e-05, + "loss": 1.0906, + "step": 268 + }, + { + "epoch": 0.6205305651672434, + "grad_norm": 0.0, + "learning_rate": 1.992031149546452e-05, + "loss": 1.2893, + "step": 269 + }, + { + "epoch": 0.6228373702422145, + "grad_norm": 0.0, + "learning_rate": 1.9919260931265666e-05, + "loss": 0.9538, + "step": 270 + }, + { + "epoch": 0.6251441753171857, + "grad_norm": 0.0, + "learning_rate": 1.9918203515412616e-05, + "loss": 1.1872, + "step": 271 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.0, + "learning_rate": 1.9917139248635788e-05, + "loss": 0.8881, + "step": 272 + }, + { + "epoch": 0.629757785467128, + "grad_norm": 0.0, + "learning_rate": 1.9916068131670302e-05, + "loss": 0.8143, + "step": 273 + }, + { + "epoch": 0.6320645905420992, + "grad_norm": 0.0, + "learning_rate": 1.9914990165256034e-05, + "loss": 0.9903, + "step": 274 + }, + { + "epoch": 0.6343713956170703, + "grad_norm": 0.0, + "learning_rate": 1.9913905350137575e-05, + "loss": 0.9394, + "step": 275 + }, + { + "epoch": 0.6366782006920415, + "grad_norm": 0.0, + "learning_rate": 1.9912813687064255e-05, + "loss": 1.3582, + "step": 276 + }, + { + "epoch": 0.6389850057670127, + "grad_norm": 0.0, + "learning_rate": 1.991171517679013e-05, + "loss": 1.3316, + "step": 277 + }, + { + "epoch": 0.6412918108419838, + "grad_norm": 0.0, + "learning_rate": 1.9910609820073986e-05, + "loss": 1.006, + "step": 278 + }, + { + "epoch": 0.643598615916955, + "grad_norm": 0.0, + "learning_rate": 1.990949761767935e-05, + "loss": 1.4049, + "step": 279 + }, + { + "epoch": 0.6459054209919262, + "grad_norm": 0.0, + "learning_rate": 1.9908378570374457e-05, + "loss": 0.9906, + "step": 280 + }, + { + "epoch": 0.6482122260668973, + "grad_norm": 0.0, + "learning_rate": 1.990725267893228e-05, + "loss": 1.1612, + "step": 281 + }, + { + "epoch": 0.6505190311418685, + "grad_norm": 0.0, + "learning_rate": 1.9906119944130527e-05, + "loss": 1.0548, + "step": 282 + }, + { + "epoch": 0.6528258362168397, + "grad_norm": 0.0, + "learning_rate": 1.9904980366751624e-05, + "loss": 1.3317, + "step": 283 + }, + { + "epoch": 0.6551326412918108, + "grad_norm": 0.0, + "learning_rate": 1.9903833947582722e-05, + "loss": 1.3214, + "step": 284 + }, + { + "epoch": 0.657439446366782, + "grad_norm": 0.0, + "learning_rate": 1.9902680687415704e-05, + "loss": 1.3262, + "step": 285 + }, + { + "epoch": 0.6597462514417531, + "grad_norm": 0.0, + "learning_rate": 1.9901520587047172e-05, + "loss": 0.9366, + "step": 286 + }, + { + "epoch": 0.6620530565167243, + "grad_norm": 0.0, + "learning_rate": 1.9900353647278466e-05, + "loss": 1.0506, + "step": 287 + }, + { + "epoch": 0.6643598615916955, + "grad_norm": 0.0, + "learning_rate": 1.989917986891563e-05, + "loss": 1.3567, + "step": 288 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.0, + "learning_rate": 1.989799925276945e-05, + "loss": 1.3788, + "step": 289 + }, + { + "epoch": 0.6689734717416378, + "grad_norm": 0.0, + "learning_rate": 1.989681179965542e-05, + "loss": 1.284, + "step": 290 + }, + { + "epoch": 0.671280276816609, + "grad_norm": 0.0, + "learning_rate": 1.9895617510393773e-05, + "loss": 0.6017, + "step": 291 + }, + { + "epoch": 0.6735870818915801, + "grad_norm": 0.0, + "learning_rate": 1.9894416385809444e-05, + "loss": 1.0031, + "step": 292 + }, + { + "epoch": 0.6758938869665513, + "grad_norm": 0.0, + "learning_rate": 1.9893208426732115e-05, + "loss": 1.3046, + "step": 293 + }, + { + "epoch": 0.6782006920415224, + "grad_norm": 0.0, + "learning_rate": 1.9891993633996164e-05, + "loss": 0.8548, + "step": 294 + }, + { + "epoch": 0.6805074971164936, + "grad_norm": 0.0, + "learning_rate": 1.9890772008440703e-05, + "loss": 0.9578, + "step": 295 + }, + { + "epoch": 0.6828143021914648, + "grad_norm": 0.0, + "learning_rate": 1.9889543550909562e-05, + "loss": 0.8643, + "step": 296 + }, + { + "epoch": 0.6851211072664359, + "grad_norm": 0.0, + "learning_rate": 1.9888308262251286e-05, + "loss": 1.0212, + "step": 297 + }, + { + "epoch": 0.6874279123414071, + "grad_norm": 0.0, + "learning_rate": 1.9887066143319145e-05, + "loss": 0.8272, + "step": 298 + }, + { + "epoch": 0.6897347174163783, + "grad_norm": 0.0, + "learning_rate": 1.9885817194971116e-05, + "loss": 1.0797, + "step": 299 + }, + { + "epoch": 0.6920415224913494, + "grad_norm": 0.0, + "learning_rate": 1.988456141806991e-05, + "loss": 1.1806, + "step": 300 + }, + { + "epoch": 0.6943483275663207, + "grad_norm": 0.0, + "learning_rate": 1.988329881348294e-05, + "loss": 1.3157, + "step": 301 + }, + { + "epoch": 0.6966551326412919, + "grad_norm": 0.0, + "learning_rate": 1.9882029382082342e-05, + "loss": 0.932, + "step": 302 + }, + { + "epoch": 0.698961937716263, + "grad_norm": 0.0, + "learning_rate": 1.9880753124744964e-05, + "loss": 1.2185, + "step": 303 + }, + { + "epoch": 0.7012687427912342, + "grad_norm": 0.0, + "learning_rate": 1.9879470042352372e-05, + "loss": 1.4559, + "step": 304 + }, + { + "epoch": 0.7035755478662054, + "grad_norm": 0.0, + "learning_rate": 1.9878180135790844e-05, + "loss": 1.3831, + "step": 305 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.0, + "learning_rate": 1.9876883405951378e-05, + "loss": 1.0567, + "step": 306 + }, + { + "epoch": 0.7081891580161477, + "grad_norm": 0.0, + "learning_rate": 1.9875579853729677e-05, + "loss": 0.7896, + "step": 307 + }, + { + "epoch": 0.7104959630911188, + "grad_norm": 0.0, + "learning_rate": 1.987426948002616e-05, + "loss": 0.958, + "step": 308 + }, + { + "epoch": 0.71280276816609, + "grad_norm": 0.0, + "learning_rate": 1.9872952285745958e-05, + "loss": 1.0281, + "step": 309 + }, + { + "epoch": 0.7151095732410612, + "grad_norm": 0.0, + "learning_rate": 1.987162827179891e-05, + "loss": 1.0784, + "step": 310 + }, + { + "epoch": 0.7174163783160323, + "grad_norm": 0.0, + "learning_rate": 1.9870297439099576e-05, + "loss": 1.2599, + "step": 311 + }, + { + "epoch": 0.7197231833910035, + "grad_norm": 0.0, + "learning_rate": 1.9868959788567213e-05, + "loss": 1.4855, + "step": 312 + }, + { + "epoch": 0.7220299884659747, + "grad_norm": 0.0, + "learning_rate": 1.9867615321125796e-05, + "loss": 1.1607, + "step": 313 + }, + { + "epoch": 0.7243367935409458, + "grad_norm": 0.0, + "learning_rate": 1.9866264037703996e-05, + "loss": 1.1057, + "step": 314 + }, + { + "epoch": 0.726643598615917, + "grad_norm": 0.0, + "learning_rate": 1.9864905939235215e-05, + "loss": 0.9996, + "step": 315 + }, + { + "epoch": 0.7289504036908881, + "grad_norm": 0.0, + "learning_rate": 1.9863541026657542e-05, + "loss": 1.5419, + "step": 316 + }, + { + "epoch": 0.7312572087658593, + "grad_norm": 0.0, + "learning_rate": 1.9862169300913784e-05, + "loss": 1.1848, + "step": 317 + }, + { + "epoch": 0.7335640138408305, + "grad_norm": 0.0, + "learning_rate": 1.9860790762951447e-05, + "loss": 1.1695, + "step": 318 + }, + { + "epoch": 0.7358708189158016, + "grad_norm": 0.0, + "learning_rate": 1.9859405413722745e-05, + "loss": 0.9904, + "step": 319 + }, + { + "epoch": 0.7381776239907728, + "grad_norm": 0.0, + "learning_rate": 1.9858013254184597e-05, + "loss": 0.9056, + "step": 320 + }, + { + "epoch": 0.740484429065744, + "grad_norm": 0.0, + "learning_rate": 1.985661428529863e-05, + "loss": 1.01, + "step": 321 + }, + { + "epoch": 0.7427912341407151, + "grad_norm": 0.0, + "learning_rate": 1.9855208508031173e-05, + "loss": 1.3457, + "step": 322 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.0, + "learning_rate": 1.985379592335325e-05, + "loss": 0.9593, + "step": 323 + }, + { + "epoch": 0.7474048442906575, + "grad_norm": 0.0, + "learning_rate": 1.9852376532240594e-05, + "loss": 0.7881, + "step": 324 + }, + { + "epoch": 0.7497116493656286, + "grad_norm": 0.0, + "learning_rate": 1.985095033567364e-05, + "loss": 0.9659, + "step": 325 + }, + { + "epoch": 0.7520184544405998, + "grad_norm": 0.0, + "learning_rate": 1.9849517334637527e-05, + "loss": 0.7841, + "step": 326 + }, + { + "epoch": 0.754325259515571, + "grad_norm": 0.0, + "learning_rate": 1.9848077530122083e-05, + "loss": 1.3307, + "step": 327 + }, + { + "epoch": 0.7566320645905421, + "grad_norm": 0.0, + "learning_rate": 1.9846630923121845e-05, + "loss": 1.2489, + "step": 328 + }, + { + "epoch": 0.7589388696655133, + "grad_norm": 0.0, + "learning_rate": 1.984517751463604e-05, + "loss": 1.2523, + "step": 329 + }, + { + "epoch": 0.7612456747404844, + "grad_norm": 0.0, + "learning_rate": 1.984371730566861e-05, + "loss": 1.1361, + "step": 330 + }, + { + "epoch": 0.7635524798154556, + "grad_norm": 0.0, + "learning_rate": 1.984225029722818e-05, + "loss": 0.9844, + "step": 331 + }, + { + "epoch": 0.7658592848904268, + "grad_norm": 0.0, + "learning_rate": 1.9840776490328067e-05, + "loss": 1.0674, + "step": 332 + }, + { + "epoch": 0.7681660899653979, + "grad_norm": 0.0, + "learning_rate": 1.98392958859863e-05, + "loss": 0.9208, + "step": 333 + }, + { + "epoch": 0.7704728950403691, + "grad_norm": 0.0, + "learning_rate": 1.983780848522559e-05, + "loss": 1.2463, + "step": 334 + }, + { + "epoch": 0.7727797001153403, + "grad_norm": 0.0, + "learning_rate": 1.983631428907335e-05, + "loss": 0.805, + "step": 335 + }, + { + "epoch": 0.7750865051903114, + "grad_norm": 0.0, + "learning_rate": 1.9834813298561686e-05, + "loss": 1.2016, + "step": 336 + }, + { + "epoch": 0.7773933102652826, + "grad_norm": 0.0, + "learning_rate": 1.9833305514727396e-05, + "loss": 0.7911, + "step": 337 + }, + { + "epoch": 0.7797001153402537, + "grad_norm": 0.0, + "learning_rate": 1.983179093861197e-05, + "loss": 1.2782, + "step": 338 + }, + { + "epoch": 0.7820069204152249, + "grad_norm": 0.0, + "learning_rate": 1.9830269571261585e-05, + "loss": 1.3264, + "step": 339 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.0, + "learning_rate": 1.9828741413727118e-05, + "loss": 1.096, + "step": 340 + }, + { + "epoch": 0.7866205305651672, + "grad_norm": 0.0, + "learning_rate": 1.9827206467064133e-05, + "loss": 0.6352, + "step": 341 + }, + { + "epoch": 0.7889273356401384, + "grad_norm": 0.0, + "learning_rate": 1.9825664732332886e-05, + "loss": 1.0133, + "step": 342 + }, + { + "epoch": 0.7912341407151096, + "grad_norm": 0.0, + "learning_rate": 1.982411621059831e-05, + "loss": 1.1626, + "step": 343 + }, + { + "epoch": 0.7935409457900807, + "grad_norm": 0.0, + "learning_rate": 1.982256090293004e-05, + "loss": 1.4643, + "step": 344 + }, + { + "epoch": 0.7958477508650519, + "grad_norm": 0.0, + "learning_rate": 1.982099881040239e-05, + "loss": 0.9051, + "step": 345 + }, + { + "epoch": 0.798154555940023, + "grad_norm": 0.0, + "learning_rate": 1.9819429934094366e-05, + "loss": 1.2828, + "step": 346 + }, + { + "epoch": 0.8004613610149942, + "grad_norm": 0.0, + "learning_rate": 1.981785427508966e-05, + "loss": 1.1372, + "step": 347 + }, + { + "epoch": 0.8027681660899654, + "grad_norm": 0.0, + "learning_rate": 1.9816271834476642e-05, + "loss": 1.165, + "step": 348 + }, + { + "epoch": 0.8050749711649365, + "grad_norm": 0.0, + "learning_rate": 1.981468261334837e-05, + "loss": 1.2639, + "step": 349 + }, + { + "epoch": 0.8073817762399077, + "grad_norm": 0.0, + "learning_rate": 1.981308661280259e-05, + "loss": 0.8564, + "step": 350 + }, + { + "epoch": 0.8096885813148789, + "grad_norm": 0.0, + "learning_rate": 1.9811483833941726e-05, + "loss": 0.787, + "step": 351 + }, + { + "epoch": 0.81199538638985, + "grad_norm": 0.0, + "learning_rate": 1.9809874277872886e-05, + "loss": 1.2472, + "step": 352 + }, + { + "epoch": 0.8143021914648212, + "grad_norm": 0.0, + "learning_rate": 1.980825794570786e-05, + "loss": 1.0271, + "step": 353 + }, + { + "epoch": 0.8166089965397924, + "grad_norm": 0.0, + "learning_rate": 1.9806634838563113e-05, + "loss": 1.3831, + "step": 354 + }, + { + "epoch": 0.8189158016147635, + "grad_norm": 0.0, + "learning_rate": 1.9805004957559795e-05, + "loss": 0.9824, + "step": 355 + }, + { + "epoch": 0.8212226066897347, + "grad_norm": 0.0, + "learning_rate": 1.9803368303823735e-05, + "loss": 1.3427, + "step": 356 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.0, + "learning_rate": 1.9801724878485438e-05, + "loss": 0.9904, + "step": 357 + }, + { + "epoch": 0.825836216839677, + "grad_norm": 0.0, + "learning_rate": 1.980007468268009e-05, + "loss": 0.8121, + "step": 358 + }, + { + "epoch": 0.8281430219146482, + "grad_norm": 0.0, + "learning_rate": 1.9798417717547552e-05, + "loss": 1.3321, + "step": 359 + }, + { + "epoch": 0.8304498269896193, + "grad_norm": 0.0, + "learning_rate": 1.9796753984232357e-05, + "loss": 1.1041, + "step": 360 + }, + { + "epoch": 0.8327566320645905, + "grad_norm": 0.0, + "learning_rate": 1.9795083483883715e-05, + "loss": 0.4604, + "step": 361 + }, + { + "epoch": 0.8350634371395617, + "grad_norm": 0.0, + "learning_rate": 1.9793406217655516e-05, + "loss": 0.9785, + "step": 362 + }, + { + "epoch": 0.8373702422145328, + "grad_norm": 0.0, + "learning_rate": 1.9791722186706318e-05, + "loss": 0.9659, + "step": 363 + }, + { + "epoch": 0.839677047289504, + "grad_norm": 0.0, + "learning_rate": 1.9790031392199348e-05, + "loss": 0.7622, + "step": 364 + }, + { + "epoch": 0.8419838523644751, + "grad_norm": 0.0, + "learning_rate": 1.9788333835302512e-05, + "loss": 1.3065, + "step": 365 + }, + { + "epoch": 0.8442906574394463, + "grad_norm": 0.0, + "learning_rate": 1.9786629517188384e-05, + "loss": 0.9951, + "step": 366 + }, + { + "epoch": 0.8465974625144176, + "grad_norm": 0.0, + "learning_rate": 1.9784918439034216e-05, + "loss": 1.1547, + "step": 367 + }, + { + "epoch": 0.8489042675893888, + "grad_norm": 0.0, + "learning_rate": 1.9783200602021912e-05, + "loss": 0.9821, + "step": 368 + }, + { + "epoch": 0.8512110726643599, + "grad_norm": 0.0, + "learning_rate": 1.9781476007338058e-05, + "loss": 1.0311, + "step": 369 + }, + { + "epoch": 0.8535178777393311, + "grad_norm": 0.0, + "learning_rate": 1.9779744656173907e-05, + "loss": 1.4481, + "step": 370 + }, + { + "epoch": 0.8558246828143022, + "grad_norm": 0.0, + "learning_rate": 1.9778006549725375e-05, + "loss": 1.1409, + "step": 371 + }, + { + "epoch": 0.8581314878892734, + "grad_norm": 0.0, + "learning_rate": 1.977626168919305e-05, + "loss": 0.988, + "step": 372 + }, + { + "epoch": 0.8604382929642446, + "grad_norm": 0.0, + "learning_rate": 1.977451007578217e-05, + "loss": 0.792, + "step": 373 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.0, + "learning_rate": 1.9772751710702663e-05, + "loss": 1.0579, + "step": 374 + }, + { + "epoch": 0.8650519031141869, + "grad_norm": 0.0, + "learning_rate": 1.9770986595169096e-05, + "loss": 0.8852, + "step": 375 + }, + { + "epoch": 0.8673587081891581, + "grad_norm": 0.0, + "learning_rate": 1.976921473040071e-05, + "loss": 1.3128, + "step": 376 + }, + { + "epoch": 0.8696655132641292, + "grad_norm": 0.0, + "learning_rate": 1.9767436117621416e-05, + "loss": 1.6096, + "step": 377 + }, + { + "epoch": 0.8719723183391004, + "grad_norm": 0.0, + "learning_rate": 1.9765650758059766e-05, + "loss": 0.9685, + "step": 378 + }, + { + "epoch": 0.8742791234140715, + "grad_norm": 0.0, + "learning_rate": 1.9763858652948992e-05, + "loss": 1.2446, + "step": 379 + }, + { + "epoch": 0.8765859284890427, + "grad_norm": 0.0, + "learning_rate": 1.976205980352697e-05, + "loss": 0.9631, + "step": 380 + }, + { + "epoch": 0.8788927335640139, + "grad_norm": 0.0, + "learning_rate": 1.9760254211036245e-05, + "loss": 1.0765, + "step": 381 + }, + { + "epoch": 0.881199538638985, + "grad_norm": 0.0, + "learning_rate": 1.975844187672402e-05, + "loss": 1.3127, + "step": 382 + }, + { + "epoch": 0.8835063437139562, + "grad_norm": 0.0, + "learning_rate": 1.9756622801842144e-05, + "loss": 1.0501, + "step": 383 + }, + { + "epoch": 0.8858131487889274, + "grad_norm": 0.0, + "learning_rate": 1.9754796987647134e-05, + "loss": 0.9982, + "step": 384 + }, + { + "epoch": 0.8881199538638985, + "grad_norm": 0.0, + "learning_rate": 1.9752964435400156e-05, + "loss": 0.8997, + "step": 385 + }, + { + "epoch": 0.8904267589388697, + "grad_norm": 0.0, + "learning_rate": 1.9751125146367036e-05, + "loss": 1.0249, + "step": 386 + }, + { + "epoch": 0.8927335640138409, + "grad_norm": 0.0, + "learning_rate": 1.9749279121818235e-05, + "loss": 0.8368, + "step": 387 + }, + { + "epoch": 0.895040369088812, + "grad_norm": 0.0, + "learning_rate": 1.9747426363028897e-05, + "loss": 1.2199, + "step": 388 + }, + { + "epoch": 0.8973471741637832, + "grad_norm": 0.0, + "learning_rate": 1.9745566871278794e-05, + "loss": 1.4212, + "step": 389 + }, + { + "epoch": 0.8996539792387543, + "grad_norm": 0.0, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.9981, + "step": 390 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.0, + "learning_rate": 1.974182769403866e-05, + "loss": 0.5841, + "step": 391 + }, + { + "epoch": 0.9042675893886967, + "grad_norm": 0.0, + "learning_rate": 1.9739948011131438e-05, + "loss": 1.1561, + "step": 392 + }, + { + "epoch": 0.9065743944636678, + "grad_norm": 0.0, + "learning_rate": 1.9738061600429062e-05, + "loss": 1.0792, + "step": 393 + }, + { + "epoch": 0.908881199538639, + "grad_norm": 0.0, + "learning_rate": 1.9736168463234565e-05, + "loss": 0.7952, + "step": 394 + }, + { + "epoch": 0.9111880046136102, + "grad_norm": 0.0, + "learning_rate": 1.9734268600855612e-05, + "loss": 0.9505, + "step": 395 + }, + { + "epoch": 0.9134948096885813, + "grad_norm": 0.0, + "learning_rate": 1.9732362014604515e-05, + "loss": 0.819, + "step": 396 + }, + { + "epoch": 0.9158016147635525, + "grad_norm": 0.0, + "learning_rate": 1.973044870579824e-05, + "loss": 0.7531, + "step": 397 + }, + { + "epoch": 0.9181084198385236, + "grad_norm": 0.0, + "learning_rate": 1.972852867575839e-05, + "loss": 1.0519, + "step": 398 + }, + { + "epoch": 0.9204152249134948, + "grad_norm": 0.0, + "learning_rate": 1.9726601925811204e-05, + "loss": 1.2648, + "step": 399 + }, + { + "epoch": 0.922722029988466, + "grad_norm": 0.0, + "learning_rate": 1.9724668457287576e-05, + "loss": 0.9938, + "step": 400 + }, + { + "epoch": 0.9250288350634371, + "grad_norm": 0.0, + "learning_rate": 1.9722728271523035e-05, + "loss": 1.5296, + "step": 401 + }, + { + "epoch": 0.9273356401384083, + "grad_norm": 0.0, + "learning_rate": 1.9720781369857747e-05, + "loss": 0.925, + "step": 402 + }, + { + "epoch": 0.9296424452133795, + "grad_norm": 0.0, + "learning_rate": 1.9718827753636522e-05, + "loss": 1.3531, + "step": 403 + }, + { + "epoch": 0.9319492502883506, + "grad_norm": 0.0, + "learning_rate": 1.9716867424208805e-05, + "loss": 0.9067, + "step": 404 + }, + { + "epoch": 0.9342560553633218, + "grad_norm": 0.0, + "learning_rate": 1.9714900382928674e-05, + "loss": 1.0162, + "step": 405 + }, + { + "epoch": 0.936562860438293, + "grad_norm": 0.0, + "learning_rate": 1.9712926631154857e-05, + "loss": 0.9944, + "step": 406 + }, + { + "epoch": 0.9388696655132641, + "grad_norm": 0.0, + "learning_rate": 1.9710946170250702e-05, + "loss": 1.1977, + "step": 407 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.0, + "learning_rate": 1.9708959001584197e-05, + "loss": 0.9416, + "step": 408 + }, + { + "epoch": 0.9434832756632064, + "grad_norm": 0.0, + "learning_rate": 1.9706965126527963e-05, + "loss": 0.7721, + "step": 409 + }, + { + "epoch": 0.9457900807381776, + "grad_norm": 0.0, + "learning_rate": 1.9704964546459257e-05, + "loss": 1.0792, + "step": 410 + }, + { + "epoch": 0.9480968858131488, + "grad_norm": 0.0, + "learning_rate": 1.9702957262759964e-05, + "loss": 1.0404, + "step": 411 + }, + { + "epoch": 0.9504036908881199, + "grad_norm": 0.0, + "learning_rate": 1.9700943276816602e-05, + "loss": 1.043, + "step": 412 + }, + { + "epoch": 0.9527104959630911, + "grad_norm": 0.0, + "learning_rate": 1.9698922590020314e-05, + "loss": 1.162, + "step": 413 + }, + { + "epoch": 0.9550173010380623, + "grad_norm": 0.0, + "learning_rate": 1.969689520376687e-05, + "loss": 1.1567, + "step": 414 + }, + { + "epoch": 0.9573241061130334, + "grad_norm": 0.0, + "learning_rate": 1.9694861119456677e-05, + "loss": 0.952, + "step": 415 + }, + { + "epoch": 0.9596309111880046, + "grad_norm": 0.0, + "learning_rate": 1.9692820338494766e-05, + "loss": 1.2744, + "step": 416 + }, + { + "epoch": 0.9619377162629758, + "grad_norm": 0.0, + "learning_rate": 1.969077286229078e-05, + "loss": 1.0971, + "step": 417 + }, + { + "epoch": 0.9642445213379469, + "grad_norm": 0.0, + "learning_rate": 1.9688718692259007e-05, + "loss": 0.9815, + "step": 418 + }, + { + "epoch": 0.9665513264129181, + "grad_norm": 0.0, + "learning_rate": 1.9686657829818353e-05, + "loss": 1.19, + "step": 419 + }, + { + "epoch": 0.9688581314878892, + "grad_norm": 0.0, + "learning_rate": 1.968459027639233e-05, + "loss": 0.9618, + "step": 420 + }, + { + "epoch": 0.9711649365628604, + "grad_norm": 0.0, + "learning_rate": 1.9682516033409094e-05, + "loss": 1.2316, + "step": 421 + }, + { + "epoch": 0.9734717416378316, + "grad_norm": 0.0, + "learning_rate": 1.9680435102301412e-05, + "loss": 1.2232, + "step": 422 + }, + { + "epoch": 0.9757785467128027, + "grad_norm": 0.0, + "learning_rate": 1.9678347484506667e-05, + "loss": 0.9157, + "step": 423 + }, + { + "epoch": 0.9780853517877739, + "grad_norm": 0.0, + "learning_rate": 1.967625318146687e-05, + "loss": 1.1693, + "step": 424 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.0, + "learning_rate": 1.967415219462864e-05, + "loss": 1.3266, + "step": 425 + }, + { + "epoch": 0.9826989619377162, + "grad_norm": 0.0, + "learning_rate": 1.9672044525443222e-05, + "loss": 0.9772, + "step": 426 + }, + { + "epoch": 0.9850057670126874, + "grad_norm": 0.0, + "learning_rate": 1.9669930175366474e-05, + "loss": 1.2391, + "step": 427 + }, + { + "epoch": 0.9873125720876585, + "grad_norm": 0.0, + "learning_rate": 1.9667809145858863e-05, + "loss": 1.3138, + "step": 428 + }, + { + "epoch": 0.9896193771626297, + "grad_norm": 0.0, + "learning_rate": 1.9665681438385475e-05, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.9919261822376009, + "grad_norm": 0.0, + "learning_rate": 1.9663547054416006e-05, + "loss": 1.0686, + "step": 430 + }, + { + "epoch": 0.994232987312572, + "grad_norm": 0.0, + "learning_rate": 1.966140599542477e-05, + "loss": 1.3218, + "step": 431 + }, + { + "epoch": 0.9965397923875432, + "grad_norm": 0.0, + "learning_rate": 1.9659258262890683e-05, + "loss": 1.0326, + "step": 432 + }, + { + "epoch": 0.9988465974625144, + "grad_norm": 0.0, + "learning_rate": 1.965710385829728e-05, + "loss": 1.0026, + "step": 433 + }, + { + "epoch": 1.0011534025374855, + "grad_norm": 0.0, + "learning_rate": 1.9654942783132696e-05, + "loss": 1.0721, + "step": 434 + }, + { + "epoch": 1.0034602076124568, + "grad_norm": 0.0, + "learning_rate": 1.9652775038889676e-05, + "loss": 0.8152, + "step": 435 + }, + { + "epoch": 1.0057670126874279, + "grad_norm": 0.0, + "learning_rate": 1.9650600627065573e-05, + "loss": 1.1702, + "step": 436 + }, + { + "epoch": 1.0080738177623991, + "grad_norm": 0.0, + "learning_rate": 1.964841954916235e-05, + "loss": 0.9148, + "step": 437 + }, + { + "epoch": 1.0103806228373702, + "grad_norm": 0.0, + "learning_rate": 1.9646231806686566e-05, + "loss": 0.8995, + "step": 438 + }, + { + "epoch": 1.0126874279123415, + "grad_norm": 0.0, + "learning_rate": 1.964403740114939e-05, + "loss": 1.0761, + "step": 439 + }, + { + "epoch": 1.0149942329873125, + "grad_norm": 0.0, + "learning_rate": 1.964183633406659e-05, + "loss": 0.9926, + "step": 440 + }, + { + "epoch": 1.0173010380622838, + "grad_norm": 0.0, + "learning_rate": 1.9639628606958535e-05, + "loss": 0.653, + "step": 441 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 0.0, + "learning_rate": 1.9637414221350198e-05, + "loss": 0.7684, + "step": 442 + }, + { + "epoch": 1.021914648212226, + "grad_norm": 0.0, + "learning_rate": 1.9635193178771144e-05, + "loss": 0.5708, + "step": 443 + }, + { + "epoch": 1.0242214532871972, + "grad_norm": 0.0, + "learning_rate": 1.963296548075555e-05, + "loss": 0.5926, + "step": 444 + }, + { + "epoch": 1.0265282583621684, + "grad_norm": 0.0, + "learning_rate": 1.963073112884217e-05, + "loss": 0.594, + "step": 445 + }, + { + "epoch": 1.0288350634371395, + "grad_norm": 0.0, + "learning_rate": 1.962849012457438e-05, + "loss": 0.5756, + "step": 446 + }, + { + "epoch": 1.0311418685121108, + "grad_norm": 0.0, + "learning_rate": 1.962624246950012e-05, + "loss": 0.5881, + "step": 447 + }, + { + "epoch": 1.0334486735870818, + "grad_norm": 0.0, + "learning_rate": 1.9623988165171958e-05, + "loss": 0.5506, + "step": 448 + }, + { + "epoch": 1.035755478662053, + "grad_norm": 0.0, + "learning_rate": 1.9621727213147025e-05, + "loss": 0.8203, + "step": 449 + }, + { + "epoch": 1.0380622837370241, + "grad_norm": 0.0, + "learning_rate": 1.961945961498706e-05, + "loss": 0.8565, + "step": 450 + }, + { + "epoch": 1.0403690888119954, + "grad_norm": 0.0, + "learning_rate": 1.961718537225839e-05, + "loss": 0.9169, + "step": 451 + }, + { + "epoch": 1.0426758938869665, + "grad_norm": 0.0, + "learning_rate": 1.9614904486531935e-05, + "loss": 0.8638, + "step": 452 + }, + { + "epoch": 1.0449826989619377, + "grad_norm": 0.0, + "learning_rate": 1.961261695938319e-05, + "loss": 0.8872, + "step": 453 + }, + { + "epoch": 1.0472895040369088, + "grad_norm": 0.0, + "learning_rate": 1.9610322792392256e-05, + "loss": 0.7931, + "step": 454 + }, + { + "epoch": 1.04959630911188, + "grad_norm": 0.0, + "learning_rate": 1.9608021987143805e-05, + "loss": 1.0525, + "step": 455 + }, + { + "epoch": 1.0519031141868511, + "grad_norm": 0.0, + "learning_rate": 1.9605714545227105e-05, + "loss": 0.6539, + "step": 456 + }, + { + "epoch": 1.0542099192618224, + "grad_norm": 0.0, + "learning_rate": 1.9603400468236e-05, + "loss": 0.6141, + "step": 457 + }, + { + "epoch": 1.0565167243367934, + "grad_norm": 0.0, + "learning_rate": 1.9601079757768926e-05, + "loss": 0.5129, + "step": 458 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.0, + "learning_rate": 1.9598752415428893e-05, + "loss": 0.9438, + "step": 459 + }, + { + "epoch": 1.0611303344867358, + "grad_norm": 0.0, + "learning_rate": 1.9596418442823495e-05, + "loss": 0.7448, + "step": 460 + }, + { + "epoch": 1.063437139561707, + "grad_norm": 0.0, + "learning_rate": 1.9594077841564905e-05, + "loss": 1.0895, + "step": 461 + }, + { + "epoch": 1.065743944636678, + "grad_norm": 0.0, + "learning_rate": 1.9591730613269878e-05, + "loss": 0.7897, + "step": 462 + }, + { + "epoch": 1.0680507497116494, + "grad_norm": 0.0, + "learning_rate": 1.9589376759559747e-05, + "loss": 0.9522, + "step": 463 + }, + { + "epoch": 1.0703575547866206, + "grad_norm": 0.0, + "learning_rate": 1.958701628206041e-05, + "loss": 0.9497, + "step": 464 + }, + { + "epoch": 1.0726643598615917, + "grad_norm": 0.0, + "learning_rate": 1.9584649182402358e-05, + "loss": 0.6935, + "step": 465 + }, + { + "epoch": 1.0749711649365628, + "grad_norm": 0.0, + "learning_rate": 1.958227546222064e-05, + "loss": 0.8873, + "step": 466 + }, + { + "epoch": 1.077277970011534, + "grad_norm": 0.0, + "learning_rate": 1.957989512315489e-05, + "loss": 0.7781, + "step": 467 + }, + { + "epoch": 1.0795847750865053, + "grad_norm": 0.0, + "learning_rate": 1.9577508166849308e-05, + "loss": 0.7, + "step": 468 + }, + { + "epoch": 1.0818915801614764, + "grad_norm": 0.0, + "learning_rate": 1.9575114594952662e-05, + "loss": 0.7893, + "step": 469 + }, + { + "epoch": 1.0841983852364474, + "grad_norm": 0.0, + "learning_rate": 1.9572714409118297e-05, + "loss": 0.9558, + "step": 470 + }, + { + "epoch": 1.0865051903114187, + "grad_norm": 0.0, + "learning_rate": 1.9570307611004124e-05, + "loss": 0.5241, + "step": 471 + }, + { + "epoch": 1.08881199538639, + "grad_norm": 0.0, + "learning_rate": 1.9567894202272623e-05, + "loss": 0.7681, + "step": 472 + }, + { + "epoch": 1.091118800461361, + "grad_norm": 0.0, + "learning_rate": 1.9565474184590827e-05, + "loss": 0.7164, + "step": 473 + }, + { + "epoch": 1.0934256055363323, + "grad_norm": 0.0, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.7642, + "step": 474 + }, + { + "epoch": 1.0957324106113033, + "grad_norm": 0.0, + "learning_rate": 1.956061432906738e-05, + "loss": 0.8846, + "step": 475 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 0.0, + "learning_rate": 1.955817449458263e-05, + "loss": 1.1322, + "step": 476 + }, + { + "epoch": 1.1003460207612457, + "grad_norm": 0.0, + "learning_rate": 1.955572805786141e-05, + "loss": 0.8562, + "step": 477 + }, + { + "epoch": 1.102652825836217, + "grad_norm": 0.0, + "learning_rate": 1.9553275020593573e-05, + "loss": 0.8604, + "step": 478 + }, + { + "epoch": 1.104959630911188, + "grad_norm": 0.0, + "learning_rate": 1.9550815384473534e-05, + "loss": 0.8703, + "step": 479 + }, + { + "epoch": 1.1072664359861593, + "grad_norm": 0.0, + "learning_rate": 1.9548349151200275e-05, + "loss": 0.9451, + "step": 480 + }, + { + "epoch": 1.1095732410611303, + "grad_norm": 0.0, + "learning_rate": 1.954587632247732e-05, + "loss": 0.7471, + "step": 481 + }, + { + "epoch": 1.1118800461361016, + "grad_norm": 0.0, + "learning_rate": 1.9543396900012763e-05, + "loss": 0.6657, + "step": 482 + }, + { + "epoch": 1.1141868512110726, + "grad_norm": 0.0, + "learning_rate": 1.9540910885519243e-05, + "loss": 0.8245, + "step": 483 + }, + { + "epoch": 1.116493656286044, + "grad_norm": 0.0, + "learning_rate": 1.9538418280713958e-05, + "loss": 0.8052, + "step": 484 + }, + { + "epoch": 1.118800461361015, + "grad_norm": 0.0, + "learning_rate": 1.9535919087318654e-05, + "loss": 0.7168, + "step": 485 + }, + { + "epoch": 1.1211072664359862, + "grad_norm": 0.0, + "learning_rate": 1.953341330705963e-05, + "loss": 0.8016, + "step": 486 + }, + { + "epoch": 1.1234140715109573, + "grad_norm": 0.0, + "learning_rate": 1.9530900941667733e-05, + "loss": 0.8135, + "step": 487 + }, + { + "epoch": 1.1257208765859286, + "grad_norm": 0.0, + "learning_rate": 1.9528381992878362e-05, + "loss": 0.6256, + "step": 488 + }, + { + "epoch": 1.1280276816608996, + "grad_norm": 0.0, + "learning_rate": 1.9525856462431463e-05, + "loss": 0.6397, + "step": 489 + }, + { + "epoch": 1.130334486735871, + "grad_norm": 0.0, + "learning_rate": 1.9523324352071527e-05, + "loss": 0.7365, + "step": 490 + }, + { + "epoch": 1.132641291810842, + "grad_norm": 0.0, + "learning_rate": 1.9520785663547586e-05, + "loss": 1.0175, + "step": 491 + }, + { + "epoch": 1.1349480968858132, + "grad_norm": 0.0, + "learning_rate": 1.9518240398613226e-05, + "loss": 1.0185, + "step": 492 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 0.0, + "learning_rate": 1.9515688559026564e-05, + "loss": 0.8635, + "step": 493 + }, + { + "epoch": 1.1395617070357555, + "grad_norm": 0.0, + "learning_rate": 1.9513130146550266e-05, + "loss": 0.592, + "step": 494 + }, + { + "epoch": 1.1418685121107266, + "grad_norm": 0.0, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.855, + "step": 495 + }, + { + "epoch": 1.1441753171856979, + "grad_norm": 0.0, + "learning_rate": 1.9507993610002118e-05, + "loss": 0.903, + "step": 496 + }, + { + "epoch": 1.146482122260669, + "grad_norm": 0.0, + "learning_rate": 1.9505415489478293e-05, + "loss": 0.854, + "step": 497 + }, + { + "epoch": 1.1487889273356402, + "grad_norm": 0.0, + "learning_rate": 1.9502830803160873e-05, + "loss": 0.6826, + "step": 498 + }, + { + "epoch": 1.1510957324106112, + "grad_norm": 0.0, + "learning_rate": 1.9500239552835216e-05, + "loss": 0.826, + "step": 499 + }, + { + "epoch": 1.1534025374855825, + "grad_norm": 0.0, + "learning_rate": 1.9497641740291207e-05, + "loss": 0.8879, + "step": 500 + }, + { + "epoch": 1.1557093425605536, + "grad_norm": 0.0, + "learning_rate": 1.9495037367323264e-05, + "loss": 0.9472, + "step": 501 + }, + { + "epoch": 1.1580161476355249, + "grad_norm": 0.0, + "learning_rate": 1.949242643573034e-05, + "loss": 0.7107, + "step": 502 + }, + { + "epoch": 1.160322952710496, + "grad_norm": 0.0, + "learning_rate": 1.9489808947315915e-05, + "loss": 1.0408, + "step": 503 + }, + { + "epoch": 1.1626297577854672, + "grad_norm": 0.0, + "learning_rate": 1.9487184903888e-05, + "loss": 0.8161, + "step": 504 + }, + { + "epoch": 1.1649365628604382, + "grad_norm": 0.0, + "learning_rate": 1.948455430725913e-05, + "loss": 0.7252, + "step": 505 + }, + { + "epoch": 1.1672433679354095, + "grad_norm": 0.0, + "learning_rate": 1.9481917159246375e-05, + "loss": 1.1161, + "step": 506 + }, + { + "epoch": 1.1695501730103806, + "grad_norm": 0.0, + "learning_rate": 1.947927346167132e-05, + "loss": 0.773, + "step": 507 + }, + { + "epoch": 1.1718569780853518, + "grad_norm": 0.0, + "learning_rate": 1.947662321636008e-05, + "loss": 1.0159, + "step": 508 + }, + { + "epoch": 1.1741637831603229, + "grad_norm": 0.0, + "learning_rate": 1.9473966425143292e-05, + "loss": 0.684, + "step": 509 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.0, + "learning_rate": 1.947130308985612e-05, + "loss": 0.6892, + "step": 510 + }, + { + "epoch": 1.1787773933102652, + "grad_norm": 0.0, + "learning_rate": 1.9468633212338236e-05, + "loss": 0.8094, + "step": 511 + }, + { + "epoch": 1.1810841983852365, + "grad_norm": 0.0, + "learning_rate": 1.9465956794433837e-05, + "loss": 0.9673, + "step": 512 + }, + { + "epoch": 1.1833910034602075, + "grad_norm": 0.0, + "learning_rate": 1.9463273837991643e-05, + "loss": 0.9528, + "step": 513 + }, + { + "epoch": 1.1856978085351788, + "grad_norm": 0.0, + "learning_rate": 1.9460584344864885e-05, + "loss": 0.6999, + "step": 514 + }, + { + "epoch": 1.1880046136101499, + "grad_norm": 0.0, + "learning_rate": 1.9457888316911305e-05, + "loss": 0.8349, + "step": 515 + }, + { + "epoch": 1.1903114186851211, + "grad_norm": 0.0, + "learning_rate": 1.945518575599317e-05, + "loss": 0.9279, + "step": 516 + }, + { + "epoch": 1.1926182237600922, + "grad_norm": 0.0, + "learning_rate": 1.945247666397725e-05, + "loss": 0.6596, + "step": 517 + }, + { + "epoch": 1.1949250288350635, + "grad_norm": 0.0, + "learning_rate": 1.944976104273483e-05, + "loss": 0.6871, + "step": 518 + }, + { + "epoch": 1.1972318339100345, + "grad_norm": 0.0, + "learning_rate": 1.9447038894141707e-05, + "loss": 0.9282, + "step": 519 + }, + { + "epoch": 1.1995386389850058, + "grad_norm": 0.0, + "learning_rate": 1.944431022007818e-05, + "loss": 0.8709, + "step": 520 + }, + { + "epoch": 1.2018454440599768, + "grad_norm": 0.0, + "learning_rate": 1.9441575022429065e-05, + "loss": 0.4996, + "step": 521 + }, + { + "epoch": 1.2041522491349481, + "grad_norm": 0.0, + "learning_rate": 1.9438833303083677e-05, + "loss": 0.8457, + "step": 522 + }, + { + "epoch": 1.2064590542099192, + "grad_norm": 0.0, + "learning_rate": 1.9436085063935837e-05, + "loss": 0.7837, + "step": 523 + }, + { + "epoch": 1.2087658592848904, + "grad_norm": 0.0, + "learning_rate": 1.943333030688387e-05, + "loss": 0.7091, + "step": 524 + }, + { + "epoch": 1.2110726643598615, + "grad_norm": 0.0, + "learning_rate": 1.9430569033830606e-05, + "loss": 0.7262, + "step": 525 + }, + { + "epoch": 1.2133794694348328, + "grad_norm": 0.0, + "learning_rate": 1.942780124668337e-05, + "loss": 0.88, + "step": 526 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 0.0, + "learning_rate": 1.9425026947353994e-05, + "loss": 0.6826, + "step": 527 + }, + { + "epoch": 1.217993079584775, + "grad_norm": 0.0, + "learning_rate": 1.94222461377588e-05, + "loss": 1.0498, + "step": 528 + }, + { + "epoch": 1.2202998846597461, + "grad_norm": 0.0, + "learning_rate": 1.9419458819818617e-05, + "loss": 0.7679, + "step": 529 + }, + { + "epoch": 1.2226066897347174, + "grad_norm": 0.0, + "learning_rate": 1.9416664995458756e-05, + "loss": 0.8326, + "step": 530 + }, + { + "epoch": 1.2249134948096887, + "grad_norm": 0.0, + "learning_rate": 1.9413864666609036e-05, + "loss": 0.902, + "step": 531 + }, + { + "epoch": 1.2272202998846597, + "grad_norm": 0.0, + "learning_rate": 1.9411057835203756e-05, + "loss": 0.7393, + "step": 532 + }, + { + "epoch": 1.2295271049596308, + "grad_norm": 0.0, + "learning_rate": 1.9408244503181723e-05, + "loss": 0.7971, + "step": 533 + }, + { + "epoch": 1.231833910034602, + "grad_norm": 0.0, + "learning_rate": 1.9405424672486218e-05, + "loss": 0.651, + "step": 534 + }, + { + "epoch": 1.2341407151095733, + "grad_norm": 0.0, + "learning_rate": 1.940259834506502e-05, + "loss": 0.799, + "step": 535 + }, + { + "epoch": 1.2364475201845444, + "grad_norm": 0.0, + "learning_rate": 1.939976552287039e-05, + "loss": 0.9757, + "step": 536 + }, + { + "epoch": 1.2387543252595155, + "grad_norm": 0.0, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.9407, + "step": 537 + }, + { + "epoch": 1.2410611303344867, + "grad_norm": 0.0, + "learning_rate": 1.9394080401992336e-05, + "loss": 0.7027, + "step": 538 + }, + { + "epoch": 1.243367935409458, + "grad_norm": 0.0, + "learning_rate": 1.939122810723586e-05, + "loss": 0.7995, + "step": 539 + }, + { + "epoch": 1.245674740484429, + "grad_norm": 0.0, + "learning_rate": 1.9388369325559862e-05, + "loss": 0.8117, + "step": 540 + }, + { + "epoch": 1.2479815455594, + "grad_norm": 0.0, + "learning_rate": 1.9385504058939023e-05, + "loss": 0.6787, + "step": 541 + }, + { + "epoch": 1.2502883506343714, + "grad_norm": 0.0, + "learning_rate": 1.9382632309352503e-05, + "loss": 0.8654, + "step": 542 + }, + { + "epoch": 1.2525951557093427, + "grad_norm": 0.0, + "learning_rate": 1.937975407878394e-05, + "loss": 0.8878, + "step": 543 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.0, + "learning_rate": 1.937686936922145e-05, + "loss": 0.7783, + "step": 544 + }, + { + "epoch": 1.2572087658592848, + "grad_norm": 0.0, + "learning_rate": 1.9373978182657628e-05, + "loss": 1.1215, + "step": 545 + }, + { + "epoch": 1.259515570934256, + "grad_norm": 0.0, + "learning_rate": 1.9371080521089536e-05, + "loss": 0.8044, + "step": 546 + }, + { + "epoch": 1.2618223760092273, + "grad_norm": 0.0, + "learning_rate": 1.936817638651871e-05, + "loss": 0.7656, + "step": 547 + }, + { + "epoch": 1.2641291810841984, + "grad_norm": 0.0, + "learning_rate": 1.9365265780951167e-05, + "loss": 0.8099, + "step": 548 + }, + { + "epoch": 1.2664359861591694, + "grad_norm": 0.0, + "learning_rate": 1.9362348706397374e-05, + "loss": 0.9516, + "step": 549 + }, + { + "epoch": 1.2687427912341407, + "grad_norm": 0.0, + "learning_rate": 1.9359425164872285e-05, + "loss": 0.7079, + "step": 550 + }, + { + "epoch": 1.271049596309112, + "grad_norm": 0.0, + "learning_rate": 1.9356495158395317e-05, + "loss": 0.6025, + "step": 551 + }, + { + "epoch": 1.273356401384083, + "grad_norm": 0.0, + "learning_rate": 1.935355868899034e-05, + "loss": 0.8121, + "step": 552 + }, + { + "epoch": 1.2756632064590543, + "grad_norm": 0.0, + "learning_rate": 1.935061575868571e-05, + "loss": 0.5483, + "step": 553 + }, + { + "epoch": 1.2779700115340253, + "grad_norm": 0.0, + "learning_rate": 1.9347666369514225e-05, + "loss": 0.8122, + "step": 554 + }, + { + "epoch": 1.2802768166089966, + "grad_norm": 0.0, + "learning_rate": 1.9344710523513157e-05, + "loss": 0.8315, + "step": 555 + }, + { + "epoch": 1.2825836216839677, + "grad_norm": 0.0, + "learning_rate": 1.9341748222724233e-05, + "loss": 0.818, + "step": 556 + }, + { + "epoch": 1.284890426758939, + "grad_norm": 0.0, + "learning_rate": 1.9338779469193638e-05, + "loss": 0.802, + "step": 557 + }, + { + "epoch": 1.28719723183391, + "grad_norm": 0.0, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.527, + "step": 558 + }, + { + "epoch": 1.2895040369088813, + "grad_norm": 0.0, + "learning_rate": 1.9332822612114474e-05, + "loss": 0.8959, + "step": 559 + }, + { + "epoch": 1.2918108419838523, + "grad_norm": 0.0, + "learning_rate": 1.9329834512680558e-05, + "loss": 1.0901, + "step": 560 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.0, + "learning_rate": 1.9326839968734278e-05, + "loss": 1.0168, + "step": 561 + }, + { + "epoch": 1.2964244521337946, + "grad_norm": 0.0, + "learning_rate": 1.9323838982344092e-05, + "loss": 0.6239, + "step": 562 + }, + { + "epoch": 1.298731257208766, + "grad_norm": 0.0, + "learning_rate": 1.9320831555582908e-05, + "loss": 0.7308, + "step": 563 + }, + { + "epoch": 1.301038062283737, + "grad_norm": 0.0, + "learning_rate": 1.9317817690528086e-05, + "loss": 0.8554, + "step": 564 + }, + { + "epoch": 1.3033448673587082, + "grad_norm": 0.0, + "learning_rate": 1.9314797389261426e-05, + "loss": 1.0442, + "step": 565 + }, + { + "epoch": 1.3056516724336793, + "grad_norm": 0.0, + "learning_rate": 1.931177065386918e-05, + "loss": 1.1273, + "step": 566 + }, + { + "epoch": 1.3079584775086506, + "grad_norm": 0.0, + "learning_rate": 1.9308737486442045e-05, + "loss": 0.5567, + "step": 567 + }, + { + "epoch": 1.3102652825836216, + "grad_norm": 0.0, + "learning_rate": 1.9305697889075155e-05, + "loss": 0.6401, + "step": 568 + }, + { + "epoch": 1.312572087658593, + "grad_norm": 0.0, + "learning_rate": 1.9302651863868093e-05, + "loss": 0.667, + "step": 569 + }, + { + "epoch": 1.314878892733564, + "grad_norm": 0.0, + "learning_rate": 1.929959941292487e-05, + "loss": 0.8023, + "step": 570 + }, + { + "epoch": 1.3171856978085352, + "grad_norm": 0.0, + "learning_rate": 1.9296540538353948e-05, + "loss": 0.9496, + "step": 571 + }, + { + "epoch": 1.3194925028835063, + "grad_norm": 0.0, + "learning_rate": 1.9293475242268224e-05, + "loss": 0.5982, + "step": 572 + }, + { + "epoch": 1.3217993079584776, + "grad_norm": 0.0, + "learning_rate": 1.9290403526785025e-05, + "loss": 0.8044, + "step": 573 + }, + { + "epoch": 1.3241061130334486, + "grad_norm": 0.0, + "learning_rate": 1.928732539402612e-05, + "loss": 0.5639, + "step": 574 + }, + { + "epoch": 1.3264129181084199, + "grad_norm": 0.0, + "learning_rate": 1.9284240846117698e-05, + "loss": 0.9358, + "step": 575 + }, + { + "epoch": 1.328719723183391, + "grad_norm": 0.0, + "learning_rate": 1.928114988519039e-05, + "loss": 0.5317, + "step": 576 + }, + { + "epoch": 1.3310265282583622, + "grad_norm": 0.0, + "learning_rate": 1.9278052513379256e-05, + "loss": 0.629, + "step": 577 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.0, + "learning_rate": 1.927494873282378e-05, + "loss": 1.0959, + "step": 578 + }, + { + "epoch": 1.3356401384083045, + "grad_norm": 0.0, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.5953, + "step": 579 + }, + { + "epoch": 1.3379469434832756, + "grad_norm": 0.0, + "learning_rate": 1.9268721954059878e-05, + "loss": 0.9326, + "step": 580 + }, + { + "epoch": 1.3402537485582469, + "grad_norm": 0.0, + "learning_rate": 1.9265598960152556e-05, + "loss": 0.6775, + "step": 581 + }, + { + "epoch": 1.342560553633218, + "grad_norm": 0.0, + "learning_rate": 1.926246956610309e-05, + "loss": 0.6612, + "step": 582 + }, + { + "epoch": 1.3448673587081892, + "grad_norm": 0.0, + "learning_rate": 1.9259333774073084e-05, + "loss": 0.7408, + "step": 583 + }, + { + "epoch": 1.3471741637831602, + "grad_norm": 0.0, + "learning_rate": 1.925619158622856e-05, + "loss": 0.7277, + "step": 584 + }, + { + "epoch": 1.3494809688581315, + "grad_norm": 0.0, + "learning_rate": 1.9253043004739967e-05, + "loss": 0.8082, + "step": 585 + }, + { + "epoch": 1.3517877739331028, + "grad_norm": 0.0, + "learning_rate": 1.924988803178216e-05, + "loss": 0.9178, + "step": 586 + }, + { + "epoch": 1.3540945790080738, + "grad_norm": 0.0, + "learning_rate": 1.9246726669534416e-05, + "loss": 0.677, + "step": 587 + }, + { + "epoch": 1.356401384083045, + "grad_norm": 0.0, + "learning_rate": 1.9243558920180417e-05, + "loss": 0.7831, + "step": 588 + }, + { + "epoch": 1.3587081891580162, + "grad_norm": 0.0, + "learning_rate": 1.9240384785908267e-05, + "loss": 0.9029, + "step": 589 + }, + { + "epoch": 1.3610149942329874, + "grad_norm": 0.0, + "learning_rate": 1.923720426891047e-05, + "loss": 0.7002, + "step": 590 + }, + { + "epoch": 1.3633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.9234017371383946e-05, + "loss": 0.7205, + "step": 591 + }, + { + "epoch": 1.3656286043829295, + "grad_norm": 0.0, + "learning_rate": 1.923082409553002e-05, + "loss": 0.713, + "step": 592 + }, + { + "epoch": 1.3679354094579008, + "grad_norm": 0.0, + "learning_rate": 1.9227624443554425e-05, + "loss": 0.712, + "step": 593 + }, + { + "epoch": 1.370242214532872, + "grad_norm": 0.0, + "learning_rate": 1.9224418417667295e-05, + "loss": 1.0618, + "step": 594 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.0, + "learning_rate": 1.9221206020083166e-05, + "loss": 1.0404, + "step": 595 + }, + { + "epoch": 1.3748558246828142, + "grad_norm": 0.0, + "learning_rate": 1.9217987253020982e-05, + "loss": 0.8067, + "step": 596 + }, + { + "epoch": 1.3771626297577855, + "grad_norm": 0.0, + "learning_rate": 1.921476211870408e-05, + "loss": 0.7118, + "step": 597 + }, + { + "epoch": 1.3794694348327567, + "grad_norm": 0.0, + "learning_rate": 1.9211530619360194e-05, + "loss": 0.6202, + "step": 598 + }, + { + "epoch": 1.3817762399077278, + "grad_norm": 0.0, + "learning_rate": 1.920829275722146e-05, + "loss": 1.0615, + "step": 599 + }, + { + "epoch": 1.3840830449826989, + "grad_norm": 0.0, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.7915, + "step": 600 + }, + { + "epoch": 1.3863898500576701, + "grad_norm": 0.0, + "learning_rate": 1.9201797953509954e-05, + "loss": 0.9722, + "step": 601 + }, + { + "epoch": 1.3886966551326414, + "grad_norm": 0.0, + "learning_rate": 1.919854101642342e-05, + "loss": 0.8785, + "step": 602 + }, + { + "epoch": 1.3910034602076125, + "grad_norm": 0.0, + "learning_rate": 1.919527772551451e-05, + "loss": 0.7116, + "step": 603 + }, + { + "epoch": 1.3933102652825835, + "grad_norm": 0.0, + "learning_rate": 1.9192008083037314e-05, + "loss": 0.7417, + "step": 604 + }, + { + "epoch": 1.3956170703575548, + "grad_norm": 0.0, + "learning_rate": 1.918873209125031e-05, + "loss": 0.865, + "step": 605 + }, + { + "epoch": 1.397923875432526, + "grad_norm": 0.0, + "learning_rate": 1.9185449752416367e-05, + "loss": 0.7466, + "step": 606 + }, + { + "epoch": 1.400230680507497, + "grad_norm": 0.0, + "learning_rate": 1.9182161068802742e-05, + "loss": 0.9286, + "step": 607 + }, + { + "epoch": 1.4025374855824682, + "grad_norm": 0.0, + "learning_rate": 1.9178866042681062e-05, + "loss": 1.0603, + "step": 608 + }, + { + "epoch": 1.4048442906574394, + "grad_norm": 0.0, + "learning_rate": 1.917556467632734e-05, + "loss": 1.0477, + "step": 609 + }, + { + "epoch": 1.4071510957324107, + "grad_norm": 0.0, + "learning_rate": 1.917225697202197e-05, + "loss": 0.8019, + "step": 610 + }, + { + "epoch": 1.4094579008073818, + "grad_norm": 0.0, + "learning_rate": 1.916894293204973e-05, + "loss": 0.9148, + "step": 611 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.0, + "learning_rate": 1.9165622558699763e-05, + "loss": 0.6256, + "step": 612 + }, + { + "epoch": 1.414071510957324, + "grad_norm": 0.0, + "learning_rate": 1.9162295854265593e-05, + "loss": 0.6784, + "step": 613 + }, + { + "epoch": 1.4163783160322954, + "grad_norm": 0.0, + "learning_rate": 1.9158962821045113e-05, + "loss": 0.8205, + "step": 614 + }, + { + "epoch": 1.4186851211072664, + "grad_norm": 0.0, + "learning_rate": 1.9155623461340594e-05, + "loss": 0.6162, + "step": 615 + }, + { + "epoch": 1.4209919261822375, + "grad_norm": 0.0, + "learning_rate": 1.9152277777458667e-05, + "loss": 0.6434, + "step": 616 + }, + { + "epoch": 1.4232987312572087, + "grad_norm": 0.0, + "learning_rate": 1.9148925771710347e-05, + "loss": 0.8216, + "step": 617 + }, + { + "epoch": 1.42560553633218, + "grad_norm": 0.0, + "learning_rate": 1.9145567446411003e-05, + "loss": 0.8651, + "step": 618 + }, + { + "epoch": 1.427912341407151, + "grad_norm": 0.0, + "learning_rate": 1.9142202803880373e-05, + "loss": 0.8115, + "step": 619 + }, + { + "epoch": 1.4302191464821223, + "grad_norm": 0.0, + "learning_rate": 1.913883184644255e-05, + "loss": 0.9731, + "step": 620 + }, + { + "epoch": 1.4325259515570934, + "grad_norm": 0.0, + "learning_rate": 1.913545457642601e-05, + "loss": 0.4795, + "step": 621 + }, + { + "epoch": 1.4348327566320647, + "grad_norm": 0.0, + "learning_rate": 1.913207099616357e-05, + "loss": 1.016, + "step": 622 + }, + { + "epoch": 1.4371395617070357, + "grad_norm": 0.0, + "learning_rate": 1.9128681107992415e-05, + "loss": 0.8133, + "step": 623 + }, + { + "epoch": 1.439446366782007, + "grad_norm": 0.0, + "learning_rate": 1.912528491425408e-05, + "loss": 0.7121, + "step": 624 + }, + { + "epoch": 1.441753171856978, + "grad_norm": 0.0, + "learning_rate": 1.9121882417294462e-05, + "loss": 0.8303, + "step": 625 + }, + { + "epoch": 1.4440599769319493, + "grad_norm": 0.0, + "learning_rate": 1.9118473619463813e-05, + "loss": 0.5516, + "step": 626 + }, + { + "epoch": 1.4463667820069204, + "grad_norm": 0.0, + "learning_rate": 1.9115058523116734e-05, + "loss": 0.5597, + "step": 627 + }, + { + "epoch": 1.4486735870818916, + "grad_norm": 0.0, + "learning_rate": 1.9111637130612172e-05, + "loss": 0.6982, + "step": 628 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 0.0, + "learning_rate": 1.9108209444313432e-05, + "loss": 0.7925, + "step": 629 + }, + { + "epoch": 1.453287197231834, + "grad_norm": 0.0, + "learning_rate": 1.9104775466588162e-05, + "loss": 0.7492, + "step": 630 + }, + { + "epoch": 1.455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.9101335199808352e-05, + "loss": 1.0452, + "step": 631 + }, + { + "epoch": 1.4579008073817763, + "grad_norm": 0.0, + "learning_rate": 1.9097888646350347e-05, + "loss": 0.8255, + "step": 632 + }, + { + "epoch": 1.4602076124567474, + "grad_norm": 0.0, + "learning_rate": 1.9094435808594823e-05, + "loss": 0.8823, + "step": 633 + }, + { + "epoch": 1.4625144175317186, + "grad_norm": 0.0, + "learning_rate": 1.9090976688926802e-05, + "loss": 0.8228, + "step": 634 + }, + { + "epoch": 1.4648212226066897, + "grad_norm": 0.0, + "learning_rate": 1.9087511289735646e-05, + "loss": 0.7167, + "step": 635 + }, + { + "epoch": 1.467128027681661, + "grad_norm": 0.0, + "learning_rate": 1.9084039613415052e-05, + "loss": 0.6188, + "step": 636 + }, + { + "epoch": 1.469434832756632, + "grad_norm": 0.0, + "learning_rate": 1.908056166236305e-05, + "loss": 0.5321, + "step": 637 + }, + { + "epoch": 1.4717416378316033, + "grad_norm": 0.0, + "learning_rate": 1.9077077438982016e-05, + "loss": 0.5406, + "step": 638 + }, + { + "epoch": 1.4740484429065743, + "grad_norm": 0.0, + "learning_rate": 1.907358694567865e-05, + "loss": 0.8034, + "step": 639 + }, + { + "epoch": 1.4763552479815456, + "grad_norm": 0.0, + "learning_rate": 1.907009018486398e-05, + "loss": 0.7912, + "step": 640 + }, + { + "epoch": 1.4786620530565167, + "grad_norm": 0.0, + "learning_rate": 1.906658715895337e-05, + "loss": 0.9115, + "step": 641 + }, + { + "epoch": 1.480968858131488, + "grad_norm": 0.0, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.6487, + "step": 642 + }, + { + "epoch": 1.483275663206459, + "grad_norm": 0.0, + "learning_rate": 1.9059562321527397e-05, + "loss": 0.8266, + "step": 643 + }, + { + "epoch": 1.4855824682814303, + "grad_norm": 0.0, + "learning_rate": 1.905604051486439e-05, + "loss": 0.637, + "step": 644 + }, + { + "epoch": 1.4878892733564013, + "grad_norm": 0.0, + "learning_rate": 1.905251245281015e-05, + "loss": 0.9449, + "step": 645 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 0.0, + "learning_rate": 1.904897813780165e-05, + "loss": 0.87, + "step": 646 + }, + { + "epoch": 1.4925028835063436, + "grad_norm": 0.0, + "learning_rate": 1.9045437572280193e-05, + "loss": 1.0397, + "step": 647 + }, + { + "epoch": 1.494809688581315, + "grad_norm": 0.0, + "learning_rate": 1.9041890758691403e-05, + "loss": 1.0072, + "step": 648 + }, + { + "epoch": 1.497116493656286, + "grad_norm": 0.0, + "learning_rate": 1.9038337699485207e-05, + "loss": 0.7953, + "step": 649 + }, + { + "epoch": 1.4994232987312572, + "grad_norm": 0.0, + "learning_rate": 1.903477839711586e-05, + "loss": 0.9631, + "step": 650 + }, + { + "epoch": 1.5017301038062283, + "grad_norm": 0.0, + "learning_rate": 1.903121285404192e-05, + "loss": 0.7595, + "step": 651 + }, + { + "epoch": 1.5040369088811996, + "grad_norm": 0.0, + "learning_rate": 1.902764107272626e-05, + "loss": 0.8466, + "step": 652 + }, + { + "epoch": 1.5063437139561708, + "grad_norm": 0.0, + "learning_rate": 1.9024063055636057e-05, + "loss": 0.6512, + "step": 653 + }, + { + "epoch": 1.508650519031142, + "grad_norm": 0.0, + "learning_rate": 1.9020478805242803e-05, + "loss": 0.6777, + "step": 654 + }, + { + "epoch": 1.510957324106113, + "grad_norm": 0.0, + "learning_rate": 1.9016888324022294e-05, + "loss": 0.5648, + "step": 655 + }, + { + "epoch": 1.5132641291810842, + "grad_norm": 0.0, + "learning_rate": 1.9013291614454622e-05, + "loss": 0.593, + "step": 656 + }, + { + "epoch": 1.5155709342560555, + "grad_norm": 0.0, + "learning_rate": 1.900968867902419e-05, + "loss": 1.0741, + "step": 657 + }, + { + "epoch": 1.5178777393310265, + "grad_norm": 0.0, + "learning_rate": 1.90060795202197e-05, + "loss": 0.596, + "step": 658 + }, + { + "epoch": 1.5201845444059976, + "grad_norm": 0.0, + "learning_rate": 1.9002464140534148e-05, + "loss": 0.641, + "step": 659 + }, + { + "epoch": 1.5224913494809689, + "grad_norm": 0.0, + "learning_rate": 1.8998842542464833e-05, + "loss": 0.5225, + "step": 660 + }, + { + "epoch": 1.5247981545559401, + "grad_norm": 0.0, + "learning_rate": 1.899521472851334e-05, + "loss": 1.047, + "step": 661 + }, + { + "epoch": 1.5271049596309112, + "grad_norm": 0.0, + "learning_rate": 1.8991580701185564e-05, + "loss": 1.0123, + "step": 662 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.0, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.6392, + "step": 663 + }, + { + "epoch": 1.5317185697808535, + "grad_norm": 0.0, + "learning_rate": 1.8984294016446135e-05, + "loss": 0.6204, + "step": 664 + }, + { + "epoch": 1.5340253748558248, + "grad_norm": 0.0, + "learning_rate": 1.8980641364067708e-05, + "loss": 0.5901, + "step": 665 + }, + { + "epoch": 1.5363321799307958, + "grad_norm": 0.0, + "learning_rate": 1.8976982508379436e-05, + "loss": 0.5011, + "step": 666 + }, + { + "epoch": 1.538638985005767, + "grad_norm": 0.0, + "learning_rate": 1.8973317451908642e-05, + "loss": 0.7011, + "step": 667 + }, + { + "epoch": 1.5409457900807382, + "grad_norm": 0.0, + "learning_rate": 1.8969646197186934e-05, + "loss": 0.8344, + "step": 668 + }, + { + "epoch": 1.5432525951557095, + "grad_norm": 0.0, + "learning_rate": 1.896596874675021e-05, + "loss": 0.6678, + "step": 669 + }, + { + "epoch": 1.5455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.8962285103138637e-05, + "loss": 0.7885, + "step": 670 + }, + { + "epoch": 1.5478662053056516, + "grad_norm": 0.0, + "learning_rate": 1.8958595268896662e-05, + "loss": 0.7285, + "step": 671 + }, + { + "epoch": 1.5501730103806228, + "grad_norm": 0.0, + "learning_rate": 1.895489924657301e-05, + "loss": 0.8522, + "step": 672 + }, + { + "epoch": 1.552479815455594, + "grad_norm": 0.0, + "learning_rate": 1.895119703872069e-05, + "loss": 0.8042, + "step": 673 + }, + { + "epoch": 1.5547866205305652, + "grad_norm": 0.0, + "learning_rate": 1.894748864789696e-05, + "loss": 0.581, + "step": 674 + }, + { + "epoch": 1.5570934256055362, + "grad_norm": 0.0, + "learning_rate": 1.8943774076663372e-05, + "loss": 0.9724, + "step": 675 + }, + { + "epoch": 1.5594002306805075, + "grad_norm": 0.0, + "learning_rate": 1.8940053327585733e-05, + "loss": 0.6206, + "step": 676 + }, + { + "epoch": 1.5617070357554788, + "grad_norm": 0.0, + "learning_rate": 1.8936326403234125e-05, + "loss": 0.7367, + "step": 677 + }, + { + "epoch": 1.5640138408304498, + "grad_norm": 0.0, + "learning_rate": 1.893259330618289e-05, + "loss": 0.5564, + "step": 678 + }, + { + "epoch": 1.5663206459054209, + "grad_norm": 0.0, + "learning_rate": 1.8928854039010643e-05, + "loss": 0.4865, + "step": 679 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.0, + "learning_rate": 1.8925108604300245e-05, + "loss": 0.671, + "step": 680 + }, + { + "epoch": 1.5709342560553634, + "grad_norm": 0.0, + "learning_rate": 1.8921357004638837e-05, + "loss": 0.6144, + "step": 681 + }, + { + "epoch": 1.5732410611303345, + "grad_norm": 0.0, + "learning_rate": 1.8917599242617796e-05, + "loss": 0.9663, + "step": 682 + }, + { + "epoch": 1.5755478662053055, + "grad_norm": 0.0, + "learning_rate": 1.8913835320832778e-05, + "loss": 0.9865, + "step": 683 + }, + { + "epoch": 1.5778546712802768, + "grad_norm": 0.0, + "learning_rate": 1.891006524188368e-05, + "loss": 0.9708, + "step": 684 + }, + { + "epoch": 1.580161476355248, + "grad_norm": 0.0, + "learning_rate": 1.8906289008374654e-05, + "loss": 0.9549, + "step": 685 + }, + { + "epoch": 1.5824682814302191, + "grad_norm": 0.0, + "learning_rate": 1.8902506622914105e-05, + "loss": 0.577, + "step": 686 + }, + { + "epoch": 1.5847750865051902, + "grad_norm": 0.0, + "learning_rate": 1.8898718088114688e-05, + "loss": 0.6999, + "step": 687 + }, + { + "epoch": 1.5870818915801614, + "grad_norm": 0.0, + "learning_rate": 1.8894923406593305e-05, + "loss": 1.0039, + "step": 688 + }, + { + "epoch": 1.5893886966551327, + "grad_norm": 0.0, + "learning_rate": 1.88911225809711e-05, + "loss": 1.2906, + "step": 689 + }, + { + "epoch": 1.5916955017301038, + "grad_norm": 0.0, + "learning_rate": 1.888731561387347e-05, + "loss": 0.6353, + "step": 690 + }, + { + "epoch": 1.5940023068050748, + "grad_norm": 0.0, + "learning_rate": 1.8883502507930044e-05, + "loss": 0.5049, + "step": 691 + }, + { + "epoch": 1.596309111880046, + "grad_norm": 0.0, + "learning_rate": 1.8879683265774695e-05, + "loss": 0.7501, + "step": 692 + }, + { + "epoch": 1.5986159169550174, + "grad_norm": 0.0, + "learning_rate": 1.8875857890045544e-05, + "loss": 0.8276, + "step": 693 + }, + { + "epoch": 1.6009227220299884, + "grad_norm": 0.0, + "learning_rate": 1.887202638338493e-05, + "loss": 0.6369, + "step": 694 + }, + { + "epoch": 1.6032295271049595, + "grad_norm": 0.0, + "learning_rate": 1.8868188748439444e-05, + "loss": 0.8369, + "step": 695 + }, + { + "epoch": 1.6055363321799307, + "grad_norm": 0.0, + "learning_rate": 1.8864344987859898e-05, + "loss": 0.8344, + "step": 696 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 0.0, + "learning_rate": 1.8860495104301346e-05, + "loss": 0.8726, + "step": 697 + }, + { + "epoch": 1.610149942329873, + "grad_norm": 0.0, + "learning_rate": 1.8856639100423062e-05, + "loss": 0.9648, + "step": 698 + }, + { + "epoch": 1.6124567474048441, + "grad_norm": 0.0, + "learning_rate": 1.885277697888855e-05, + "loss": 0.6344, + "step": 699 + }, + { + "epoch": 1.6147635524798154, + "grad_norm": 0.0, + "learning_rate": 1.8848908742365547e-05, + "loss": 0.6824, + "step": 700 + }, + { + "epoch": 1.6170703575547867, + "grad_norm": 0.0, + "learning_rate": 1.8845034393526005e-05, + "loss": 0.628, + "step": 701 + }, + { + "epoch": 1.6193771626297577, + "grad_norm": 0.0, + "learning_rate": 1.8841153935046098e-05, + "loss": 0.6739, + "step": 702 + }, + { + "epoch": 1.621683967704729, + "grad_norm": 0.0, + "learning_rate": 1.8837267369606228e-05, + "loss": 0.5553, + "step": 703 + }, + { + "epoch": 1.6239907727797003, + "grad_norm": 0.0, + "learning_rate": 1.883337469989101e-05, + "loss": 0.5648, + "step": 704 + }, + { + "epoch": 1.6262975778546713, + "grad_norm": 0.0, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.6747, + "step": 705 + }, + { + "epoch": 1.6286043829296424, + "grad_norm": 0.0, + "learning_rate": 1.882557105839406e-05, + "loss": 0.9867, + "step": 706 + }, + { + "epoch": 1.6309111880046137, + "grad_norm": 0.0, + "learning_rate": 1.8821660092002642e-05, + "loss": 0.843, + "step": 707 + }, + { + "epoch": 1.633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.8817743032116483e-05, + "loss": 0.5533, + "step": 708 + }, + { + "epoch": 1.635524798154556, + "grad_norm": 0.0, + "learning_rate": 1.881381988144126e-05, + "loss": 0.8924, + "step": 709 + }, + { + "epoch": 1.637831603229527, + "grad_norm": 0.0, + "learning_rate": 1.8809890642686862e-05, + "loss": 1.0735, + "step": 710 + }, + { + "epoch": 1.6401384083044983, + "grad_norm": 0.0, + "learning_rate": 1.880595531856738e-05, + "loss": 0.6316, + "step": 711 + }, + { + "epoch": 1.6424452133794696, + "grad_norm": 0.0, + "learning_rate": 1.880201391180111e-05, + "loss": 1.0137, + "step": 712 + }, + { + "epoch": 1.6447520184544406, + "grad_norm": 0.0, + "learning_rate": 1.879806642511055e-05, + "loss": 0.4879, + "step": 713 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.0, + "learning_rate": 1.87941128612224e-05, + "loss": 0.8189, + "step": 714 + }, + { + "epoch": 1.649365628604383, + "grad_norm": 0.0, + "learning_rate": 1.879015322286754e-05, + "loss": 1.1396, + "step": 715 + }, + { + "epoch": 1.6516724336793542, + "grad_norm": 0.0, + "learning_rate": 1.878618751278108e-05, + "loss": 1.1921, + "step": 716 + }, + { + "epoch": 1.6539792387543253, + "grad_norm": 0.0, + "learning_rate": 1.8782215733702286e-05, + "loss": 0.6635, + "step": 717 + }, + { + "epoch": 1.6562860438292963, + "grad_norm": 0.0, + "learning_rate": 1.8778237888374647e-05, + "loss": 0.5903, + "step": 718 + }, + { + "epoch": 1.6585928489042676, + "grad_norm": 0.0, + "learning_rate": 1.877425397954582e-05, + "loss": 0.7721, + "step": 719 + }, + { + "epoch": 1.6608996539792389, + "grad_norm": 0.0, + "learning_rate": 1.8770264009967667e-05, + "loss": 0.8322, + "step": 720 + }, + { + "epoch": 1.66320645905421, + "grad_norm": 0.0, + "learning_rate": 1.8766267982396225e-05, + "loss": 0.8399, + "step": 721 + }, + { + "epoch": 1.665513264129181, + "grad_norm": 0.0, + "learning_rate": 1.8762265899591724e-05, + "loss": 0.773, + "step": 722 + }, + { + "epoch": 1.6678200692041523, + "grad_norm": 0.0, + "learning_rate": 1.8758257764318566e-05, + "loss": 0.8928, + "step": 723 + }, + { + "epoch": 1.6701268742791235, + "grad_norm": 0.0, + "learning_rate": 1.8754243579345347e-05, + "loss": 0.7347, + "step": 724 + }, + { + "epoch": 1.6724336793540946, + "grad_norm": 0.0, + "learning_rate": 1.875022334744483e-05, + "loss": 0.7234, + "step": 725 + }, + { + "epoch": 1.6747404844290656, + "grad_norm": 0.0, + "learning_rate": 1.874619707139396e-05, + "loss": 1.0121, + "step": 726 + }, + { + "epoch": 1.677047289504037, + "grad_norm": 0.0, + "learning_rate": 1.874216475397386e-05, + "loss": 0.7666, + "step": 727 + }, + { + "epoch": 1.6793540945790082, + "grad_norm": 0.0, + "learning_rate": 1.8738126397969818e-05, + "loss": 0.6755, + "step": 728 + }, + { + "epoch": 1.6816608996539792, + "grad_norm": 0.0, + "learning_rate": 1.87340820061713e-05, + "loss": 0.7993, + "step": 729 + }, + { + "epoch": 1.6839677047289503, + "grad_norm": 0.0, + "learning_rate": 1.873003158137194e-05, + "loss": 1.0176, + "step": 730 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 0.0, + "learning_rate": 1.8725975126369535e-05, + "loss": 0.6397, + "step": 731 + }, + { + "epoch": 1.6885813148788928, + "grad_norm": 0.0, + "learning_rate": 1.8721912643966055e-05, + "loss": 0.8066, + "step": 732 + }, + { + "epoch": 1.690888119953864, + "grad_norm": 0.0, + "learning_rate": 1.8717844136967626e-05, + "loss": 0.7193, + "step": 733 + }, + { + "epoch": 1.693194925028835, + "grad_norm": 0.0, + "learning_rate": 1.871376960818454e-05, + "loss": 0.5262, + "step": 734 + }, + { + "epoch": 1.6955017301038062, + "grad_norm": 0.0, + "learning_rate": 1.8709689060431242e-05, + "loss": 0.8603, + "step": 735 + }, + { + "epoch": 1.6978085351787775, + "grad_norm": 0.0, + "learning_rate": 1.8705602496526344e-05, + "loss": 0.6934, + "step": 736 + }, + { + "epoch": 1.7001153402537486, + "grad_norm": 0.0, + "learning_rate": 1.870150991929261e-05, + "loss": 1.194, + "step": 737 + }, + { + "epoch": 1.7024221453287196, + "grad_norm": 0.0, + "learning_rate": 1.8697411331556958e-05, + "loss": 0.8173, + "step": 738 + }, + { + "epoch": 1.7047289504036909, + "grad_norm": 0.0, + "learning_rate": 1.8693306736150445e-05, + "loss": 0.373, + "step": 739 + }, + { + "epoch": 1.7070357554786622, + "grad_norm": 0.0, + "learning_rate": 1.8689196135908303e-05, + "loss": 0.8672, + "step": 740 + }, + { + "epoch": 1.7093425605536332, + "grad_norm": 0.0, + "learning_rate": 1.868507953366989e-05, + "loss": 0.871, + "step": 741 + }, + { + "epoch": 1.7116493656286043, + "grad_norm": 0.0, + "learning_rate": 1.868095693227872e-05, + "loss": 0.6499, + "step": 742 + }, + { + "epoch": 1.7139561707035755, + "grad_norm": 0.0, + "learning_rate": 1.867682833458245e-05, + "loss": 0.8416, + "step": 743 + }, + { + "epoch": 1.7162629757785468, + "grad_norm": 0.0, + "learning_rate": 1.8672693743432875e-05, + "loss": 0.5984, + "step": 744 + }, + { + "epoch": 1.7185697808535179, + "grad_norm": 0.0, + "learning_rate": 1.8668553161685932e-05, + "loss": 0.7311, + "step": 745 + }, + { + "epoch": 1.720876585928489, + "grad_norm": 0.0, + "learning_rate": 1.86644065922017e-05, + "loss": 0.8443, + "step": 746 + }, + { + "epoch": 1.7231833910034602, + "grad_norm": 0.0, + "learning_rate": 1.866025403784439e-05, + "loss": 0.7564, + "step": 747 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 0.0, + "learning_rate": 1.8656095501482342e-05, + "loss": 0.7704, + "step": 748 + }, + { + "epoch": 1.7277970011534025, + "grad_norm": 0.0, + "learning_rate": 1.8651930985988037e-05, + "loss": 0.5496, + "step": 749 + }, + { + "epoch": 1.7301038062283736, + "grad_norm": 0.0, + "learning_rate": 1.8647760494238082e-05, + "loss": 0.8499, + "step": 750 + }, + { + "epoch": 1.7324106113033448, + "grad_norm": 0.0, + "learning_rate": 1.8643584029113215e-05, + "loss": 1.0669, + "step": 751 + }, + { + "epoch": 1.7347174163783161, + "grad_norm": 0.0, + "learning_rate": 1.86394015934983e-05, + "loss": 0.7644, + "step": 752 + }, + { + "epoch": 1.7370242214532872, + "grad_norm": 0.0, + "learning_rate": 1.8635213190282312e-05, + "loss": 0.8404, + "step": 753 + }, + { + "epoch": 1.7393310265282582, + "grad_norm": 0.0, + "learning_rate": 1.8631018822358363e-05, + "loss": 0.6913, + "step": 754 + }, + { + "epoch": 1.7416378316032295, + "grad_norm": 0.0, + "learning_rate": 1.8626818492623688e-05, + "loss": 0.7011, + "step": 755 + }, + { + "epoch": 1.7439446366782008, + "grad_norm": 0.0, + "learning_rate": 1.8622612203979628e-05, + "loss": 0.5566, + "step": 756 + }, + { + "epoch": 1.7462514417531718, + "grad_norm": 0.0, + "learning_rate": 1.8618399959331642e-05, + "loss": 1.0118, + "step": 757 + }, + { + "epoch": 1.7485582468281429, + "grad_norm": 0.0, + "learning_rate": 1.861418176158931e-05, + "loss": 0.448, + "step": 758 + }, + { + "epoch": 1.7508650519031141, + "grad_norm": 0.0, + "learning_rate": 1.8609957613666316e-05, + "loss": 0.7561, + "step": 759 + }, + { + "epoch": 1.7531718569780854, + "grad_norm": 0.0, + "learning_rate": 1.8605727518480462e-05, + "loss": 0.7707, + "step": 760 + }, + { + "epoch": 1.7554786620530565, + "grad_norm": 0.0, + "learning_rate": 1.860149147895366e-05, + "loss": 0.7954, + "step": 761 + }, + { + "epoch": 1.7577854671280275, + "grad_norm": 0.0, + "learning_rate": 1.8597249498011906e-05, + "loss": 0.7691, + "step": 762 + }, + { + "epoch": 1.7600922722029988, + "grad_norm": 0.0, + "learning_rate": 1.8593001578585325e-05, + "loss": 0.6448, + "step": 763 + }, + { + "epoch": 1.76239907727797, + "grad_norm": 0.0, + "learning_rate": 1.858874772360814e-05, + "loss": 0.7916, + "step": 764 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.0, + "learning_rate": 1.8584487936018663e-05, + "loss": 0.7074, + "step": 765 + }, + { + "epoch": 1.7670126874279122, + "grad_norm": 0.0, + "learning_rate": 1.8580222218759312e-05, + "loss": 0.7928, + "step": 766 + }, + { + "epoch": 1.7693194925028837, + "grad_norm": 0.0, + "learning_rate": 1.8575950574776595e-05, + "loss": 0.6985, + "step": 767 + }, + { + "epoch": 1.7716262975778547, + "grad_norm": 0.0, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.6274, + "step": 768 + }, + { + "epoch": 1.7739331026528258, + "grad_norm": 0.0, + "learning_rate": 1.856738951844759e-05, + "loss": 0.7821, + "step": 769 + }, + { + "epoch": 1.776239907727797, + "grad_norm": 0.0, + "learning_rate": 1.856310011201479e-05, + "loss": 0.6322, + "step": 770 + }, + { + "epoch": 1.7785467128027683, + "grad_norm": 0.0, + "learning_rate": 1.855880479068559e-05, + "loss": 0.7361, + "step": 771 + }, + { + "epoch": 1.7808535178777394, + "grad_norm": 0.0, + "learning_rate": 1.855450355742695e-05, + "loss": 0.7843, + "step": 772 + }, + { + "epoch": 1.7831603229527104, + "grad_norm": 0.0, + "learning_rate": 1.8550196415209916e-05, + "loss": 0.6768, + "step": 773 + }, + { + "epoch": 1.7854671280276817, + "grad_norm": 0.0, + "learning_rate": 1.854588336700962e-05, + "loss": 0.8734, + "step": 774 + }, + { + "epoch": 1.787773933102653, + "grad_norm": 0.0, + "learning_rate": 1.854156441580526e-05, + "loss": 0.3777, + "step": 775 + }, + { + "epoch": 1.790080738177624, + "grad_norm": 0.0, + "learning_rate": 1.8537239564580117e-05, + "loss": 0.7942, + "step": 776 + }, + { + "epoch": 1.792387543252595, + "grad_norm": 0.0, + "learning_rate": 1.8532908816321557e-05, + "loss": 0.5416, + "step": 777 + }, + { + "epoch": 1.7946943483275664, + "grad_norm": 0.0, + "learning_rate": 1.852857217402101e-05, + "loss": 0.7822, + "step": 778 + }, + { + "epoch": 1.7970011534025376, + "grad_norm": 0.0, + "learning_rate": 1.8524229640673974e-05, + "loss": 0.5352, + "step": 779 + }, + { + "epoch": 1.7993079584775087, + "grad_norm": 0.0, + "learning_rate": 1.851988121928002e-05, + "loss": 0.7994, + "step": 780 + }, + { + "epoch": 1.8016147635524797, + "grad_norm": 0.0, + "learning_rate": 1.8515526912842796e-05, + "loss": 0.6714, + "step": 781 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 0.0, + "learning_rate": 1.8511166724369997e-05, + "loss": 0.6343, + "step": 782 + }, + { + "epoch": 1.8062283737024223, + "grad_norm": 0.0, + "learning_rate": 1.8506800656873397e-05, + "loss": 0.5991, + "step": 783 + }, + { + "epoch": 1.8085351787773933, + "grad_norm": 0.0, + "learning_rate": 1.8502428713368818e-05, + "loss": 0.722, + "step": 784 + }, + { + "epoch": 1.8108419838523644, + "grad_norm": 0.0, + "learning_rate": 1.8498050896876152e-05, + "loss": 1.0617, + "step": 785 + }, + { + "epoch": 1.8131487889273357, + "grad_norm": 0.0, + "learning_rate": 1.8493667210419337e-05, + "loss": 0.6679, + "step": 786 + }, + { + "epoch": 1.815455594002307, + "grad_norm": 0.0, + "learning_rate": 1.8489277657026377e-05, + "loss": 1.0058, + "step": 787 + }, + { + "epoch": 1.817762399077278, + "grad_norm": 0.0, + "learning_rate": 1.8484882239729315e-05, + "loss": 0.6065, + "step": 788 + }, + { + "epoch": 1.820069204152249, + "grad_norm": 0.0, + "learning_rate": 1.848048096156426e-05, + "loss": 0.8293, + "step": 789 + }, + { + "epoch": 1.8223760092272203, + "grad_norm": 0.0, + "learning_rate": 1.847607382557136e-05, + "loss": 0.5846, + "step": 790 + }, + { + "epoch": 1.8246828143021916, + "grad_norm": 0.0, + "learning_rate": 1.8471660834794807e-05, + "loss": 0.6811, + "step": 791 + }, + { + "epoch": 1.8269896193771626, + "grad_norm": 0.0, + "learning_rate": 1.8467241992282842e-05, + "loss": 0.7027, + "step": 792 + }, + { + "epoch": 1.8292964244521337, + "grad_norm": 0.0, + "learning_rate": 1.846281730108775e-05, + "loss": 0.8191, + "step": 793 + }, + { + "epoch": 1.831603229527105, + "grad_norm": 0.0, + "learning_rate": 1.8458386764265852e-05, + "loss": 0.8021, + "step": 794 + }, + { + "epoch": 1.8339100346020762, + "grad_norm": 0.0, + "learning_rate": 1.8453950384877504e-05, + "loss": 0.6784, + "step": 795 + }, + { + "epoch": 1.8362168396770473, + "grad_norm": 0.0, + "learning_rate": 1.8449508165987106e-05, + "loss": 0.6081, + "step": 796 + }, + { + "epoch": 1.8385236447520183, + "grad_norm": 0.0, + "learning_rate": 1.844506011066308e-05, + "loss": 0.9829, + "step": 797 + }, + { + "epoch": 1.8408304498269896, + "grad_norm": 0.0, + "learning_rate": 1.8440606221977893e-05, + "loss": 0.8106, + "step": 798 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 0.0, + "learning_rate": 1.8436146503008036e-05, + "loss": 0.559, + "step": 799 + }, + { + "epoch": 1.845444059976932, + "grad_norm": 0.0, + "learning_rate": 1.843168095683402e-05, + "loss": 0.3478, + "step": 800 + }, + { + "epoch": 1.847750865051903, + "grad_norm": 0.0, + "learning_rate": 1.8427209586540392e-05, + "loss": 0.9071, + "step": 801 + }, + { + "epoch": 1.8500576701268743, + "grad_norm": 0.0, + "learning_rate": 1.8422732395215717e-05, + "loss": 0.637, + "step": 802 + }, + { + "epoch": 1.8523644752018456, + "grad_norm": 0.0, + "learning_rate": 1.8418249385952575e-05, + "loss": 0.617, + "step": 803 + }, + { + "epoch": 1.8546712802768166, + "grad_norm": 0.0, + "learning_rate": 1.841376056184758e-05, + "loss": 0.9638, + "step": 804 + }, + { + "epoch": 1.8569780853517877, + "grad_norm": 0.0, + "learning_rate": 1.8409265926001342e-05, + "loss": 0.9811, + "step": 805 + }, + { + "epoch": 1.859284890426759, + "grad_norm": 0.0, + "learning_rate": 1.8404765481518506e-05, + "loss": 0.8957, + "step": 806 + }, + { + "epoch": 1.8615916955017302, + "grad_norm": 0.0, + "learning_rate": 1.8400259231507716e-05, + "loss": 0.862, + "step": 807 + }, + { + "epoch": 1.8638985005767013, + "grad_norm": 0.0, + "learning_rate": 1.839574717908163e-05, + "loss": 0.4463, + "step": 808 + }, + { + "epoch": 1.8662053056516723, + "grad_norm": 0.0, + "learning_rate": 1.8391229327356916e-05, + "loss": 0.7919, + "step": 809 + }, + { + "epoch": 1.8685121107266436, + "grad_norm": 0.0, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.6183, + "step": 810 + }, + { + "epoch": 1.8708189158016149, + "grad_norm": 0.0, + "learning_rate": 1.8382176238498287e-05, + "loss": 0.6153, + "step": 811 + }, + { + "epoch": 1.873125720876586, + "grad_norm": 0.0, + "learning_rate": 1.8377641007617724e-05, + "loss": 0.6181, + "step": 812 + }, + { + "epoch": 1.875432525951557, + "grad_norm": 0.0, + "learning_rate": 1.8373099989945236e-05, + "loss": 0.6922, + "step": 813 + }, + { + "epoch": 1.8777393310265282, + "grad_norm": 0.0, + "learning_rate": 1.836855318861749e-05, + "loss": 0.9716, + "step": 814 + }, + { + "epoch": 1.8800461361014995, + "grad_norm": 0.0, + "learning_rate": 1.8364000606775158e-05, + "loss": 1.0532, + "step": 815 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.0, + "learning_rate": 1.8359442247562896e-05, + "loss": 0.9168, + "step": 816 + }, + { + "epoch": 1.8846597462514416, + "grad_norm": 0.0, + "learning_rate": 1.8354878114129368e-05, + "loss": 0.8284, + "step": 817 + }, + { + "epoch": 1.8869665513264129, + "grad_norm": 0.0, + "learning_rate": 1.8350308209627198e-05, + "loss": 0.7451, + "step": 818 + }, + { + "epoch": 1.8892733564013842, + "grad_norm": 0.0, + "learning_rate": 1.834573253721303e-05, + "loss": 0.9864, + "step": 819 + }, + { + "epoch": 1.8915801614763552, + "grad_norm": 0.0, + "learning_rate": 1.8341151100047462e-05, + "loss": 1.0663, + "step": 820 + }, + { + "epoch": 1.8938869665513263, + "grad_norm": 0.0, + "learning_rate": 1.833656390129509e-05, + "loss": 0.9384, + "step": 821 + }, + { + "epoch": 1.8961937716262975, + "grad_norm": 0.0, + "learning_rate": 1.833197094412449e-05, + "loss": 0.6839, + "step": 822 + }, + { + "epoch": 1.8985005767012688, + "grad_norm": 0.0, + "learning_rate": 1.832737223170821e-05, + "loss": 0.4785, + "step": 823 + }, + { + "epoch": 1.9008073817762399, + "grad_norm": 0.0, + "learning_rate": 1.832276776722278e-05, + "loss": 0.5989, + "step": 824 + }, + { + "epoch": 1.903114186851211, + "grad_norm": 0.0, + "learning_rate": 1.8318157553848694e-05, + "loss": 0.7849, + "step": 825 + }, + { + "epoch": 1.9054209919261822, + "grad_norm": 0.0, + "learning_rate": 1.8313541594770417e-05, + "loss": 0.7207, + "step": 826 + }, + { + "epoch": 1.9077277970011535, + "grad_norm": 0.0, + "learning_rate": 1.8308919893176397e-05, + "loss": 0.7589, + "step": 827 + }, + { + "epoch": 1.9100346020761245, + "grad_norm": 0.0, + "learning_rate": 1.8304292452259037e-05, + "loss": 0.6873, + "step": 828 + }, + { + "epoch": 1.9123414071510956, + "grad_norm": 0.0, + "learning_rate": 1.8299659275214708e-05, + "loss": 0.7201, + "step": 829 + }, + { + "epoch": 1.9146482122260668, + "grad_norm": 0.0, + "learning_rate": 1.8295020365243736e-05, + "loss": 0.4706, + "step": 830 + }, + { + "epoch": 1.9169550173010381, + "grad_norm": 0.0, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.73, + "step": 831 + }, + { + "epoch": 1.9192618223760092, + "grad_norm": 0.0, + "learning_rate": 1.8285725359343e-05, + "loss": 0.3995, + "step": 832 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 0.0, + "learning_rate": 1.8281069269833694e-05, + "loss": 0.9151, + "step": 833 + }, + { + "epoch": 1.9238754325259517, + "grad_norm": 0.0, + "learning_rate": 1.827640746023865e-05, + "loss": 0.8479, + "step": 834 + }, + { + "epoch": 1.9261822376009228, + "grad_norm": 0.0, + "learning_rate": 1.827173993377798e-05, + "loss": 0.663, + "step": 835 + }, + { + "epoch": 1.9284890426758938, + "grad_norm": 0.0, + "learning_rate": 1.8267066693675745e-05, + "loss": 0.8283, + "step": 836 + }, + { + "epoch": 1.930795847750865, + "grad_norm": 0.0, + "learning_rate": 1.826238774315995e-05, + "loss": 0.9821, + "step": 837 + }, + { + "epoch": 1.9331026528258364, + "grad_norm": 0.0, + "learning_rate": 1.8257703085462542e-05, + "loss": 0.7109, + "step": 838 + }, + { + "epoch": 1.9354094579008074, + "grad_norm": 0.0, + "learning_rate": 1.8253012723819417e-05, + "loss": 0.7126, + "step": 839 + }, + { + "epoch": 1.9377162629757785, + "grad_norm": 0.0, + "learning_rate": 1.82483166614704e-05, + "loss": 0.534, + "step": 840 + }, + { + "epoch": 1.9400230680507498, + "grad_norm": 0.0, + "learning_rate": 1.8243614901659265e-05, + "loss": 0.6956, + "step": 841 + }, + { + "epoch": 1.942329873125721, + "grad_norm": 0.0, + "learning_rate": 1.8238907447633716e-05, + "loss": 0.6861, + "step": 842 + }, + { + "epoch": 1.944636678200692, + "grad_norm": 0.0, + "learning_rate": 1.8234194302645393e-05, + "loss": 1.1298, + "step": 843 + }, + { + "epoch": 1.9469434832756631, + "grad_norm": 0.0, + "learning_rate": 1.8229475469949865e-05, + "loss": 0.7579, + "step": 844 + }, + { + "epoch": 1.9492502883506344, + "grad_norm": 0.0, + "learning_rate": 1.8224750952806626e-05, + "loss": 0.6374, + "step": 845 + }, + { + "epoch": 1.9515570934256057, + "grad_norm": 0.0, + "learning_rate": 1.8220020754479104e-05, + "loss": 0.8151, + "step": 846 + }, + { + "epoch": 1.9538638985005767, + "grad_norm": 0.0, + "learning_rate": 1.8215284878234644e-05, + "loss": 0.4476, + "step": 847 + }, + { + "epoch": 1.9561707035755478, + "grad_norm": 0.0, + "learning_rate": 1.8210543327344518e-05, + "loss": 0.8777, + "step": 848 + }, + { + "epoch": 1.958477508650519, + "grad_norm": 0.0, + "learning_rate": 1.8205796105083917e-05, + "loss": 0.7625, + "step": 849 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.0, + "learning_rate": 1.820104321473195e-05, + "loss": 0.7084, + "step": 850 + }, + { + "epoch": 1.9630911188004614, + "grad_norm": 0.0, + "learning_rate": 1.819628465957164e-05, + "loss": 0.9646, + "step": 851 + }, + { + "epoch": 1.9653979238754324, + "grad_norm": 0.0, + "learning_rate": 1.819152044288992e-05, + "loss": 0.462, + "step": 852 + }, + { + "epoch": 1.9677047289504037, + "grad_norm": 0.0, + "learning_rate": 1.8186750567977638e-05, + "loss": 0.8403, + "step": 853 + }, + { + "epoch": 1.970011534025375, + "grad_norm": 0.0, + "learning_rate": 1.818197503812955e-05, + "loss": 0.5777, + "step": 854 + }, + { + "epoch": 1.972318339100346, + "grad_norm": 0.0, + "learning_rate": 1.8177193856644315e-05, + "loss": 0.5461, + "step": 855 + }, + { + "epoch": 1.974625144175317, + "grad_norm": 0.0, + "learning_rate": 1.8172407026824498e-05, + "loss": 0.6516, + "step": 856 + }, + { + "epoch": 1.9769319492502884, + "grad_norm": 0.0, + "learning_rate": 1.816761455197657e-05, + "loss": 0.5545, + "step": 857 + }, + { + "epoch": 1.9792387543252596, + "grad_norm": 0.0, + "learning_rate": 1.8162816435410892e-05, + "loss": 0.6475, + "step": 858 + }, + { + "epoch": 1.9815455594002307, + "grad_norm": 0.0, + "learning_rate": 1.8158012680441723e-05, + "loss": 1.0847, + "step": 859 + }, + { + "epoch": 1.9838523644752017, + "grad_norm": 0.0, + "learning_rate": 1.8153203290387224e-05, + "loss": 0.8766, + "step": 860 + }, + { + "epoch": 1.986159169550173, + "grad_norm": 0.0, + "learning_rate": 1.8148388268569453e-05, + "loss": 0.8599, + "step": 861 + }, + { + "epoch": 1.9884659746251443, + "grad_norm": 0.0, + "learning_rate": 1.8143567618314336e-05, + "loss": 0.8292, + "step": 862 + }, + { + "epoch": 1.9907727797001153, + "grad_norm": 0.0, + "learning_rate": 1.8138741342951706e-05, + "loss": 0.8669, + "step": 863 + }, + { + "epoch": 1.9930795847750864, + "grad_norm": 0.0, + "learning_rate": 1.8133909445815277e-05, + "loss": 0.4969, + "step": 864 + }, + { + "epoch": 1.9953863898500577, + "grad_norm": 0.0, + "learning_rate": 1.8129071930242648e-05, + "loss": 0.6403, + "step": 865 + }, + { + "epoch": 1.997693194925029, + "grad_norm": 0.0, + "learning_rate": 1.8124228799575295e-05, + "loss": 0.5471, + "step": 866 + }, + { + "epoch": 2.0, + "grad_norm": 0.0, + "learning_rate": 1.811938005715857e-05, + "loss": 0.5799, + "step": 867 + }, + { + "epoch": 2.002306805074971, + "grad_norm": 0.0, + "learning_rate": 1.8114525706341702e-05, + "loss": 0.3581, + "step": 868 + }, + { + "epoch": 2.0046136101499425, + "grad_norm": 0.0, + "learning_rate": 1.8109665750477806e-05, + "loss": 0.3694, + "step": 869 + }, + { + "epoch": 2.0069204152249136, + "grad_norm": 0.0, + "learning_rate": 1.8104800192923856e-05, + "loss": 0.46, + "step": 870 + }, + { + "epoch": 2.0092272202998847, + "grad_norm": 0.0, + "learning_rate": 1.8099929037040695e-05, + "loss": 0.5724, + "step": 871 + }, + { + "epoch": 2.0115340253748557, + "grad_norm": 0.0, + "learning_rate": 1.8095052286193044e-05, + "loss": 0.3565, + "step": 872 + }, + { + "epoch": 2.013840830449827, + "grad_norm": 0.0, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.3936, + "step": 873 + }, + { + "epoch": 2.0161476355247983, + "grad_norm": 0.0, + "learning_rate": 1.8085282013082436e-05, + "loss": 0.5218, + "step": 874 + }, + { + "epoch": 2.0184544405997693, + "grad_norm": 0.0, + "learning_rate": 1.808038849756822e-05, + "loss": 0.5057, + "step": 875 + }, + { + "epoch": 2.0207612456747404, + "grad_norm": 0.0, + "learning_rate": 1.8075489400586993e-05, + "loss": 0.4666, + "step": 876 + }, + { + "epoch": 2.023068050749712, + "grad_norm": 0.0, + "learning_rate": 1.8070584725522763e-05, + "loss": 0.4672, + "step": 877 + }, + { + "epoch": 2.025374855824683, + "grad_norm": 0.0, + "learning_rate": 1.8065674475763398e-05, + "loss": 0.5051, + "step": 878 + }, + { + "epoch": 2.027681660899654, + "grad_norm": 0.0, + "learning_rate": 1.8060758654700622e-05, + "loss": 0.3915, + "step": 879 + }, + { + "epoch": 2.029988465974625, + "grad_norm": 0.0, + "learning_rate": 1.8055837265729996e-05, + "loss": 0.4161, + "step": 880 + }, + { + "epoch": 2.0322952710495965, + "grad_norm": 0.0, + "learning_rate": 1.805091031225093e-05, + "loss": 0.4157, + "step": 881 + }, + { + "epoch": 2.0346020761245676, + "grad_norm": 0.0, + "learning_rate": 1.8045977797666685e-05, + "loss": 0.5167, + "step": 882 + }, + { + "epoch": 2.0369088811995386, + "grad_norm": 0.0, + "learning_rate": 1.804103972538435e-05, + "loss": 0.4641, + "step": 883 + }, + { + "epoch": 2.0392156862745097, + "grad_norm": 0.0, + "learning_rate": 1.8036096098814875e-05, + "loss": 0.3374, + "step": 884 + }, + { + "epoch": 2.041522491349481, + "grad_norm": 0.0, + "learning_rate": 1.803114692137302e-05, + "loss": 0.5364, + "step": 885 + }, + { + "epoch": 2.043829296424452, + "grad_norm": 0.0, + "learning_rate": 1.8026192196477395e-05, + "loss": 0.6081, + "step": 886 + }, + { + "epoch": 2.0461361014994233, + "grad_norm": 0.0, + "learning_rate": 1.802123192755044e-05, + "loss": 0.5771, + "step": 887 + }, + { + "epoch": 2.0484429065743943, + "grad_norm": 0.0, + "learning_rate": 1.801626611801842e-05, + "loss": 0.5026, + "step": 888 + }, + { + "epoch": 2.050749711649366, + "grad_norm": 0.0, + "learning_rate": 1.8011294771311436e-05, + "loss": 0.5586, + "step": 889 + }, + { + "epoch": 2.053056516724337, + "grad_norm": 0.0, + "learning_rate": 1.80063178908634e-05, + "loss": 0.3922, + "step": 890 + }, + { + "epoch": 2.055363321799308, + "grad_norm": 0.0, + "learning_rate": 1.8001335480112067e-05, + "loss": 0.5573, + "step": 891 + }, + { + "epoch": 2.057670126874279, + "grad_norm": 0.0, + "learning_rate": 1.7996347542498983e-05, + "loss": 0.3228, + "step": 892 + }, + { + "epoch": 2.0599769319492505, + "grad_norm": 0.0, + "learning_rate": 1.799135408146954e-05, + "loss": 0.405, + "step": 893 + }, + { + "epoch": 2.0622837370242215, + "grad_norm": 0.0, + "learning_rate": 1.798635510047293e-05, + "loss": 0.5568, + "step": 894 + }, + { + "epoch": 2.0645905420991926, + "grad_norm": 0.0, + "learning_rate": 1.798135060296216e-05, + "loss": 0.4499, + "step": 895 + }, + { + "epoch": 2.0668973471741636, + "grad_norm": 0.0, + "learning_rate": 1.797634059239405e-05, + "loss": 0.3977, + "step": 896 + }, + { + "epoch": 2.069204152249135, + "grad_norm": 0.0, + "learning_rate": 1.7971325072229227e-05, + "loss": 0.4029, + "step": 897 + }, + { + "epoch": 2.071510957324106, + "grad_norm": 0.0, + "learning_rate": 1.7966304045932122e-05, + "loss": 0.4059, + "step": 898 + }, + { + "epoch": 2.0738177623990772, + "grad_norm": 0.0, + "learning_rate": 1.796127751697097e-05, + "loss": 0.6208, + "step": 899 + }, + { + "epoch": 2.0761245674740483, + "grad_norm": 0.0, + "learning_rate": 1.795624548881781e-05, + "loss": 0.5197, + "step": 900 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.0, + "learning_rate": 1.795120796494848e-05, + "loss": 0.3885, + "step": 901 + }, + { + "epoch": 2.080738177623991, + "grad_norm": 0.0, + "learning_rate": 1.7946164948842604e-05, + "loss": 0.4355, + "step": 902 + }, + { + "epoch": 2.083044982698962, + "grad_norm": 0.0, + "learning_rate": 1.7941116443983612e-05, + "loss": 0.3537, + "step": 903 + }, + { + "epoch": 2.085351787773933, + "grad_norm": 0.0, + "learning_rate": 1.7936062453858724e-05, + "loss": 0.4278, + "step": 904 + }, + { + "epoch": 2.0876585928489044, + "grad_norm": 0.0, + "learning_rate": 1.7931002981958933e-05, + "loss": 0.6057, + "step": 905 + }, + { + "epoch": 2.0899653979238755, + "grad_norm": 0.0, + "learning_rate": 1.7925938031779044e-05, + "loss": 0.5894, + "step": 906 + }, + { + "epoch": 2.0922722029988465, + "grad_norm": 0.0, + "learning_rate": 1.7920867606817625e-05, + "loss": 0.404, + "step": 907 + }, + { + "epoch": 2.0945790080738176, + "grad_norm": 0.0, + "learning_rate": 1.7915791710577035e-05, + "loss": 0.3701, + "step": 908 + }, + { + "epoch": 2.096885813148789, + "grad_norm": 0.0, + "learning_rate": 1.7910710346563417e-05, + "loss": 0.4394, + "step": 909 + }, + { + "epoch": 2.09919261822376, + "grad_norm": 0.0, + "learning_rate": 1.7905623518286673e-05, + "loss": 0.3412, + "step": 910 + }, + { + "epoch": 2.101499423298731, + "grad_norm": 0.0, + "learning_rate": 1.79005312292605e-05, + "loss": 0.3976, + "step": 911 + }, + { + "epoch": 2.1038062283737022, + "grad_norm": 0.0, + "learning_rate": 1.7895433483002356e-05, + "loss": 0.5541, + "step": 912 + }, + { + "epoch": 2.1061130334486737, + "grad_norm": 0.0, + "learning_rate": 1.7890330283033467e-05, + "loss": 0.4665, + "step": 913 + }, + { + "epoch": 2.108419838523645, + "grad_norm": 0.0, + "learning_rate": 1.7885221632878837e-05, + "loss": 0.6224, + "step": 914 + }, + { + "epoch": 2.110726643598616, + "grad_norm": 0.0, + "learning_rate": 1.788010753606722e-05, + "loss": 0.5937, + "step": 915 + }, + { + "epoch": 2.113033448673587, + "grad_norm": 0.0, + "learning_rate": 1.7874987996131144e-05, + "loss": 0.5993, + "step": 916 + }, + { + "epoch": 2.1153402537485584, + "grad_norm": 0.0, + "learning_rate": 1.7869863016606893e-05, + "loss": 0.5363, + "step": 917 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.0, + "learning_rate": 1.78647326010345e-05, + "loss": 0.6504, + "step": 918 + }, + { + "epoch": 2.1199538638985005, + "grad_norm": 0.0, + "learning_rate": 1.7859596752957768e-05, + "loss": 0.4933, + "step": 919 + }, + { + "epoch": 2.1222606689734715, + "grad_norm": 0.0, + "learning_rate": 1.7854455475924245e-05, + "loss": 0.4761, + "step": 920 + }, + { + "epoch": 2.124567474048443, + "grad_norm": 0.0, + "learning_rate": 1.7849308773485226e-05, + "loss": 0.3558, + "step": 921 + }, + { + "epoch": 2.126874279123414, + "grad_norm": 0.0, + "learning_rate": 1.784415664919576e-05, + "loss": 0.7222, + "step": 922 + }, + { + "epoch": 2.129181084198385, + "grad_norm": 0.0, + "learning_rate": 1.783899910661463e-05, + "loss": 0.6698, + "step": 923 + }, + { + "epoch": 2.131487889273356, + "grad_norm": 0.0, + "learning_rate": 1.783383614930438e-05, + "loss": 0.3221, + "step": 924 + }, + { + "epoch": 2.1337946943483277, + "grad_norm": 0.0, + "learning_rate": 1.782866778083128e-05, + "loss": 0.5143, + "step": 925 + }, + { + "epoch": 2.1361014994232987, + "grad_norm": 0.0, + "learning_rate": 1.7823494004765336e-05, + "loss": 0.4885, + "step": 926 + }, + { + "epoch": 2.13840830449827, + "grad_norm": 0.0, + "learning_rate": 1.78183148246803e-05, + "loss": 0.4965, + "step": 927 + }, + { + "epoch": 2.1407151095732413, + "grad_norm": 0.0, + "learning_rate": 1.7813130244153648e-05, + "loss": 0.3964, + "step": 928 + }, + { + "epoch": 2.1430219146482123, + "grad_norm": 0.0, + "learning_rate": 1.7807940266766595e-05, + "loss": 0.4904, + "step": 929 + }, + { + "epoch": 2.1453287197231834, + "grad_norm": 0.0, + "learning_rate": 1.780274489610407e-05, + "loss": 0.3475, + "step": 930 + }, + { + "epoch": 2.1476355247981544, + "grad_norm": 0.0, + "learning_rate": 1.7797544135754744e-05, + "loss": 0.593, + "step": 931 + }, + { + "epoch": 2.1499423298731255, + "grad_norm": 0.0, + "learning_rate": 1.7792337989311e-05, + "loss": 0.4602, + "step": 932 + }, + { + "epoch": 2.152249134948097, + "grad_norm": 0.0, + "learning_rate": 1.778712646036894e-05, + "loss": 0.4539, + "step": 933 + }, + { + "epoch": 2.154555940023068, + "grad_norm": 0.0, + "learning_rate": 1.7781909552528395e-05, + "loss": 0.733, + "step": 934 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.0, + "learning_rate": 1.77766872693929e-05, + "loss": 0.5275, + "step": 935 + }, + { + "epoch": 2.1591695501730106, + "grad_norm": 0.0, + "learning_rate": 1.777145961456971e-05, + "loss": 0.5843, + "step": 936 + }, + { + "epoch": 2.1614763552479817, + "grad_norm": 0.0, + "learning_rate": 1.7766226591669787e-05, + "loss": 0.5257, + "step": 937 + }, + { + "epoch": 2.1637831603229527, + "grad_norm": 0.0, + "learning_rate": 1.7760988204307798e-05, + "loss": 0.7901, + "step": 938 + }, + { + "epoch": 2.1660899653979238, + "grad_norm": 0.0, + "learning_rate": 1.7755744456102123e-05, + "loss": 0.5024, + "step": 939 + }, + { + "epoch": 2.168396770472895, + "grad_norm": 0.0, + "learning_rate": 1.7750495350674844e-05, + "loss": 0.4521, + "step": 940 + }, + { + "epoch": 2.1707035755478663, + "grad_norm": 0.0, + "learning_rate": 1.7745240891651736e-05, + "loss": 0.4274, + "step": 941 + }, + { + "epoch": 2.1730103806228374, + "grad_norm": 0.0, + "learning_rate": 1.7739981082662275e-05, + "loss": 0.2473, + "step": 942 + }, + { + "epoch": 2.1753171856978084, + "grad_norm": 0.0, + "learning_rate": 1.7734715927339642e-05, + "loss": 0.5154, + "step": 943 + }, + { + "epoch": 2.17762399077278, + "grad_norm": 0.0, + "learning_rate": 1.7729445429320696e-05, + "loss": 0.2906, + "step": 944 + }, + { + "epoch": 2.179930795847751, + "grad_norm": 0.0, + "learning_rate": 1.7724169592245996e-05, + "loss": 0.5342, + "step": 945 + }, + { + "epoch": 2.182237600922722, + "grad_norm": 0.0, + "learning_rate": 1.771888841975979e-05, + "loss": 0.357, + "step": 946 + }, + { + "epoch": 2.184544405997693, + "grad_norm": 0.0, + "learning_rate": 1.771360191551e-05, + "loss": 0.4143, + "step": 947 + }, + { + "epoch": 2.1868512110726646, + "grad_norm": 0.0, + "learning_rate": 1.7708310083148242e-05, + "loss": 0.3994, + "step": 948 + }, + { + "epoch": 2.1891580161476356, + "grad_norm": 0.0, + "learning_rate": 1.7703012926329813e-05, + "loss": 0.925, + "step": 949 + }, + { + "epoch": 2.1914648212226067, + "grad_norm": 0.0, + "learning_rate": 1.769771044871368e-05, + "loss": 0.5006, + "step": 950 + }, + { + "epoch": 2.1937716262975777, + "grad_norm": 0.0, + "learning_rate": 1.769240265396249e-05, + "loss": 0.3427, + "step": 951 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.0, + "learning_rate": 1.768708954574256e-05, + "loss": 0.6347, + "step": 952 + }, + { + "epoch": 2.1983852364475203, + "grad_norm": 0.0, + "learning_rate": 1.7681771127723883e-05, + "loss": 0.5081, + "step": 953 + }, + { + "epoch": 2.2006920415224913, + "grad_norm": 0.0, + "learning_rate": 1.7676447403580114e-05, + "loss": 0.3524, + "step": 954 + }, + { + "epoch": 2.2029988465974624, + "grad_norm": 0.0, + "learning_rate": 1.7671118376988575e-05, + "loss": 0.3097, + "step": 955 + }, + { + "epoch": 2.205305651672434, + "grad_norm": 0.0, + "learning_rate": 1.766578405163025e-05, + "loss": 0.3783, + "step": 956 + }, + { + "epoch": 2.207612456747405, + "grad_norm": 0.0, + "learning_rate": 1.766044443118978e-05, + "loss": 0.6402, + "step": 957 + }, + { + "epoch": 2.209919261822376, + "grad_norm": 0.0, + "learning_rate": 1.7655099519355477e-05, + "loss": 0.5871, + "step": 958 + }, + { + "epoch": 2.212226066897347, + "grad_norm": 0.0, + "learning_rate": 1.764974931981929e-05, + "loss": 0.6701, + "step": 959 + }, + { + "epoch": 2.2145328719723185, + "grad_norm": 0.0, + "learning_rate": 1.7644393836276832e-05, + "loss": 0.5957, + "step": 960 + }, + { + "epoch": 2.2168396770472896, + "grad_norm": 0.0, + "learning_rate": 1.7639033072427367e-05, + "loss": 0.4668, + "step": 961 + }, + { + "epoch": 2.2191464821222606, + "grad_norm": 0.0, + "learning_rate": 1.7633667031973793e-05, + "loss": 0.4917, + "step": 962 + }, + { + "epoch": 2.2214532871972317, + "grad_norm": 0.0, + "learning_rate": 1.7628295718622666e-05, + "loss": 0.4671, + "step": 963 + }, + { + "epoch": 2.223760092272203, + "grad_norm": 0.0, + "learning_rate": 1.7622919136084183e-05, + "loss": 0.4348, + "step": 964 + }, + { + "epoch": 2.2260668973471742, + "grad_norm": 0.0, + "learning_rate": 1.761753728807217e-05, + "loss": 0.2565, + "step": 965 + }, + { + "epoch": 2.2283737024221453, + "grad_norm": 0.0, + "learning_rate": 1.7612150178304102e-05, + "loss": 0.5, + "step": 966 + }, + { + "epoch": 2.2306805074971163, + "grad_norm": 0.0, + "learning_rate": 1.760675781050109e-05, + "loss": 0.6313, + "step": 967 + }, + { + "epoch": 2.232987312572088, + "grad_norm": 0.0, + "learning_rate": 1.760136018838786e-05, + "loss": 0.334, + "step": 968 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.0, + "learning_rate": 1.7595957315692782e-05, + "loss": 0.5062, + "step": 969 + }, + { + "epoch": 2.23760092272203, + "grad_norm": 0.0, + "learning_rate": 1.7590549196147854e-05, + "loss": 0.4561, + "step": 970 + }, + { + "epoch": 2.239907727797001, + "grad_norm": 0.0, + "learning_rate": 1.7585135833488692e-05, + "loss": 0.3431, + "step": 971 + }, + { + "epoch": 2.2422145328719725, + "grad_norm": 0.0, + "learning_rate": 1.757971723145453e-05, + "loss": 0.4566, + "step": 972 + }, + { + "epoch": 2.2445213379469435, + "grad_norm": 0.0, + "learning_rate": 1.7574293393788236e-05, + "loss": 0.5182, + "step": 973 + }, + { + "epoch": 2.2468281430219146, + "grad_norm": 0.0, + "learning_rate": 1.7568864324236276e-05, + "loss": 0.4189, + "step": 974 + }, + { + "epoch": 2.2491349480968856, + "grad_norm": 0.0, + "learning_rate": 1.7563430026548737e-05, + "loss": 0.3915, + "step": 975 + }, + { + "epoch": 2.251441753171857, + "grad_norm": 0.0, + "learning_rate": 1.7557990504479328e-05, + "loss": 0.4564, + "step": 976 + }, + { + "epoch": 2.253748558246828, + "grad_norm": 0.0, + "learning_rate": 1.755254576178535e-05, + "loss": 0.5378, + "step": 977 + }, + { + "epoch": 2.2560553633217992, + "grad_norm": 0.0, + "learning_rate": 1.7547095802227723e-05, + "loss": 0.421, + "step": 978 + }, + { + "epoch": 2.2583621683967703, + "grad_norm": 0.0, + "learning_rate": 1.754164062957096e-05, + "loss": 0.4231, + "step": 979 + }, + { + "epoch": 2.260668973471742, + "grad_norm": 0.0, + "learning_rate": 1.7536180247583182e-05, + "loss": 0.3338, + "step": 980 + }, + { + "epoch": 2.262975778546713, + "grad_norm": 0.0, + "learning_rate": 1.7530714660036112e-05, + "loss": 0.4675, + "step": 981 + }, + { + "epoch": 2.265282583621684, + "grad_norm": 0.0, + "learning_rate": 1.7525243870705052e-05, + "loss": 0.4478, + "step": 982 + }, + { + "epoch": 2.267589388696655, + "grad_norm": 0.0, + "learning_rate": 1.751976788336892e-05, + "loss": 0.461, + "step": 983 + }, + { + "epoch": 2.2698961937716264, + "grad_norm": 0.0, + "learning_rate": 1.7514286701810203e-05, + "loss": 0.4818, + "step": 984 + }, + { + "epoch": 2.2722029988465975, + "grad_norm": 0.0, + "learning_rate": 1.7508800329814993e-05, + "loss": 0.5986, + "step": 985 + }, + { + "epoch": 2.2745098039215685, + "grad_norm": 0.0, + "learning_rate": 1.7503308771172955e-05, + "loss": 0.6687, + "step": 986 + }, + { + "epoch": 2.2768166089965396, + "grad_norm": 0.0, + "learning_rate": 1.7497812029677344e-05, + "loss": 0.7566, + "step": 987 + }, + { + "epoch": 2.279123414071511, + "grad_norm": 0.0, + "learning_rate": 1.7492310109124992e-05, + "loss": 0.5464, + "step": 988 + }, + { + "epoch": 2.281430219146482, + "grad_norm": 0.0, + "learning_rate": 1.74868030133163e-05, + "loss": 0.6083, + "step": 989 + }, + { + "epoch": 2.283737024221453, + "grad_norm": 0.0, + "learning_rate": 1.748129074605527e-05, + "loss": 0.5148, + "step": 990 + }, + { + "epoch": 2.2860438292964247, + "grad_norm": 0.0, + "learning_rate": 1.7475773311149448e-05, + "loss": 0.3181, + "step": 991 + }, + { + "epoch": 2.2883506343713957, + "grad_norm": 0.0, + "learning_rate": 1.7470250712409963e-05, + "loss": 0.5571, + "step": 992 + }, + { + "epoch": 2.290657439446367, + "grad_norm": 0.0, + "learning_rate": 1.7464722953651504e-05, + "loss": 0.4269, + "step": 993 + }, + { + "epoch": 2.292964244521338, + "grad_norm": 0.0, + "learning_rate": 1.7459190038692333e-05, + "loss": 0.4766, + "step": 994 + }, + { + "epoch": 2.295271049596309, + "grad_norm": 0.0, + "learning_rate": 1.7453651971354265e-05, + "loss": 0.5265, + "step": 995 + }, + { + "epoch": 2.2975778546712804, + "grad_norm": 0.0, + "learning_rate": 1.7448108755462684e-05, + "loss": 0.4346, + "step": 996 + }, + { + "epoch": 2.2998846597462514, + "grad_norm": 0.0, + "learning_rate": 1.7442560394846518e-05, + "loss": 0.5165, + "step": 997 + }, + { + "epoch": 2.3021914648212225, + "grad_norm": 0.0, + "learning_rate": 1.743700689333826e-05, + "loss": 0.4669, + "step": 998 + }, + { + "epoch": 2.304498269896194, + "grad_norm": 0.0, + "learning_rate": 1.7431448254773943e-05, + "loss": 0.3175, + "step": 999 + }, + { + "epoch": 2.306805074971165, + "grad_norm": 0.0, + "learning_rate": 1.742588448299316e-05, + "loss": 0.5314, + "step": 1000 + }, + { + "epoch": 2.309111880046136, + "grad_norm": 0.0, + "learning_rate": 1.7420315581839045e-05, + "loss": 0.5397, + "step": 1001 + }, + { + "epoch": 2.311418685121107, + "grad_norm": 0.0, + "learning_rate": 1.741474155515827e-05, + "loss": 0.4425, + "step": 1002 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.0, + "learning_rate": 1.7409162406801053e-05, + "loss": 0.5258, + "step": 1003 + }, + { + "epoch": 2.3160322952710497, + "grad_norm": 0.0, + "learning_rate": 1.7403578140621147e-05, + "loss": 0.5102, + "step": 1004 + }, + { + "epoch": 2.3183391003460208, + "grad_norm": 0.0, + "learning_rate": 1.7397988760475842e-05, + "loss": 0.3762, + "step": 1005 + }, + { + "epoch": 2.320645905420992, + "grad_norm": 0.0, + "learning_rate": 1.739239427022596e-05, + "loss": 0.3951, + "step": 1006 + }, + { + "epoch": 2.3229527104959633, + "grad_norm": 0.0, + "learning_rate": 1.738679467373586e-05, + "loss": 0.6036, + "step": 1007 + }, + { + "epoch": 2.3252595155709344, + "grad_norm": 0.0, + "learning_rate": 1.738118997487341e-05, + "loss": 0.5719, + "step": 1008 + }, + { + "epoch": 2.3275663206459054, + "grad_norm": 0.0, + "learning_rate": 1.7375580177510017e-05, + "loss": 0.6319, + "step": 1009 + }, + { + "epoch": 2.3298731257208765, + "grad_norm": 0.0, + "learning_rate": 1.7369965285520606e-05, + "loss": 0.5118, + "step": 1010 + }, + { + "epoch": 2.3321799307958475, + "grad_norm": 0.0, + "learning_rate": 1.736434530278362e-05, + "loss": 0.5066, + "step": 1011 + }, + { + "epoch": 2.334486735870819, + "grad_norm": 0.0, + "learning_rate": 1.7358720233181023e-05, + "loss": 0.5939, + "step": 1012 + }, + { + "epoch": 2.33679354094579, + "grad_norm": 0.0, + "learning_rate": 1.735309008059829e-05, + "loss": 0.7063, + "step": 1013 + }, + { + "epoch": 2.339100346020761, + "grad_norm": 0.0, + "learning_rate": 1.73474548489244e-05, + "loss": 0.3889, + "step": 1014 + }, + { + "epoch": 2.3414071510957326, + "grad_norm": 0.0, + "learning_rate": 1.7341814542051845e-05, + "loss": 0.4144, + "step": 1015 + }, + { + "epoch": 2.3437139561707037, + "grad_norm": 0.0, + "learning_rate": 1.7336169163876637e-05, + "loss": 0.467, + "step": 1016 + }, + { + "epoch": 2.3460207612456747, + "grad_norm": 0.0, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.6608, + "step": 1017 + }, + { + "epoch": 2.3483275663206458, + "grad_norm": 0.0, + "learning_rate": 1.7324863209219736e-05, + "loss": 0.577, + "step": 1018 + }, + { + "epoch": 2.3506343713956173, + "grad_norm": 0.0, + "learning_rate": 1.7319202640547552e-05, + "loss": 0.5533, + "step": 1019 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.0, + "learning_rate": 1.7313537016191706e-05, + "loss": 0.4271, + "step": 1020 + }, + { + "epoch": 2.3552479815455594, + "grad_norm": 0.0, + "learning_rate": 1.7307866340065684e-05, + "loss": 0.4294, + "step": 1021 + }, + { + "epoch": 2.3575547866205304, + "grad_norm": 0.0, + "learning_rate": 1.7302190616086464e-05, + "loss": 0.4588, + "step": 1022 + }, + { + "epoch": 2.359861591695502, + "grad_norm": 0.0, + "learning_rate": 1.729650984817451e-05, + "loss": 0.5191, + "step": 1023 + }, + { + "epoch": 2.362168396770473, + "grad_norm": 0.0, + "learning_rate": 1.729082404025377e-05, + "loss": 0.6365, + "step": 1024 + }, + { + "epoch": 2.364475201845444, + "grad_norm": 0.0, + "learning_rate": 1.7285133196251664e-05, + "loss": 0.5164, + "step": 1025 + }, + { + "epoch": 2.366782006920415, + "grad_norm": 0.0, + "learning_rate": 1.727943732009911e-05, + "loss": 0.3322, + "step": 1026 + }, + { + "epoch": 2.3690888119953866, + "grad_norm": 0.0, + "learning_rate": 1.7273736415730488e-05, + "loss": 0.6997, + "step": 1027 + }, + { + "epoch": 2.3713956170703576, + "grad_norm": 0.0, + "learning_rate": 1.7268030487083654e-05, + "loss": 0.3441, + "step": 1028 + }, + { + "epoch": 2.3737024221453287, + "grad_norm": 0.0, + "learning_rate": 1.726231953809993e-05, + "loss": 0.4374, + "step": 1029 + }, + { + "epoch": 2.3760092272202997, + "grad_norm": 0.0, + "learning_rate": 1.725660357272412e-05, + "loss": 0.4929, + "step": 1030 + }, + { + "epoch": 2.378316032295271, + "grad_norm": 0.0, + "learning_rate": 1.725088259490448e-05, + "loss": 0.371, + "step": 1031 + }, + { + "epoch": 2.3806228373702423, + "grad_norm": 0.0, + "learning_rate": 1.7245156608592727e-05, + "loss": 0.2966, + "step": 1032 + }, + { + "epoch": 2.3829296424452133, + "grad_norm": 0.0, + "learning_rate": 1.723942561774405e-05, + "loss": 0.3369, + "step": 1033 + }, + { + "epoch": 2.3852364475201844, + "grad_norm": 0.0, + "learning_rate": 1.723368962631708e-05, + "loss": 0.413, + "step": 1034 + }, + { + "epoch": 2.387543252595156, + "grad_norm": 0.0, + "learning_rate": 1.7227948638273918e-05, + "loss": 0.4057, + "step": 1035 + }, + { + "epoch": 2.389850057670127, + "grad_norm": 0.0, + "learning_rate": 1.72222026575801e-05, + "loss": 0.319, + "step": 1036 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.0, + "learning_rate": 1.7216451688204623e-05, + "loss": 0.6871, + "step": 1037 + }, + { + "epoch": 2.394463667820069, + "grad_norm": 0.0, + "learning_rate": 1.7210695734119926e-05, + "loss": 0.5843, + "step": 1038 + }, + { + "epoch": 2.3967704728950405, + "grad_norm": 0.0, + "learning_rate": 1.7204934799301883e-05, + "loss": 0.6087, + "step": 1039 + }, + { + "epoch": 2.3990772779700116, + "grad_norm": 0.0, + "learning_rate": 1.719916888772983e-05, + "loss": 0.3969, + "step": 1040 + }, + { + "epoch": 2.4013840830449826, + "grad_norm": 0.0, + "learning_rate": 1.7193398003386514e-05, + "loss": 0.486, + "step": 1041 + }, + { + "epoch": 2.4036908881199537, + "grad_norm": 0.0, + "learning_rate": 1.718762215025813e-05, + "loss": 0.4047, + "step": 1042 + }, + { + "epoch": 2.405997693194925, + "grad_norm": 0.0, + "learning_rate": 1.718184133233432e-05, + "loss": 0.7963, + "step": 1043 + }, + { + "epoch": 2.4083044982698962, + "grad_norm": 0.0, + "learning_rate": 1.717605555360812e-05, + "loss": 0.3697, + "step": 1044 + }, + { + "epoch": 2.4106113033448673, + "grad_norm": 0.0, + "learning_rate": 1.7170264818076027e-05, + "loss": 0.3917, + "step": 1045 + }, + { + "epoch": 2.4129181084198383, + "grad_norm": 0.0, + "learning_rate": 1.7164469129737936e-05, + "loss": 0.56, + "step": 1046 + }, + { + "epoch": 2.41522491349481, + "grad_norm": 0.0, + "learning_rate": 1.7158668492597186e-05, + "loss": 0.4465, + "step": 1047 + }, + { + "epoch": 2.417531718569781, + "grad_norm": 0.0, + "learning_rate": 1.7152862910660516e-05, + "loss": 0.5596, + "step": 1048 + }, + { + "epoch": 2.419838523644752, + "grad_norm": 0.0, + "learning_rate": 1.7147052387938094e-05, + "loss": 0.4819, + "step": 1049 + }, + { + "epoch": 2.422145328719723, + "grad_norm": 0.0, + "learning_rate": 1.7141236928443482e-05, + "loss": 0.3809, + "step": 1050 + }, + { + "epoch": 2.4244521337946945, + "grad_norm": 0.0, + "learning_rate": 1.7135416536193678e-05, + "loss": 0.5057, + "step": 1051 + }, + { + "epoch": 2.4267589388696655, + "grad_norm": 0.0, + "learning_rate": 1.712959121520907e-05, + "loss": 0.4327, + "step": 1052 + }, + { + "epoch": 2.4290657439446366, + "grad_norm": 0.0, + "learning_rate": 1.712376096951345e-05, + "loss": 0.4292, + "step": 1053 + }, + { + "epoch": 2.431372549019608, + "grad_norm": 0.0, + "learning_rate": 1.7117925803134017e-05, + "loss": 0.533, + "step": 1054 + }, + { + "epoch": 2.433679354094579, + "grad_norm": 0.0, + "learning_rate": 1.711208572010137e-05, + "loss": 0.4858, + "step": 1055 + }, + { + "epoch": 2.43598615916955, + "grad_norm": 0.0, + "learning_rate": 1.7106240724449507e-05, + "loss": 0.4681, + "step": 1056 + }, + { + "epoch": 2.4382929642445212, + "grad_norm": 0.0, + "learning_rate": 1.7100390820215805e-05, + "loss": 0.6602, + "step": 1057 + }, + { + "epoch": 2.4405997693194923, + "grad_norm": 0.0, + "learning_rate": 1.7094536011441046e-05, + "loss": 0.6562, + "step": 1058 + }, + { + "epoch": 2.442906574394464, + "grad_norm": 0.0, + "learning_rate": 1.7088676302169394e-05, + "loss": 0.3111, + "step": 1059 + }, + { + "epoch": 2.445213379469435, + "grad_norm": 0.0, + "learning_rate": 1.7082811696448397e-05, + "loss": 0.4315, + "step": 1060 + }, + { + "epoch": 2.447520184544406, + "grad_norm": 0.0, + "learning_rate": 1.7076942198328987e-05, + "loss": 0.4883, + "step": 1061 + }, + { + "epoch": 2.4498269896193774, + "grad_norm": 0.0, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.6466, + "step": 1062 + }, + { + "epoch": 2.4521337946943484, + "grad_norm": 0.0, + "learning_rate": 1.7065188541115554e-05, + "loss": 0.4887, + "step": 1063 + }, + { + "epoch": 2.4544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.705930439014028e-05, + "loss": 0.6299, + "step": 1064 + }, + { + "epoch": 2.4567474048442905, + "grad_norm": 0.0, + "learning_rate": 1.705341536300409e-05, + "loss": 0.3956, + "step": 1065 + }, + { + "epoch": 2.4590542099192616, + "grad_norm": 0.0, + "learning_rate": 1.704752146377478e-05, + "loss": 0.4812, + "step": 1066 + }, + { + "epoch": 2.461361014994233, + "grad_norm": 0.0, + "learning_rate": 1.704162269652352e-05, + "loss": 0.4718, + "step": 1067 + }, + { + "epoch": 2.463667820069204, + "grad_norm": 0.0, + "learning_rate": 1.7035719065324837e-05, + "loss": 0.537, + "step": 1068 + }, + { + "epoch": 2.465974625144175, + "grad_norm": 0.0, + "learning_rate": 1.702981057425662e-05, + "loss": 0.3821, + "step": 1069 + }, + { + "epoch": 2.4682814302191467, + "grad_norm": 0.0, + "learning_rate": 1.7023897227400113e-05, + "loss": 0.3955, + "step": 1070 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.0, + "learning_rate": 1.7017979028839918e-05, + "loss": 0.4731, + "step": 1071 + }, + { + "epoch": 2.472895040369089, + "grad_norm": 0.0, + "learning_rate": 1.701205598266398e-05, + "loss": 0.401, + "step": 1072 + }, + { + "epoch": 2.47520184544406, + "grad_norm": 0.0, + "learning_rate": 1.7006128092963604e-05, + "loss": 0.4288, + "step": 1073 + }, + { + "epoch": 2.477508650519031, + "grad_norm": 0.0, + "learning_rate": 1.7000195363833434e-05, + "loss": 0.5251, + "step": 1074 + }, + { + "epoch": 2.4798154555940024, + "grad_norm": 0.0, + "learning_rate": 1.6994257799371457e-05, + "loss": 0.4595, + "step": 1075 + }, + { + "epoch": 2.4821222606689735, + "grad_norm": 0.0, + "learning_rate": 1.6988315403679e-05, + "loss": 0.3999, + "step": 1076 + }, + { + "epoch": 2.4844290657439445, + "grad_norm": 0.0, + "learning_rate": 1.698236818086073e-05, + "loss": 0.4639, + "step": 1077 + }, + { + "epoch": 2.486735870818916, + "grad_norm": 0.0, + "learning_rate": 1.697641613502464e-05, + "loss": 0.4915, + "step": 1078 + }, + { + "epoch": 2.489042675893887, + "grad_norm": 0.0, + "learning_rate": 1.6970459270282068e-05, + "loss": 0.273, + "step": 1079 + }, + { + "epoch": 2.491349480968858, + "grad_norm": 0.0, + "learning_rate": 1.696449759074767e-05, + "loss": 0.3822, + "step": 1080 + }, + { + "epoch": 2.493656286043829, + "grad_norm": 0.0, + "learning_rate": 1.6958531100539428e-05, + "loss": 0.4648, + "step": 1081 + }, + { + "epoch": 2.4959630911188, + "grad_norm": 0.0, + "learning_rate": 1.6952559803778656e-05, + "loss": 0.3983, + "step": 1082 + }, + { + "epoch": 2.4982698961937717, + "grad_norm": 0.0, + "learning_rate": 1.6946583704589973e-05, + "loss": 0.5093, + "step": 1083 + }, + { + "epoch": 2.5005767012687428, + "grad_norm": 0.0, + "learning_rate": 1.6940602807101335e-05, + "loss": 0.6034, + "step": 1084 + }, + { + "epoch": 2.502883506343714, + "grad_norm": 0.0, + "learning_rate": 1.6934617115443993e-05, + "loss": 0.3113, + "step": 1085 + }, + { + "epoch": 2.5051903114186853, + "grad_norm": 0.0, + "learning_rate": 1.692862663375252e-05, + "loss": 0.652, + "step": 1086 + }, + { + "epoch": 2.5074971164936564, + "grad_norm": 0.0, + "learning_rate": 1.6922631366164795e-05, + "loss": 0.4327, + "step": 1087 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.0, + "learning_rate": 1.6916631316822013e-05, + "loss": 0.4376, + "step": 1088 + }, + { + "epoch": 2.5121107266435985, + "grad_norm": 0.0, + "learning_rate": 1.691062648986865e-05, + "loss": 0.3329, + "step": 1089 + }, + { + "epoch": 2.5144175317185695, + "grad_norm": 0.0, + "learning_rate": 1.6904616889452497e-05, + "loss": 0.5643, + "step": 1090 + }, + { + "epoch": 2.516724336793541, + "grad_norm": 0.0, + "learning_rate": 1.6898602519724647e-05, + "loss": 0.5856, + "step": 1091 + }, + { + "epoch": 2.519031141868512, + "grad_norm": 0.0, + "learning_rate": 1.689258338483947e-05, + "loss": 0.5064, + "step": 1092 + }, + { + "epoch": 2.521337946943483, + "grad_norm": 0.0, + "learning_rate": 1.6886559488954647e-05, + "loss": 0.3837, + "step": 1093 + }, + { + "epoch": 2.5236447520184546, + "grad_norm": 0.0, + "learning_rate": 1.6880530836231137e-05, + "loss": 0.3516, + "step": 1094 + }, + { + "epoch": 2.5259515570934257, + "grad_norm": 0.0, + "learning_rate": 1.6874497430833182e-05, + "loss": 0.3592, + "step": 1095 + }, + { + "epoch": 2.5282583621683967, + "grad_norm": 0.0, + "learning_rate": 1.6868459276928312e-05, + "loss": 0.4723, + "step": 1096 + }, + { + "epoch": 2.5305651672433678, + "grad_norm": 0.0, + "learning_rate": 1.686241637868734e-05, + "loss": 0.2838, + "step": 1097 + }, + { + "epoch": 2.532871972318339, + "grad_norm": 0.0, + "learning_rate": 1.6856368740284342e-05, + "loss": 0.5495, + "step": 1098 + }, + { + "epoch": 2.5351787773933103, + "grad_norm": 0.0, + "learning_rate": 1.6850316365896692e-05, + "loss": 0.4764, + "step": 1099 + }, + { + "epoch": 2.5374855824682814, + "grad_norm": 0.0, + "learning_rate": 1.684425925970501e-05, + "loss": 0.5037, + "step": 1100 + }, + { + "epoch": 2.539792387543253, + "grad_norm": 0.0, + "learning_rate": 1.68381974258932e-05, + "loss": 0.5278, + "step": 1101 + }, + { + "epoch": 2.542099192618224, + "grad_norm": 0.0, + "learning_rate": 1.683213086864843e-05, + "loss": 0.5424, + "step": 1102 + }, + { + "epoch": 2.544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.6826059592161136e-05, + "loss": 0.6143, + "step": 1103 + }, + { + "epoch": 2.546712802768166, + "grad_norm": 0.0, + "learning_rate": 1.6819983600624986e-05, + "loss": 0.6507, + "step": 1104 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.0, + "learning_rate": 1.681390289823694e-05, + "loss": 0.3845, + "step": 1105 + }, + { + "epoch": 2.5513264129181086, + "grad_norm": 0.0, + "learning_rate": 1.6807817489197192e-05, + "loss": 0.4351, + "step": 1106 + }, + { + "epoch": 2.5536332179930796, + "grad_norm": 0.0, + "learning_rate": 1.6801727377709195e-05, + "loss": 0.4947, + "step": 1107 + }, + { + "epoch": 2.5559400230680507, + "grad_norm": 0.0, + "learning_rate": 1.6795632567979643e-05, + "loss": 0.3902, + "step": 1108 + }, + { + "epoch": 2.558246828143022, + "grad_norm": 0.0, + "learning_rate": 1.6789533064218487e-05, + "loss": 0.4433, + "step": 1109 + }, + { + "epoch": 2.5605536332179932, + "grad_norm": 0.0, + "learning_rate": 1.6783428870638904e-05, + "loss": 0.5687, + "step": 1110 + }, + { + "epoch": 2.5628604382929643, + "grad_norm": 0.0, + "learning_rate": 1.6777319991457325e-05, + "loss": 0.5395, + "step": 1111 + }, + { + "epoch": 2.5651672433679353, + "grad_norm": 0.0, + "learning_rate": 1.6771206430893408e-05, + "loss": 0.6602, + "step": 1112 + }, + { + "epoch": 2.5674740484429064, + "grad_norm": 0.0, + "learning_rate": 1.6765088193170055e-05, + "loss": 0.4511, + "step": 1113 + }, + { + "epoch": 2.569780853517878, + "grad_norm": 0.0, + "learning_rate": 1.6758965282513383e-05, + "loss": 0.2627, + "step": 1114 + }, + { + "epoch": 2.572087658592849, + "grad_norm": 0.0, + "learning_rate": 1.6752837703152754e-05, + "loss": 0.4823, + "step": 1115 + }, + { + "epoch": 2.57439446366782, + "grad_norm": 0.0, + "learning_rate": 1.6746705459320746e-05, + "loss": 0.4651, + "step": 1116 + }, + { + "epoch": 2.5767012687427915, + "grad_norm": 0.0, + "learning_rate": 1.6740568555253153e-05, + "loss": 0.7135, + "step": 1117 + }, + { + "epoch": 2.5790080738177625, + "grad_norm": 0.0, + "learning_rate": 1.6734426995189003e-05, + "loss": 0.4316, + "step": 1118 + }, + { + "epoch": 2.5813148788927336, + "grad_norm": 0.0, + "learning_rate": 1.672828078337053e-05, + "loss": 0.473, + "step": 1119 + }, + { + "epoch": 2.5836216839677046, + "grad_norm": 0.0, + "learning_rate": 1.6722129924043184e-05, + "loss": 0.392, + "step": 1120 + }, + { + "epoch": 2.5859284890426757, + "grad_norm": 0.0, + "learning_rate": 1.6715974421455615e-05, + "loss": 0.3433, + "step": 1121 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.0, + "learning_rate": 1.67098142798597e-05, + "loss": 0.6067, + "step": 1122 + }, + { + "epoch": 2.5905420991926182, + "grad_norm": 0.0, + "learning_rate": 1.6703649503510514e-05, + "loss": 0.3693, + "step": 1123 + }, + { + "epoch": 2.5928489042675893, + "grad_norm": 0.0, + "learning_rate": 1.6697480096666313e-05, + "loss": 0.6032, + "step": 1124 + }, + { + "epoch": 2.595155709342561, + "grad_norm": 0.0, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.3967, + "step": 1125 + }, + { + "epoch": 2.597462514417532, + "grad_norm": 0.0, + "learning_rate": 1.6685127408541986e-05, + "loss": 0.483, + "step": 1126 + }, + { + "epoch": 2.599769319492503, + "grad_norm": 0.0, + "learning_rate": 1.6678944135794375e-05, + "loss": 0.4309, + "step": 1127 + }, + { + "epoch": 2.602076124567474, + "grad_norm": 0.0, + "learning_rate": 1.667275624961681e-05, + "loss": 0.4103, + "step": 1128 + }, + { + "epoch": 2.604382929642445, + "grad_norm": 0.0, + "learning_rate": 1.6666563754283517e-05, + "loss": 0.4266, + "step": 1129 + }, + { + "epoch": 2.6066897347174165, + "grad_norm": 0.0, + "learning_rate": 1.6660366654071917e-05, + "loss": 0.5928, + "step": 1130 + }, + { + "epoch": 2.6089965397923875, + "grad_norm": 0.0, + "learning_rate": 1.6654164953262614e-05, + "loss": 0.504, + "step": 1131 + }, + { + "epoch": 2.6113033448673586, + "grad_norm": 0.0, + "learning_rate": 1.6647958656139377e-05, + "loss": 0.4914, + "step": 1132 + }, + { + "epoch": 2.61361014994233, + "grad_norm": 0.0, + "learning_rate": 1.6641747766989173e-05, + "loss": 0.688, + "step": 1133 + }, + { + "epoch": 2.615916955017301, + "grad_norm": 0.0, + "learning_rate": 1.6635532290102114e-05, + "loss": 0.5327, + "step": 1134 + }, + { + "epoch": 2.618223760092272, + "grad_norm": 0.0, + "learning_rate": 1.6629312229771497e-05, + "loss": 0.3177, + "step": 1135 + }, + { + "epoch": 2.6205305651672433, + "grad_norm": 0.0, + "learning_rate": 1.6623087590293786e-05, + "loss": 0.5426, + "step": 1136 + }, + { + "epoch": 2.6228373702422143, + "grad_norm": 0.0, + "learning_rate": 1.6616858375968596e-05, + "loss": 0.3675, + "step": 1137 + }, + { + "epoch": 2.625144175317186, + "grad_norm": 0.0, + "learning_rate": 1.6610624591098716e-05, + "loss": 0.4293, + "step": 1138 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.0, + "learning_rate": 1.6604386239990077e-05, + "loss": 0.4405, + "step": 1139 + }, + { + "epoch": 2.629757785467128, + "grad_norm": 0.0, + "learning_rate": 1.6598143326951784e-05, + "loss": 0.4093, + "step": 1140 + }, + { + "epoch": 2.6320645905420994, + "grad_norm": 0.0, + "learning_rate": 1.6591895856296075e-05, + "loss": 0.486, + "step": 1141 + }, + { + "epoch": 2.6343713956170705, + "grad_norm": 0.0, + "learning_rate": 1.6585643832338342e-05, + "loss": 0.4883, + "step": 1142 + }, + { + "epoch": 2.6366782006920415, + "grad_norm": 0.0, + "learning_rate": 1.657938725939713e-05, + "loss": 0.343, + "step": 1143 + }, + { + "epoch": 2.6389850057670126, + "grad_norm": 0.0, + "learning_rate": 1.6573126141794108e-05, + "loss": 0.4574, + "step": 1144 + }, + { + "epoch": 2.6412918108419836, + "grad_norm": 0.0, + "learning_rate": 1.6566860483854106e-05, + "loss": 0.6207, + "step": 1145 + }, + { + "epoch": 2.643598615916955, + "grad_norm": 0.0, + "learning_rate": 1.6560590289905074e-05, + "loss": 0.4351, + "step": 1146 + }, + { + "epoch": 2.645905420991926, + "grad_norm": 0.0, + "learning_rate": 1.6554315564278102e-05, + "loss": 0.5095, + "step": 1147 + }, + { + "epoch": 2.648212226066897, + "grad_norm": 0.0, + "learning_rate": 1.654803631130741e-05, + "loss": 0.3427, + "step": 1148 + }, + { + "epoch": 2.6505190311418687, + "grad_norm": 0.0, + "learning_rate": 1.6541752535330345e-05, + "loss": 0.494, + "step": 1149 + }, + { + "epoch": 2.6528258362168398, + "grad_norm": 0.0, + "learning_rate": 1.6535464240687376e-05, + "loss": 0.3953, + "step": 1150 + }, + { + "epoch": 2.655132641291811, + "grad_norm": 0.0, + "learning_rate": 1.6529171431722097e-05, + "loss": 0.351, + "step": 1151 + }, + { + "epoch": 2.657439446366782, + "grad_norm": 0.0, + "learning_rate": 1.6522874112781213e-05, + "loss": 0.5334, + "step": 1152 + }, + { + "epoch": 2.659746251441753, + "grad_norm": 0.0, + "learning_rate": 1.6516572288214555e-05, + "loss": 0.4919, + "step": 1153 + }, + { + "epoch": 2.6620530565167244, + "grad_norm": 0.0, + "learning_rate": 1.6510265962375054e-05, + "loss": 0.4449, + "step": 1154 + }, + { + "epoch": 2.6643598615916955, + "grad_norm": 0.0, + "learning_rate": 1.6503955139618765e-05, + "loss": 0.4589, + "step": 1155 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.0, + "learning_rate": 1.6497639824304833e-05, + "loss": 0.4384, + "step": 1156 + }, + { + "epoch": 2.668973471741638, + "grad_norm": 0.0, + "learning_rate": 1.649132002079552e-05, + "loss": 0.6397, + "step": 1157 + }, + { + "epoch": 2.671280276816609, + "grad_norm": 0.0, + "learning_rate": 1.6484995733456178e-05, + "loss": 0.4139, + "step": 1158 + }, + { + "epoch": 2.67358708189158, + "grad_norm": 0.0, + "learning_rate": 1.6478666966655266e-05, + "loss": 0.5623, + "step": 1159 + }, + { + "epoch": 2.675893886966551, + "grad_norm": 0.0, + "learning_rate": 1.6472333724764326e-05, + "loss": 0.6482, + "step": 1160 + }, + { + "epoch": 2.6782006920415222, + "grad_norm": 0.0, + "learning_rate": 1.6465996012157996e-05, + "loss": 0.3822, + "step": 1161 + }, + { + "epoch": 2.6805074971164937, + "grad_norm": 0.0, + "learning_rate": 1.645965383321401e-05, + "loss": 0.4891, + "step": 1162 + }, + { + "epoch": 2.6828143021914648, + "grad_norm": 0.0, + "learning_rate": 1.6453307192313176e-05, + "loss": 0.5097, + "step": 1163 + }, + { + "epoch": 2.685121107266436, + "grad_norm": 0.0, + "learning_rate": 1.6446956093839385e-05, + "loss": 0.3205, + "step": 1164 + }, + { + "epoch": 2.6874279123414073, + "grad_norm": 0.0, + "learning_rate": 1.6440600542179613e-05, + "loss": 0.456, + "step": 1165 + }, + { + "epoch": 2.6897347174163784, + "grad_norm": 0.0, + "learning_rate": 1.6434240541723908e-05, + "loss": 0.4557, + "step": 1166 + }, + { + "epoch": 2.6920415224913494, + "grad_norm": 0.0, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.5876, + "step": 1167 + }, + { + "epoch": 2.6943483275663205, + "grad_norm": 0.0, + "learning_rate": 1.6421507212000262e-05, + "loss": 0.4488, + "step": 1168 + }, + { + "epoch": 2.696655132641292, + "grad_norm": 0.0, + "learning_rate": 1.641513389152777e-05, + "loss": 0.6073, + "step": 1169 + }, + { + "epoch": 2.698961937716263, + "grad_norm": 0.0, + "learning_rate": 1.6408756139850243e-05, + "loss": 0.482, + "step": 1170 + }, + { + "epoch": 2.701268742791234, + "grad_norm": 0.0, + "learning_rate": 1.640237396137306e-05, + "loss": 0.6577, + "step": 1171 + }, + { + "epoch": 2.7035755478662056, + "grad_norm": 0.0, + "learning_rate": 1.6395987360504667e-05, + "loss": 0.5384, + "step": 1172 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.0, + "learning_rate": 1.638959634165656e-05, + "loss": 0.5178, + "step": 1173 + }, + { + "epoch": 2.7081891580161477, + "grad_norm": 0.0, + "learning_rate": 1.6383200909243285e-05, + "loss": 0.5686, + "step": 1174 + }, + { + "epoch": 2.7104959630911187, + "grad_norm": 0.0, + "learning_rate": 1.6376801067682433e-05, + "loss": 0.4074, + "step": 1175 + }, + { + "epoch": 2.71280276816609, + "grad_norm": 0.0, + "learning_rate": 1.637039682139466e-05, + "loss": 0.7056, + "step": 1176 + }, + { + "epoch": 2.7151095732410613, + "grad_norm": 0.0, + "learning_rate": 1.6363988174803638e-05, + "loss": 0.4399, + "step": 1177 + }, + { + "epoch": 2.7174163783160323, + "grad_norm": 0.0, + "learning_rate": 1.6357575132336093e-05, + "loss": 0.5103, + "step": 1178 + }, + { + "epoch": 2.7197231833910034, + "grad_norm": 0.0, + "learning_rate": 1.635115769842179e-05, + "loss": 0.5517, + "step": 1179 + }, + { + "epoch": 2.722029988465975, + "grad_norm": 0.0, + "learning_rate": 1.6344735877493518e-05, + "loss": 0.5515, + "step": 1180 + }, + { + "epoch": 2.724336793540946, + "grad_norm": 0.0, + "learning_rate": 1.63383096739871e-05, + "loss": 0.3965, + "step": 1181 + }, + { + "epoch": 2.726643598615917, + "grad_norm": 0.0, + "learning_rate": 1.6331879092341402e-05, + "loss": 0.5723, + "step": 1182 + }, + { + "epoch": 2.728950403690888, + "grad_norm": 0.0, + "learning_rate": 1.6325444136998277e-05, + "loss": 0.3967, + "step": 1183 + }, + { + "epoch": 2.731257208765859, + "grad_norm": 0.0, + "learning_rate": 1.6319004812402637e-05, + "loss": 0.3694, + "step": 1184 + }, + { + "epoch": 2.7335640138408306, + "grad_norm": 0.0, + "learning_rate": 1.631256112300239e-05, + "loss": 0.5395, + "step": 1185 + }, + { + "epoch": 2.7358708189158016, + "grad_norm": 0.0, + "learning_rate": 1.630611307324847e-05, + "loss": 0.6903, + "step": 1186 + }, + { + "epoch": 2.7381776239907727, + "grad_norm": 0.0, + "learning_rate": 1.6299660667594814e-05, + "loss": 0.5268, + "step": 1187 + }, + { + "epoch": 2.740484429065744, + "grad_norm": 0.0, + "learning_rate": 1.6293203910498375e-05, + "loss": 0.584, + "step": 1188 + }, + { + "epoch": 2.7427912341407152, + "grad_norm": 0.0, + "learning_rate": 1.628674280641911e-05, + "loss": 0.4556, + "step": 1189 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 0.0, + "learning_rate": 1.6280277359819973e-05, + "loss": 0.5736, + "step": 1190 + }, + { + "epoch": 2.7474048442906573, + "grad_norm": 0.0, + "learning_rate": 1.6273807575166927e-05, + "loss": 0.3524, + "step": 1191 + }, + { + "epoch": 2.7497116493656284, + "grad_norm": 0.0, + "learning_rate": 1.626733345692892e-05, + "loss": 0.6067, + "step": 1192 + }, + { + "epoch": 2.7520184544406, + "grad_norm": 0.0, + "learning_rate": 1.6260855009577912e-05, + "loss": 0.4256, + "step": 1193 + }, + { + "epoch": 2.754325259515571, + "grad_norm": 0.0, + "learning_rate": 1.625437223758883e-05, + "loss": 0.496, + "step": 1194 + }, + { + "epoch": 2.756632064590542, + "grad_norm": 0.0, + "learning_rate": 1.6247885145439602e-05, + "loss": 0.3951, + "step": 1195 + }, + { + "epoch": 2.7589388696655135, + "grad_norm": 0.0, + "learning_rate": 1.624139373761114e-05, + "loss": 0.2879, + "step": 1196 + }, + { + "epoch": 2.7612456747404845, + "grad_norm": 0.0, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.3298, + "step": 1197 + }, + { + "epoch": 2.7635524798154556, + "grad_norm": 0.0, + "learning_rate": 1.6228397992855053e-05, + "loss": 0.4574, + "step": 1198 + }, + { + "epoch": 2.7658592848904267, + "grad_norm": 0.0, + "learning_rate": 1.6221893664904142e-05, + "loss": 0.6728, + "step": 1199 + }, + { + "epoch": 2.7681660899653977, + "grad_norm": 0.0, + "learning_rate": 1.621538503922741e-05, + "loss": 0.426, + "step": 1200 + }, + { + "epoch": 2.770472895040369, + "grad_norm": 0.0, + "learning_rate": 1.6208872120320647e-05, + "loss": 0.5025, + "step": 1201 + }, + { + "epoch": 2.7727797001153403, + "grad_norm": 0.0, + "learning_rate": 1.6202354912682602e-05, + "loss": 0.4905, + "step": 1202 + }, + { + "epoch": 2.7750865051903113, + "grad_norm": 0.0, + "learning_rate": 1.6195833420814983e-05, + "loss": 0.4269, + "step": 1203 + }, + { + "epoch": 2.777393310265283, + "grad_norm": 0.0, + "learning_rate": 1.6189307649222463e-05, + "loss": 0.3785, + "step": 1204 + }, + { + "epoch": 2.779700115340254, + "grad_norm": 0.0, + "learning_rate": 1.618277760241267e-05, + "loss": 0.436, + "step": 1205 + }, + { + "epoch": 2.782006920415225, + "grad_norm": 0.0, + "learning_rate": 1.617624328489618e-05, + "loss": 0.4231, + "step": 1206 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.0, + "learning_rate": 1.6169704701186528e-05, + "loss": 0.3647, + "step": 1207 + }, + { + "epoch": 2.786620530565167, + "grad_norm": 0.0, + "learning_rate": 1.616316185580019e-05, + "loss": 0.4274, + "step": 1208 + }, + { + "epoch": 2.7889273356401385, + "grad_norm": 0.0, + "learning_rate": 1.6156614753256583e-05, + "loss": 0.5199, + "step": 1209 + }, + { + "epoch": 2.7912341407151096, + "grad_norm": 0.0, + "learning_rate": 1.6150063398078074e-05, + "loss": 0.4447, + "step": 1210 + }, + { + "epoch": 2.7935409457900806, + "grad_norm": 0.0, + "learning_rate": 1.6143507794789962e-05, + "loss": 0.5074, + "step": 1211 + }, + { + "epoch": 2.795847750865052, + "grad_norm": 0.0, + "learning_rate": 1.6136947947920477e-05, + "loss": 0.4375, + "step": 1212 + }, + { + "epoch": 2.798154555940023, + "grad_norm": 0.0, + "learning_rate": 1.6130383862000783e-05, + "loss": 0.479, + "step": 1213 + }, + { + "epoch": 2.800461361014994, + "grad_norm": 0.0, + "learning_rate": 1.6123815541564973e-05, + "loss": 0.5022, + "step": 1214 + }, + { + "epoch": 2.8027681660899653, + "grad_norm": 0.0, + "learning_rate": 1.6117242991150064e-05, + "loss": 0.3574, + "step": 1215 + }, + { + "epoch": 2.8050749711649363, + "grad_norm": 0.0, + "learning_rate": 1.6110666215296e-05, + "loss": 0.424, + "step": 1216 + }, + { + "epoch": 2.807381776239908, + "grad_norm": 0.0, + "learning_rate": 1.6104085218545633e-05, + "loss": 0.537, + "step": 1217 + }, + { + "epoch": 2.809688581314879, + "grad_norm": 0.0, + "learning_rate": 1.609750000544474e-05, + "loss": 0.482, + "step": 1218 + }, + { + "epoch": 2.81199538638985, + "grad_norm": 0.0, + "learning_rate": 1.6090910580542006e-05, + "loss": 0.4754, + "step": 1219 + }, + { + "epoch": 2.8143021914648214, + "grad_norm": 0.0, + "learning_rate": 1.6084316948389027e-05, + "loss": 0.4355, + "step": 1220 + }, + { + "epoch": 2.8166089965397925, + "grad_norm": 0.0, + "learning_rate": 1.6077719113540303e-05, + "loss": 0.4877, + "step": 1221 + }, + { + "epoch": 2.8189158016147635, + "grad_norm": 0.0, + "learning_rate": 1.6071117080553236e-05, + "loss": 0.3583, + "step": 1222 + }, + { + "epoch": 2.8212226066897346, + "grad_norm": 0.0, + "learning_rate": 1.6064510853988137e-05, + "loss": 0.5303, + "step": 1223 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.0, + "learning_rate": 1.60579004384082e-05, + "loss": 0.6129, + "step": 1224 + }, + { + "epoch": 2.825836216839677, + "grad_norm": 0.0, + "learning_rate": 1.6051285838379525e-05, + "loss": 0.4577, + "step": 1225 + }, + { + "epoch": 2.828143021914648, + "grad_norm": 0.0, + "learning_rate": 1.6044667058471093e-05, + "loss": 0.3249, + "step": 1226 + }, + { + "epoch": 2.830449826989619, + "grad_norm": 0.0, + "learning_rate": 1.6038044103254775e-05, + "loss": 0.5886, + "step": 1227 + }, + { + "epoch": 2.8327566320645907, + "grad_norm": 0.0, + "learning_rate": 1.603141697730533e-05, + "loss": 0.363, + "step": 1228 + }, + { + "epoch": 2.8350634371395618, + "grad_norm": 0.0, + "learning_rate": 1.6024785685200396e-05, + "loss": 0.4216, + "step": 1229 + }, + { + "epoch": 2.837370242214533, + "grad_norm": 0.0, + "learning_rate": 1.6018150231520486e-05, + "loss": 0.4745, + "step": 1230 + }, + { + "epoch": 2.839677047289504, + "grad_norm": 0.0, + "learning_rate": 1.6011510620848985e-05, + "loss": 0.5405, + "step": 1231 + }, + { + "epoch": 2.841983852364475, + "grad_norm": 0.0, + "learning_rate": 1.600486685777216e-05, + "loss": 0.5265, + "step": 1232 + }, + { + "epoch": 2.8442906574394464, + "grad_norm": 0.0, + "learning_rate": 1.599821894687914e-05, + "loss": 0.5119, + "step": 1233 + }, + { + "epoch": 2.8465974625144175, + "grad_norm": 0.0, + "learning_rate": 1.5991566892761913e-05, + "loss": 0.5269, + "step": 1234 + }, + { + "epoch": 2.848904267589389, + "grad_norm": 0.0, + "learning_rate": 1.5984910700015337e-05, + "loss": 0.3895, + "step": 1235 + }, + { + "epoch": 2.85121107266436, + "grad_norm": 0.0, + "learning_rate": 1.5978250373237132e-05, + "loss": 0.4819, + "step": 1236 + }, + { + "epoch": 2.853517877739331, + "grad_norm": 0.0, + "learning_rate": 1.5971585917027864e-05, + "loss": 0.5676, + "step": 1237 + }, + { + "epoch": 2.855824682814302, + "grad_norm": 0.0, + "learning_rate": 1.5964917335990953e-05, + "loss": 0.5602, + "step": 1238 + }, + { + "epoch": 2.858131487889273, + "grad_norm": 0.0, + "learning_rate": 1.5958244634732673e-05, + "loss": 0.5511, + "step": 1239 + }, + { + "epoch": 2.8604382929642447, + "grad_norm": 0.0, + "learning_rate": 1.5951567817862147e-05, + "loss": 0.5623, + "step": 1240 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 0.0, + "learning_rate": 1.5944886889991326e-05, + "loss": 0.588, + "step": 1241 + }, + { + "epoch": 2.865051903114187, + "grad_norm": 0.0, + "learning_rate": 1.5938201855735017e-05, + "loss": 0.3131, + "step": 1242 + }, + { + "epoch": 2.8673587081891583, + "grad_norm": 0.0, + "learning_rate": 1.5931512719710855e-05, + "loss": 0.6199, + "step": 1243 + }, + { + "epoch": 2.8696655132641293, + "grad_norm": 0.0, + "learning_rate": 1.592481948653931e-05, + "loss": 0.5341, + "step": 1244 + }, + { + "epoch": 2.8719723183391004, + "grad_norm": 0.0, + "learning_rate": 1.591812216084368e-05, + "loss": 0.5037, + "step": 1245 + }, + { + "epoch": 2.8742791234140714, + "grad_norm": 0.0, + "learning_rate": 1.5911420747250094e-05, + "loss": 0.301, + "step": 1246 + }, + { + "epoch": 2.8765859284890425, + "grad_norm": 0.0, + "learning_rate": 1.5904715250387498e-05, + "loss": 0.3529, + "step": 1247 + }, + { + "epoch": 2.878892733564014, + "grad_norm": 0.0, + "learning_rate": 1.5898005674887673e-05, + "loss": 0.3962, + "step": 1248 + }, + { + "epoch": 2.881199538638985, + "grad_norm": 0.0, + "learning_rate": 1.58912920253852e-05, + "loss": 0.3829, + "step": 1249 + }, + { + "epoch": 2.883506343713956, + "grad_norm": 0.0, + "learning_rate": 1.5884574306517482e-05, + "loss": 0.454, + "step": 1250 + }, + { + "epoch": 2.8858131487889276, + "grad_norm": 0.0, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.5881, + "step": 1251 + }, + { + "epoch": 2.8881199538638986, + "grad_norm": 0.0, + "learning_rate": 1.5871126679249977e-05, + "loss": 0.2544, + "step": 1252 + }, + { + "epoch": 2.8904267589388697, + "grad_norm": 0.0, + "learning_rate": 1.586439678013903e-05, + "loss": 0.288, + "step": 1253 + }, + { + "epoch": 2.8927335640138407, + "grad_norm": 0.0, + "learning_rate": 1.585766283024053e-05, + "loss": 0.5505, + "step": 1254 + }, + { + "epoch": 2.895040369088812, + "grad_norm": 0.0, + "learning_rate": 1.5850924834205897e-05, + "loss": 0.5685, + "step": 1255 + }, + { + "epoch": 2.8973471741637833, + "grad_norm": 0.0, + "learning_rate": 1.5844182796689348e-05, + "loss": 0.6052, + "step": 1256 + }, + { + "epoch": 2.8996539792387543, + "grad_norm": 0.0, + "learning_rate": 1.5837436722347902e-05, + "loss": 0.4579, + "step": 1257 + }, + { + "epoch": 2.9019607843137254, + "grad_norm": 0.0, + "learning_rate": 1.5830686615841348e-05, + "loss": 0.4537, + "step": 1258 + }, + { + "epoch": 2.904267589388697, + "grad_norm": 0.0, + "learning_rate": 1.582393248183228e-05, + "loss": 0.3697, + "step": 1259 + }, + { + "epoch": 2.906574394463668, + "grad_norm": 0.0, + "learning_rate": 1.581717432498606e-05, + "loss": 0.5533, + "step": 1260 + }, + { + "epoch": 2.908881199538639, + "grad_norm": 0.0, + "learning_rate": 1.5810412149970832e-05, + "loss": 0.4279, + "step": 1261 + }, + { + "epoch": 2.91118800461361, + "grad_norm": 0.0, + "learning_rate": 1.5803645961457522e-05, + "loss": 0.4076, + "step": 1262 + }, + { + "epoch": 2.913494809688581, + "grad_norm": 0.0, + "learning_rate": 1.5796875764119826e-05, + "loss": 0.4123, + "step": 1263 + }, + { + "epoch": 2.9158016147635526, + "grad_norm": 0.0, + "learning_rate": 1.5790101562634194e-05, + "loss": 0.4087, + "step": 1264 + }, + { + "epoch": 2.9181084198385236, + "grad_norm": 0.0, + "learning_rate": 1.5783323361679865e-05, + "loss": 0.5587, + "step": 1265 + }, + { + "epoch": 2.9204152249134947, + "grad_norm": 0.0, + "learning_rate": 1.577654116593883e-05, + "loss": 0.5905, + "step": 1266 + }, + { + "epoch": 2.922722029988466, + "grad_norm": 0.0, + "learning_rate": 1.576975498009583e-05, + "loss": 0.3051, + "step": 1267 + }, + { + "epoch": 2.9250288350634372, + "grad_norm": 0.0, + "learning_rate": 1.576296480883838e-05, + "loss": 0.4562, + "step": 1268 + }, + { + "epoch": 2.9273356401384083, + "grad_norm": 0.0, + "learning_rate": 1.575617065685674e-05, + "loss": 0.4578, + "step": 1269 + }, + { + "epoch": 2.9296424452133794, + "grad_norm": 0.0, + "learning_rate": 1.5749372528843908e-05, + "loss": 0.5221, + "step": 1270 + }, + { + "epoch": 2.9319492502883504, + "grad_norm": 0.0, + "learning_rate": 1.574257042949565e-05, + "loss": 0.554, + "step": 1271 + }, + { + "epoch": 2.934256055363322, + "grad_norm": 0.0, + "learning_rate": 1.573576436351046e-05, + "loss": 0.522, + "step": 1272 + }, + { + "epoch": 2.936562860438293, + "grad_norm": 0.0, + "learning_rate": 1.572895433558958e-05, + "loss": 0.5842, + "step": 1273 + }, + { + "epoch": 2.938869665513264, + "grad_norm": 0.0, + "learning_rate": 1.5722140350436984e-05, + "loss": 0.4128, + "step": 1274 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.0, + "learning_rate": 1.5715322412759374e-05, + "loss": 0.3207, + "step": 1275 + }, + { + "epoch": 2.9434832756632066, + "grad_norm": 0.0, + "learning_rate": 1.57085005272662e-05, + "loss": 0.4646, + "step": 1276 + }, + { + "epoch": 2.9457900807381776, + "grad_norm": 0.0, + "learning_rate": 1.570167469866962e-05, + "loss": 0.6126, + "step": 1277 + }, + { + "epoch": 2.9480968858131487, + "grad_norm": 0.0, + "learning_rate": 1.569484493168452e-05, + "loss": 0.5577, + "step": 1278 + }, + { + "epoch": 2.9504036908881197, + "grad_norm": 0.0, + "learning_rate": 1.568801123102852e-05, + "loss": 0.5127, + "step": 1279 + }, + { + "epoch": 2.952710495963091, + "grad_norm": 0.0, + "learning_rate": 1.568117360142194e-05, + "loss": 0.389, + "step": 1280 + }, + { + "epoch": 2.9550173010380623, + "grad_norm": 0.0, + "learning_rate": 1.567433204758782e-05, + "loss": 0.4067, + "step": 1281 + }, + { + "epoch": 2.9573241061130333, + "grad_norm": 0.0, + "learning_rate": 1.5667486574251916e-05, + "loss": 0.4942, + "step": 1282 + }, + { + "epoch": 2.959630911188005, + "grad_norm": 0.0, + "learning_rate": 1.566063718614268e-05, + "loss": 0.5117, + "step": 1283 + }, + { + "epoch": 2.961937716262976, + "grad_norm": 0.0, + "learning_rate": 1.5653783887991282e-05, + "loss": 0.4091, + "step": 1284 + }, + { + "epoch": 2.964244521337947, + "grad_norm": 0.0, + "learning_rate": 1.5646926684531586e-05, + "loss": 0.5213, + "step": 1285 + }, + { + "epoch": 2.966551326412918, + "grad_norm": 0.0, + "learning_rate": 1.5640065580500146e-05, + "loss": 0.6082, + "step": 1286 + }, + { + "epoch": 2.968858131487889, + "grad_norm": 0.0, + "learning_rate": 1.563320058063622e-05, + "loss": 0.5679, + "step": 1287 + }, + { + "epoch": 2.9711649365628605, + "grad_norm": 0.0, + "learning_rate": 1.562633168968176e-05, + "loss": 0.5482, + "step": 1288 + }, + { + "epoch": 2.9734717416378316, + "grad_norm": 0.0, + "learning_rate": 1.5619458912381397e-05, + "loss": 0.4218, + "step": 1289 + }, + { + "epoch": 2.9757785467128026, + "grad_norm": 0.0, + "learning_rate": 1.5612582253482444e-05, + "loss": 0.4031, + "step": 1290 + }, + { + "epoch": 2.978085351787774, + "grad_norm": 0.0, + "learning_rate": 1.5605701717734908e-05, + "loss": 0.46, + "step": 1291 + }, + { + "epoch": 2.980392156862745, + "grad_norm": 0.0, + "learning_rate": 1.5598817309891466e-05, + "loss": 0.3907, + "step": 1292 + }, + { + "epoch": 2.982698961937716, + "grad_norm": 0.0, + "learning_rate": 1.5591929034707468e-05, + "loss": 0.6269, + "step": 1293 + }, + { + "epoch": 2.9850057670126873, + "grad_norm": 0.0, + "learning_rate": 1.558503689694094e-05, + "loss": 0.3448, + "step": 1294 + }, + { + "epoch": 2.9873125720876583, + "grad_norm": 0.0, + "learning_rate": 1.5578140901352576e-05, + "loss": 0.3264, + "step": 1295 + }, + { + "epoch": 2.98961937716263, + "grad_norm": 0.0, + "learning_rate": 1.5571241052705724e-05, + "loss": 0.4899, + "step": 1296 + }, + { + "epoch": 2.991926182237601, + "grad_norm": 0.0, + "learning_rate": 1.5564337355766412e-05, + "loss": 0.438, + "step": 1297 + }, + { + "epoch": 2.994232987312572, + "grad_norm": 0.0, + "learning_rate": 1.555742981530331e-05, + "loss": 0.3405, + "step": 1298 + }, + { + "epoch": 2.9965397923875434, + "grad_norm": 0.0, + "learning_rate": 1.5550518436087753e-05, + "loss": 0.4214, + "step": 1299 + }, + { + "epoch": 2.9988465974625145, + "grad_norm": 0.0, + "learning_rate": 1.5543603222893718e-05, + "loss": 0.5982, + "step": 1300 + }, + { + "epoch": 3.0011534025374855, + "grad_norm": 0.0, + "learning_rate": 1.5536684180497838e-05, + "loss": 0.2961, + "step": 1301 + }, + { + "epoch": 3.0034602076124566, + "grad_norm": 0.0, + "learning_rate": 1.5529761313679396e-05, + "loss": 0.469, + "step": 1302 + }, + { + "epoch": 3.005767012687428, + "grad_norm": 0.0, + "learning_rate": 1.55228346272203e-05, + "loss": 0.3034, + "step": 1303 + }, + { + "epoch": 3.008073817762399, + "grad_norm": 0.0, + "learning_rate": 1.5515904125905118e-05, + "loss": 0.2598, + "step": 1304 + }, + { + "epoch": 3.01038062283737, + "grad_norm": 0.0, + "learning_rate": 1.5508969814521026e-05, + "loss": 0.403, + "step": 1305 + }, + { + "epoch": 3.0126874279123412, + "grad_norm": 0.0, + "learning_rate": 1.5502031697857858e-05, + "loss": 0.2381, + "step": 1306 + }, + { + "epoch": 3.0149942329873127, + "grad_norm": 0.0, + "learning_rate": 1.5495089780708062e-05, + "loss": 0.37, + "step": 1307 + }, + { + "epoch": 3.017301038062284, + "grad_norm": 0.0, + "learning_rate": 1.548814406786671e-05, + "loss": 0.2217, + "step": 1308 + }, + { + "epoch": 3.019607843137255, + "grad_norm": 0.0, + "learning_rate": 1.5481194564131512e-05, + "loss": 0.3474, + "step": 1309 + }, + { + "epoch": 3.021914648212226, + "grad_norm": 0.0, + "learning_rate": 1.5474241274302777e-05, + "loss": 0.3048, + "step": 1310 + }, + { + "epoch": 3.0242214532871974, + "grad_norm": 0.0, + "learning_rate": 1.5467284203183437e-05, + "loss": 0.2127, + "step": 1311 + }, + { + "epoch": 3.0265282583621684, + "grad_norm": 0.0, + "learning_rate": 1.5460323355579035e-05, + "loss": 0.2924, + "step": 1312 + }, + { + "epoch": 3.0288350634371395, + "grad_norm": 0.0, + "learning_rate": 1.5453358736297727e-05, + "loss": 0.324, + "step": 1313 + }, + { + "epoch": 3.0311418685121105, + "grad_norm": 0.0, + "learning_rate": 1.5446390350150272e-05, + "loss": 0.3157, + "step": 1314 + }, + { + "epoch": 3.033448673587082, + "grad_norm": 0.0, + "learning_rate": 1.5439418201950025e-05, + "loss": 0.4296, + "step": 1315 + }, + { + "epoch": 3.035755478662053, + "grad_norm": 0.0, + "learning_rate": 1.543244229651295e-05, + "loss": 0.1885, + "step": 1316 + }, + { + "epoch": 3.038062283737024, + "grad_norm": 0.0, + "learning_rate": 1.5425462638657597e-05, + "loss": 0.2345, + "step": 1317 + }, + { + "epoch": 3.040369088811995, + "grad_norm": 0.0, + "learning_rate": 1.5418479233205112e-05, + "loss": 0.2252, + "step": 1318 + }, + { + "epoch": 3.0426758938869667, + "grad_norm": 0.0, + "learning_rate": 1.541149208497923e-05, + "loss": 0.2805, + "step": 1319 + }, + { + "epoch": 3.0449826989619377, + "grad_norm": 0.0, + "learning_rate": 1.5404501198806267e-05, + "loss": 0.1897, + "step": 1320 + }, + { + "epoch": 3.047289504036909, + "grad_norm": 0.0, + "learning_rate": 1.539750657951513e-05, + "loss": 0.2589, + "step": 1321 + }, + { + "epoch": 3.04959630911188, + "grad_norm": 0.0, + "learning_rate": 1.53905082319373e-05, + "loss": 0.2638, + "step": 1322 + }, + { + "epoch": 3.0519031141868513, + "grad_norm": 0.0, + "learning_rate": 1.5383506160906826e-05, + "loss": 0.3877, + "step": 1323 + }, + { + "epoch": 3.0542099192618224, + "grad_norm": 0.0, + "learning_rate": 1.5376500371260335e-05, + "loss": 0.3145, + "step": 1324 + }, + { + "epoch": 3.0565167243367934, + "grad_norm": 0.0, + "learning_rate": 1.5369490867837037e-05, + "loss": 0.3886, + "step": 1325 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.0, + "learning_rate": 1.5362477655478677e-05, + "loss": 0.4357, + "step": 1326 + }, + { + "epoch": 3.061130334486736, + "grad_norm": 0.0, + "learning_rate": 1.5355460739029585e-05, + "loss": 0.3708, + "step": 1327 + }, + { + "epoch": 3.063437139561707, + "grad_norm": 0.0, + "learning_rate": 1.5348440123336647e-05, + "loss": 0.2363, + "step": 1328 + }, + { + "epoch": 3.065743944636678, + "grad_norm": 0.0, + "learning_rate": 1.534141581324929e-05, + "loss": 0.2681, + "step": 1329 + }, + { + "epoch": 3.0680507497116496, + "grad_norm": 0.0, + "learning_rate": 1.5334387813619508e-05, + "loss": 0.2168, + "step": 1330 + }, + { + "epoch": 3.0703575547866206, + "grad_norm": 0.0, + "learning_rate": 1.532735612930184e-05, + "loss": 0.2744, + "step": 1331 + }, + { + "epoch": 3.0726643598615917, + "grad_norm": 0.0, + "learning_rate": 1.5320320765153367e-05, + "loss": 0.3737, + "step": 1332 + }, + { + "epoch": 3.0749711649365628, + "grad_norm": 0.0, + "learning_rate": 1.5313281726033714e-05, + "loss": 0.2949, + "step": 1333 + }, + { + "epoch": 3.077277970011534, + "grad_norm": 0.0, + "learning_rate": 1.5306239016805045e-05, + "loss": 0.2605, + "step": 1334 + }, + { + "epoch": 3.0795847750865053, + "grad_norm": 0.0, + "learning_rate": 1.529919264233205e-05, + "loss": 0.3751, + "step": 1335 + }, + { + "epoch": 3.0818915801614764, + "grad_norm": 0.0, + "learning_rate": 1.529214260748197e-05, + "loss": 0.2882, + "step": 1336 + }, + { + "epoch": 3.0841983852364474, + "grad_norm": 0.0, + "learning_rate": 1.5285088917124555e-05, + "loss": 0.3046, + "step": 1337 + }, + { + "epoch": 3.086505190311419, + "grad_norm": 0.0, + "learning_rate": 1.527803157613209e-05, + "loss": 0.2279, + "step": 1338 + }, + { + "epoch": 3.08881199538639, + "grad_norm": 0.0, + "learning_rate": 1.5270970589379387e-05, + "loss": 0.1535, + "step": 1339 + }, + { + "epoch": 3.091118800461361, + "grad_norm": 0.0, + "learning_rate": 1.5263905961743758e-05, + "loss": 0.5102, + "step": 1340 + }, + { + "epoch": 3.093425605536332, + "grad_norm": 0.0, + "learning_rate": 1.5256837698105047e-05, + "loss": 0.241, + "step": 1341 + }, + { + "epoch": 3.0957324106113036, + "grad_norm": 0.0, + "learning_rate": 1.5249765803345602e-05, + "loss": 0.2284, + "step": 1342 + }, + { + "epoch": 3.0980392156862746, + "grad_norm": 0.0, + "learning_rate": 1.5242690282350281e-05, + "loss": 0.3061, + "step": 1343 + }, + { + "epoch": 3.1003460207612457, + "grad_norm": 0.0, + "learning_rate": 1.5235611140006446e-05, + "loss": 0.3612, + "step": 1344 + }, + { + "epoch": 3.1026528258362167, + "grad_norm": 0.0, + "learning_rate": 1.5228528381203962e-05, + "loss": 0.3075, + "step": 1345 + }, + { + "epoch": 3.104959630911188, + "grad_norm": 0.0, + "learning_rate": 1.5221442010835187e-05, + "loss": 0.4081, + "step": 1346 + }, + { + "epoch": 3.1072664359861593, + "grad_norm": 0.0, + "learning_rate": 1.5214352033794981e-05, + "loss": 0.1777, + "step": 1347 + }, + { + "epoch": 3.1095732410611303, + "grad_norm": 0.0, + "learning_rate": 1.5207258454980694e-05, + "loss": 0.2036, + "step": 1348 + }, + { + "epoch": 3.1118800461361014, + "grad_norm": 0.0, + "learning_rate": 1.5200161279292154e-05, + "loss": 0.3655, + "step": 1349 + }, + { + "epoch": 3.114186851211073, + "grad_norm": 0.0, + "learning_rate": 1.5193060511631692e-05, + "loss": 0.2986, + "step": 1350 + }, + { + "epoch": 3.116493656286044, + "grad_norm": 0.0, + "learning_rate": 1.51859561569041e-05, + "loss": 0.2411, + "step": 1351 + }, + { + "epoch": 3.118800461361015, + "grad_norm": 0.0, + "learning_rate": 1.517884822001666e-05, + "loss": 0.1674, + "step": 1352 + }, + { + "epoch": 3.121107266435986, + "grad_norm": 0.0, + "learning_rate": 1.5171736705879127e-05, + "loss": 0.3046, + "step": 1353 + }, + { + "epoch": 3.1234140715109575, + "grad_norm": 0.0, + "learning_rate": 1.5164621619403725e-05, + "loss": 0.2204, + "step": 1354 + }, + { + "epoch": 3.1257208765859286, + "grad_norm": 0.0, + "learning_rate": 1.5157502965505144e-05, + "loss": 0.2174, + "step": 1355 + }, + { + "epoch": 3.1280276816608996, + "grad_norm": 0.0, + "learning_rate": 1.5150380749100545e-05, + "loss": 0.2485, + "step": 1356 + }, + { + "epoch": 3.1303344867358707, + "grad_norm": 0.0, + "learning_rate": 1.5143254975109538e-05, + "loss": 0.2246, + "step": 1357 + }, + { + "epoch": 3.132641291810842, + "grad_norm": 0.0, + "learning_rate": 1.51361256484542e-05, + "loss": 0.2828, + "step": 1358 + }, + { + "epoch": 3.134948096885813, + "grad_norm": 0.0, + "learning_rate": 1.5128992774059063e-05, + "loss": 0.3656, + "step": 1359 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.0, + "learning_rate": 1.5121856356851101e-05, + "loss": 0.3268, + "step": 1360 + }, + { + "epoch": 3.1395617070357553, + "grad_norm": 0.0, + "learning_rate": 1.5114716401759741e-05, + "loss": 0.4582, + "step": 1361 + }, + { + "epoch": 3.141868512110727, + "grad_norm": 0.0, + "learning_rate": 1.5107572913716859e-05, + "loss": 0.3062, + "step": 1362 + }, + { + "epoch": 3.144175317185698, + "grad_norm": 0.0, + "learning_rate": 1.5100425897656754e-05, + "loss": 0.2255, + "step": 1363 + }, + { + "epoch": 3.146482122260669, + "grad_norm": 0.0, + "learning_rate": 1.5093275358516182e-05, + "loss": 0.3411, + "step": 1364 + }, + { + "epoch": 3.14878892733564, + "grad_norm": 0.0, + "learning_rate": 1.5086121301234318e-05, + "loss": 0.2545, + "step": 1365 + }, + { + "epoch": 3.1510957324106115, + "grad_norm": 0.0, + "learning_rate": 1.5078963730752775e-05, + "loss": 0.2674, + "step": 1366 + }, + { + "epoch": 3.1534025374855825, + "grad_norm": 0.0, + "learning_rate": 1.5071802652015592e-05, + "loss": 0.1963, + "step": 1367 + }, + { + "epoch": 3.1557093425605536, + "grad_norm": 0.0, + "learning_rate": 1.5064638069969228e-05, + "loss": 0.2392, + "step": 1368 + }, + { + "epoch": 3.1580161476355246, + "grad_norm": 0.0, + "learning_rate": 1.5057469989562568e-05, + "loss": 0.4015, + "step": 1369 + }, + { + "epoch": 3.160322952710496, + "grad_norm": 0.0, + "learning_rate": 1.5050298415746903e-05, + "loss": 0.2642, + "step": 1370 + }, + { + "epoch": 3.162629757785467, + "grad_norm": 0.0, + "learning_rate": 1.5043123353475944e-05, + "loss": 0.2594, + "step": 1371 + }, + { + "epoch": 3.1649365628604382, + "grad_norm": 0.0, + "learning_rate": 1.503594480770581e-05, + "loss": 0.2049, + "step": 1372 + }, + { + "epoch": 3.1672433679354093, + "grad_norm": 0.0, + "learning_rate": 1.5028762783395035e-05, + "loss": 0.1665, + "step": 1373 + }, + { + "epoch": 3.169550173010381, + "grad_norm": 0.0, + "learning_rate": 1.5021577285504538e-05, + "loss": 0.3518, + "step": 1374 + }, + { + "epoch": 3.171856978085352, + "grad_norm": 0.0, + "learning_rate": 1.5014388318997655e-05, + "loss": 0.2681, + "step": 1375 + }, + { + "epoch": 3.174163783160323, + "grad_norm": 0.0, + "learning_rate": 1.5007195888840102e-05, + "loss": 0.268, + "step": 1376 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.0, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2367, + "step": 1377 + }, + { + "epoch": 3.1787773933102654, + "grad_norm": 0.0, + "learning_rate": 1.4992800657447858e-05, + "loss": 0.181, + "step": 1378 + }, + { + "epoch": 3.1810841983852365, + "grad_norm": 0.0, + "learning_rate": 1.498559786615656e-05, + "loss": 0.4208, + "step": 1379 + }, + { + "epoch": 3.1833910034602075, + "grad_norm": 0.0, + "learning_rate": 1.4978391631101383e-05, + "loss": 0.2387, + "step": 1380 + }, + { + "epoch": 3.1856978085351786, + "grad_norm": 0.0, + "learning_rate": 1.4971181957259982e-05, + "loss": 0.2334, + "step": 1381 + }, + { + "epoch": 3.18800461361015, + "grad_norm": 0.0, + "learning_rate": 1.496396884961238e-05, + "loss": 0.3558, + "step": 1382 + }, + { + "epoch": 3.190311418685121, + "grad_norm": 0.0, + "learning_rate": 1.4956752313140978e-05, + "loss": 0.2694, + "step": 1383 + }, + { + "epoch": 3.192618223760092, + "grad_norm": 0.0, + "learning_rate": 1.4949532352830543e-05, + "loss": 0.2966, + "step": 1384 + }, + { + "epoch": 3.1949250288350632, + "grad_norm": 0.0, + "learning_rate": 1.494230897366821e-05, + "loss": 0.3562, + "step": 1385 + }, + { + "epoch": 3.1972318339100347, + "grad_norm": 0.0, + "learning_rate": 1.493508218064347e-05, + "loss": 0.2147, + "step": 1386 + }, + { + "epoch": 3.199538638985006, + "grad_norm": 0.0, + "learning_rate": 1.4927851978748177e-05, + "loss": 0.3268, + "step": 1387 + }, + { + "epoch": 3.201845444059977, + "grad_norm": 0.0, + "learning_rate": 1.492061837297654e-05, + "loss": 0.3238, + "step": 1388 + }, + { + "epoch": 3.204152249134948, + "grad_norm": 0.0, + "learning_rate": 1.4913381368325115e-05, + "loss": 0.483, + "step": 1389 + }, + { + "epoch": 3.2064590542099194, + "grad_norm": 0.0, + "learning_rate": 1.4906140969792808e-05, + "loss": 0.4238, + "step": 1390 + }, + { + "epoch": 3.2087658592848904, + "grad_norm": 0.0, + "learning_rate": 1.4898897182380872e-05, + "loss": 0.2129, + "step": 1391 + }, + { + "epoch": 3.2110726643598615, + "grad_norm": 0.0, + "learning_rate": 1.4891650011092896e-05, + "loss": 0.3535, + "step": 1392 + }, + { + "epoch": 3.213379469434833, + "grad_norm": 0.0, + "learning_rate": 1.4884399460934806e-05, + "loss": 0.3541, + "step": 1393 + }, + { + "epoch": 3.215686274509804, + "grad_norm": 0.0, + "learning_rate": 1.487714553691487e-05, + "loss": 0.3834, + "step": 1394 + }, + { + "epoch": 3.217993079584775, + "grad_norm": 0.0, + "learning_rate": 1.4869888244043674e-05, + "loss": 0.2178, + "step": 1395 + }, + { + "epoch": 3.220299884659746, + "grad_norm": 0.0, + "learning_rate": 1.4862627587334144e-05, + "loss": 0.2678, + "step": 1396 + }, + { + "epoch": 3.222606689734717, + "grad_norm": 0.0, + "learning_rate": 1.4855363571801523e-05, + "loss": 0.2013, + "step": 1397 + }, + { + "epoch": 3.2249134948096887, + "grad_norm": 0.0, + "learning_rate": 1.4848096202463373e-05, + "loss": 0.2318, + "step": 1398 + }, + { + "epoch": 3.2272202998846597, + "grad_norm": 0.0, + "learning_rate": 1.4840825484339574e-05, + "loss": 0.2473, + "step": 1399 + }, + { + "epoch": 3.229527104959631, + "grad_norm": 0.0, + "learning_rate": 1.483355142245232e-05, + "loss": 0.3816, + "step": 1400 + }, + { + "epoch": 3.2318339100346023, + "grad_norm": 0.0, + "learning_rate": 1.482627402182611e-05, + "loss": 0.2547, + "step": 1401 + }, + { + "epoch": 3.2341407151095733, + "grad_norm": 0.0, + "learning_rate": 1.481899328748776e-05, + "loss": 0.2829, + "step": 1402 + }, + { + "epoch": 3.2364475201845444, + "grad_norm": 0.0, + "learning_rate": 1.481170922446638e-05, + "loss": 0.307, + "step": 1403 + }, + { + "epoch": 3.2387543252595155, + "grad_norm": 0.0, + "learning_rate": 1.4804421837793379e-05, + "loss": 0.3478, + "step": 1404 + }, + { + "epoch": 3.2410611303344865, + "grad_norm": 0.0, + "learning_rate": 1.4797131132502464e-05, + "loss": 0.2051, + "step": 1405 + }, + { + "epoch": 3.243367935409458, + "grad_norm": 0.0, + "learning_rate": 1.4789837113629637e-05, + "loss": 0.2191, + "step": 1406 + }, + { + "epoch": 3.245674740484429, + "grad_norm": 0.0, + "learning_rate": 1.4782539786213184e-05, + "loss": 0.2577, + "step": 1407 + }, + { + "epoch": 3.2479815455594, + "grad_norm": 0.0, + "learning_rate": 1.477523915529368e-05, + "loss": 0.2612, + "step": 1408 + }, + { + "epoch": 3.2502883506343716, + "grad_norm": 0.0, + "learning_rate": 1.4767935225913976e-05, + "loss": 0.1385, + "step": 1409 + }, + { + "epoch": 3.2525951557093427, + "grad_norm": 0.0, + "learning_rate": 1.4760628003119213e-05, + "loss": 0.3205, + "step": 1410 + }, + { + "epoch": 3.2549019607843137, + "grad_norm": 0.0, + "learning_rate": 1.4753317491956798e-05, + "loss": 0.3668, + "step": 1411 + }, + { + "epoch": 3.2572087658592848, + "grad_norm": 0.0, + "learning_rate": 1.4746003697476406e-05, + "loss": 0.1318, + "step": 1412 + }, + { + "epoch": 3.259515570934256, + "grad_norm": 0.0, + "learning_rate": 1.4738686624729987e-05, + "loss": 0.4485, + "step": 1413 + }, + { + "epoch": 3.2618223760092273, + "grad_norm": 0.0, + "learning_rate": 1.473136627877176e-05, + "loss": 0.4717, + "step": 1414 + }, + { + "epoch": 3.2641291810841984, + "grad_norm": 0.0, + "learning_rate": 1.4724042664658185e-05, + "loss": 0.1885, + "step": 1415 + }, + { + "epoch": 3.2664359861591694, + "grad_norm": 0.0, + "learning_rate": 1.4716715787448007e-05, + "loss": 0.2858, + "step": 1416 + }, + { + "epoch": 3.268742791234141, + "grad_norm": 0.0, + "learning_rate": 1.4709385652202204e-05, + "loss": 0.3106, + "step": 1417 + }, + { + "epoch": 3.271049596309112, + "grad_norm": 0.0, + "learning_rate": 1.470205226398401e-05, + "loss": 0.3519, + "step": 1418 + }, + { + "epoch": 3.273356401384083, + "grad_norm": 0.0, + "learning_rate": 1.469471562785891e-05, + "loss": 0.3678, + "step": 1419 + }, + { + "epoch": 3.275663206459054, + "grad_norm": 0.0, + "learning_rate": 1.4687375748894628e-05, + "loss": 0.35, + "step": 1420 + }, + { + "epoch": 3.2779700115340256, + "grad_norm": 0.0, + "learning_rate": 1.468003263216113e-05, + "loss": 0.1518, + "step": 1421 + }, + { + "epoch": 3.2802768166089966, + "grad_norm": 0.0, + "learning_rate": 1.4672686282730622e-05, + "loss": 0.3924, + "step": 1422 + }, + { + "epoch": 3.2825836216839677, + "grad_norm": 0.0, + "learning_rate": 1.4665336705677533e-05, + "loss": 0.3525, + "step": 1423 + }, + { + "epoch": 3.2848904267589387, + "grad_norm": 0.0, + "learning_rate": 1.4657983906078533e-05, + "loss": 0.1596, + "step": 1424 + }, + { + "epoch": 3.28719723183391, + "grad_norm": 0.0, + "learning_rate": 1.4650627889012507e-05, + "loss": 0.2008, + "step": 1425 + }, + { + "epoch": 3.2895040369088813, + "grad_norm": 0.0, + "learning_rate": 1.4643268659560571e-05, + "loss": 0.2452, + "step": 1426 + }, + { + "epoch": 3.2918108419838523, + "grad_norm": 0.0, + "learning_rate": 1.4635906222806058e-05, + "loss": 0.3666, + "step": 1427 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.0, + "learning_rate": 1.4628540583834511e-05, + "loss": 0.3948, + "step": 1428 + }, + { + "epoch": 3.296424452133795, + "grad_norm": 0.0, + "learning_rate": 1.4621171747733698e-05, + "loss": 0.1876, + "step": 1429 + }, + { + "epoch": 3.298731257208766, + "grad_norm": 0.0, + "learning_rate": 1.4613799719593577e-05, + "loss": 0.222, + "step": 1430 + }, + { + "epoch": 3.301038062283737, + "grad_norm": 0.0, + "learning_rate": 1.4606424504506325e-05, + "loss": 0.2961, + "step": 1431 + }, + { + "epoch": 3.303344867358708, + "grad_norm": 0.0, + "learning_rate": 1.4599046107566314e-05, + "loss": 0.2402, + "step": 1432 + }, + { + "epoch": 3.3056516724336795, + "grad_norm": 0.0, + "learning_rate": 1.4591664533870118e-05, + "loss": 0.4048, + "step": 1433 + }, + { + "epoch": 3.3079584775086506, + "grad_norm": 0.0, + "learning_rate": 1.45842797885165e-05, + "loss": 0.3493, + "step": 1434 + }, + { + "epoch": 3.3102652825836216, + "grad_norm": 0.0, + "learning_rate": 1.4576891876606421e-05, + "loss": 0.2753, + "step": 1435 + }, + { + "epoch": 3.3125720876585927, + "grad_norm": 0.0, + "learning_rate": 1.4569500803243021e-05, + "loss": 0.1955, + "step": 1436 + }, + { + "epoch": 3.314878892733564, + "grad_norm": 0.0, + "learning_rate": 1.4562106573531632e-05, + "loss": 0.3036, + "step": 1437 + }, + { + "epoch": 3.3171856978085352, + "grad_norm": 0.0, + "learning_rate": 1.4554709192579757e-05, + "loss": 0.3426, + "step": 1438 + }, + { + "epoch": 3.3194925028835063, + "grad_norm": 0.0, + "learning_rate": 1.4547308665497082e-05, + "loss": 0.2458, + "step": 1439 + }, + { + "epoch": 3.3217993079584773, + "grad_norm": 0.0, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.2772, + "step": 1440 + }, + { + "epoch": 3.324106113033449, + "grad_norm": 0.0, + "learning_rate": 1.4532498193388941e-05, + "loss": 0.3181, + "step": 1441 + }, + { + "epoch": 3.32641291810842, + "grad_norm": 0.0, + "learning_rate": 1.4525088258593695e-05, + "loss": 0.3993, + "step": 1442 + }, + { + "epoch": 3.328719723183391, + "grad_norm": 0.0, + "learning_rate": 1.4517675198128086e-05, + "loss": 0.3119, + "step": 1443 + }, + { + "epoch": 3.331026528258362, + "grad_norm": 0.0, + "learning_rate": 1.4510259017112624e-05, + "loss": 0.4084, + "step": 1444 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.0, + "learning_rate": 1.4502839720669988e-05, + "loss": 0.2698, + "step": 1445 + }, + { + "epoch": 3.3356401384083045, + "grad_norm": 0.0, + "learning_rate": 1.4495417313924996e-05, + "loss": 0.2571, + "step": 1446 + }, + { + "epoch": 3.3379469434832756, + "grad_norm": 0.0, + "learning_rate": 1.4487991802004625e-05, + "loss": 0.3403, + "step": 1447 + }, + { + "epoch": 3.3402537485582466, + "grad_norm": 0.0, + "learning_rate": 1.4480563190037981e-05, + "loss": 0.2974, + "step": 1448 + }, + { + "epoch": 3.342560553633218, + "grad_norm": 0.0, + "learning_rate": 1.4473131483156326e-05, + "loss": 0.3071, + "step": 1449 + }, + { + "epoch": 3.344867358708189, + "grad_norm": 0.0, + "learning_rate": 1.446569668649306e-05, + "loss": 0.3214, + "step": 1450 + }, + { + "epoch": 3.3471741637831602, + "grad_norm": 0.0, + "learning_rate": 1.4458258805183704e-05, + "loss": 0.2755, + "step": 1451 + }, + { + "epoch": 3.3494809688581313, + "grad_norm": 0.0, + "learning_rate": 1.4450817844365924e-05, + "loss": 0.1851, + "step": 1452 + }, + { + "epoch": 3.351787773933103, + "grad_norm": 0.0, + "learning_rate": 1.4443373809179508e-05, + "loss": 0.3576, + "step": 1453 + }, + { + "epoch": 3.354094579008074, + "grad_norm": 0.0, + "learning_rate": 1.4435926704766364e-05, + "loss": 0.2239, + "step": 1454 + }, + { + "epoch": 3.356401384083045, + "grad_norm": 0.0, + "learning_rate": 1.4428476536270517e-05, + "loss": 0.2896, + "step": 1455 + }, + { + "epoch": 3.3587081891580164, + "grad_norm": 0.0, + "learning_rate": 1.4421023308838124e-05, + "loss": 0.2987, + "step": 1456 + }, + { + "epoch": 3.3610149942329874, + "grad_norm": 0.0, + "learning_rate": 1.4413567027617442e-05, + "loss": 0.2778, + "step": 1457 + }, + { + "epoch": 3.3633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.4406107697758838e-05, + "loss": 0.2483, + "step": 1458 + }, + { + "epoch": 3.3656286043829295, + "grad_norm": 0.0, + "learning_rate": 1.4398645324414792e-05, + "loss": 0.3586, + "step": 1459 + }, + { + "epoch": 3.3679354094579006, + "grad_norm": 0.0, + "learning_rate": 1.4391179912739881e-05, + "loss": 0.353, + "step": 1460 + }, + { + "epoch": 3.370242214532872, + "grad_norm": 0.0, + "learning_rate": 1.4383711467890776e-05, + "loss": 0.3174, + "step": 1461 + }, + { + "epoch": 3.372549019607843, + "grad_norm": 0.0, + "learning_rate": 1.4376239995026254e-05, + "loss": 0.2725, + "step": 1462 + }, + { + "epoch": 3.374855824682814, + "grad_norm": 0.0, + "learning_rate": 1.4368765499307177e-05, + "loss": 0.3365, + "step": 1463 + }, + { + "epoch": 3.3771626297577857, + "grad_norm": 0.0, + "learning_rate": 1.4361287985896495e-05, + "loss": 0.2724, + "step": 1464 + }, + { + "epoch": 3.3794694348327567, + "grad_norm": 0.0, + "learning_rate": 1.4353807459959243e-05, + "loss": 0.3259, + "step": 1465 + }, + { + "epoch": 3.381776239907728, + "grad_norm": 0.0, + "learning_rate": 1.4346323926662541e-05, + "loss": 0.3477, + "step": 1466 + }, + { + "epoch": 3.384083044982699, + "grad_norm": 0.0, + "learning_rate": 1.4338837391175582e-05, + "loss": 0.3697, + "step": 1467 + }, + { + "epoch": 3.38638985005767, + "grad_norm": 0.0, + "learning_rate": 1.4331347858669631e-05, + "loss": 0.3652, + "step": 1468 + }, + { + "epoch": 3.3886966551326414, + "grad_norm": 0.0, + "learning_rate": 1.4323855334318026e-05, + "loss": 0.3108, + "step": 1469 + }, + { + "epoch": 3.3910034602076125, + "grad_norm": 0.0, + "learning_rate": 1.4316359823296174e-05, + "loss": 0.2951, + "step": 1470 + }, + { + "epoch": 3.3933102652825835, + "grad_norm": 0.0, + "learning_rate": 1.430886133078154e-05, + "loss": 0.2915, + "step": 1471 + }, + { + "epoch": 3.395617070357555, + "grad_norm": 0.0, + "learning_rate": 1.4301359861953652e-05, + "loss": 0.3814, + "step": 1472 + }, + { + "epoch": 3.397923875432526, + "grad_norm": 0.0, + "learning_rate": 1.4293855421994094e-05, + "loss": 0.3606, + "step": 1473 + }, + { + "epoch": 3.400230680507497, + "grad_norm": 0.0, + "learning_rate": 1.4286348016086496e-05, + "loss": 0.2738, + "step": 1474 + }, + { + "epoch": 3.402537485582468, + "grad_norm": 0.0, + "learning_rate": 1.4278837649416543e-05, + "loss": 0.2622, + "step": 1475 + }, + { + "epoch": 3.404844290657439, + "grad_norm": 0.0, + "learning_rate": 1.4271324327171969e-05, + "loss": 0.2941, + "step": 1476 + }, + { + "epoch": 3.4071510957324107, + "grad_norm": 0.0, + "learning_rate": 1.4263808054542541e-05, + "loss": 0.3735, + "step": 1477 + }, + { + "epoch": 3.4094579008073818, + "grad_norm": 0.0, + "learning_rate": 1.4256288836720065e-05, + "loss": 0.3645, + "step": 1478 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 0.0, + "learning_rate": 1.4248766678898386e-05, + "loss": 0.3484, + "step": 1479 + }, + { + "epoch": 3.4140715109573243, + "grad_norm": 0.0, + "learning_rate": 1.4241241586273377e-05, + "loss": 0.2413, + "step": 1480 + }, + { + "epoch": 3.4163783160322954, + "grad_norm": 0.0, + "learning_rate": 1.4233713564042937e-05, + "loss": 0.1764, + "step": 1481 + }, + { + "epoch": 3.4186851211072664, + "grad_norm": 0.0, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.2618, + "step": 1482 + }, + { + "epoch": 3.4209919261822375, + "grad_norm": 0.0, + "learning_rate": 1.4218648751567492e-05, + "loss": 0.1661, + "step": 1483 + }, + { + "epoch": 3.423298731257209, + "grad_norm": 0.0, + "learning_rate": 1.4211111971728388e-05, + "loss": 0.3404, + "step": 1484 + }, + { + "epoch": 3.42560553633218, + "grad_norm": 0.0, + "learning_rate": 1.4203572283095657e-05, + "loss": 0.2877, + "step": 1485 + }, + { + "epoch": 3.427912341407151, + "grad_norm": 0.0, + "learning_rate": 1.419602969087728e-05, + "loss": 0.2449, + "step": 1486 + }, + { + "epoch": 3.430219146482122, + "grad_norm": 0.0, + "learning_rate": 1.418848420028325e-05, + "loss": 0.391, + "step": 1487 + }, + { + "epoch": 3.4325259515570936, + "grad_norm": 0.0, + "learning_rate": 1.4180935816525554e-05, + "loss": 0.2508, + "step": 1488 + }, + { + "epoch": 3.4348327566320647, + "grad_norm": 0.0, + "learning_rate": 1.417338454481818e-05, + "loss": 0.3421, + "step": 1489 + }, + { + "epoch": 3.4371395617070357, + "grad_norm": 0.0, + "learning_rate": 1.4165830390377115e-05, + "loss": 0.2835, + "step": 1490 + }, + { + "epoch": 3.4394463667820068, + "grad_norm": 0.0, + "learning_rate": 1.415827335842033e-05, + "loss": 0.3754, + "step": 1491 + }, + { + "epoch": 3.4417531718569783, + "grad_norm": 0.0, + "learning_rate": 1.4150713454167788e-05, + "loss": 0.4026, + "step": 1492 + }, + { + "epoch": 3.4440599769319493, + "grad_norm": 0.0, + "learning_rate": 1.414315068284144e-05, + "loss": 0.3019, + "step": 1493 + }, + { + "epoch": 3.4463667820069204, + "grad_norm": 0.0, + "learning_rate": 1.4135585049665207e-05, + "loss": 0.1687, + "step": 1494 + }, + { + "epoch": 3.4486735870818914, + "grad_norm": 0.0, + "learning_rate": 1.4128016559864998e-05, + "loss": 0.2277, + "step": 1495 + }, + { + "epoch": 3.450980392156863, + "grad_norm": 0.0, + "learning_rate": 1.4120445218668687e-05, + "loss": 0.2716, + "step": 1496 + }, + { + "epoch": 3.453287197231834, + "grad_norm": 0.0, + "learning_rate": 1.4112871031306118e-05, + "loss": 0.3664, + "step": 1497 + }, + { + "epoch": 3.455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.4105294003009107e-05, + "loss": 0.2151, + "step": 1498 + }, + { + "epoch": 3.457900807381776, + "grad_norm": 0.0, + "learning_rate": 1.4097714139011428e-05, + "loss": 0.3097, + "step": 1499 + }, + { + "epoch": 3.4602076124567476, + "grad_norm": 0.0, + "learning_rate": 1.4090131444548814e-05, + "loss": 0.3317, + "step": 1500 + }, + { + "epoch": 3.4625144175317186, + "grad_norm": 0.0, + "learning_rate": 1.4082545924858955e-05, + "loss": 0.2498, + "step": 1501 + }, + { + "epoch": 3.4648212226066897, + "grad_norm": 0.0, + "learning_rate": 1.4074957585181488e-05, + "loss": 0.3038, + "step": 1502 + }, + { + "epoch": 3.4671280276816607, + "grad_norm": 0.0, + "learning_rate": 1.4067366430758004e-05, + "loss": 0.358, + "step": 1503 + }, + { + "epoch": 3.4694348327566322, + "grad_norm": 0.0, + "learning_rate": 1.4059772466832033e-05, + "loss": 0.3243, + "step": 1504 + }, + { + "epoch": 3.4717416378316033, + "grad_norm": 0.0, + "learning_rate": 1.4052175698649054e-05, + "loss": 0.2212, + "step": 1505 + }, + { + "epoch": 3.4740484429065743, + "grad_norm": 0.0, + "learning_rate": 1.4044576131456466e-05, + "loss": 0.2785, + "step": 1506 + }, + { + "epoch": 3.4763552479815454, + "grad_norm": 0.0, + "learning_rate": 1.4036973770503623e-05, + "loss": 0.4104, + "step": 1507 + }, + { + "epoch": 3.478662053056517, + "grad_norm": 0.0, + "learning_rate": 1.4029368621041795e-05, + "loss": 0.5832, + "step": 1508 + }, + { + "epoch": 3.480968858131488, + "grad_norm": 0.0, + "learning_rate": 1.4021760688324175e-05, + "loss": 0.3468, + "step": 1509 + }, + { + "epoch": 3.483275663206459, + "grad_norm": 0.0, + "learning_rate": 1.4014149977605893e-05, + "loss": 0.3597, + "step": 1510 + }, + { + "epoch": 3.48558246828143, + "grad_norm": 0.0, + "learning_rate": 1.4006536494143987e-05, + "loss": 0.263, + "step": 1511 + }, + { + "epoch": 3.4878892733564015, + "grad_norm": 0.0, + "learning_rate": 1.3998920243197408e-05, + "loss": 0.2407, + "step": 1512 + }, + { + "epoch": 3.4901960784313726, + "grad_norm": 0.0, + "learning_rate": 1.3991301230027032e-05, + "loss": 0.3208, + "step": 1513 + }, + { + "epoch": 3.4925028835063436, + "grad_norm": 0.0, + "learning_rate": 1.3983679459895635e-05, + "loss": 0.3936, + "step": 1514 + }, + { + "epoch": 3.4948096885813147, + "grad_norm": 0.0, + "learning_rate": 1.3976054938067885e-05, + "loss": 0.3356, + "step": 1515 + }, + { + "epoch": 3.497116493656286, + "grad_norm": 0.0, + "learning_rate": 1.3968427669810372e-05, + "loss": 0.4391, + "step": 1516 + }, + { + "epoch": 3.4994232987312572, + "grad_norm": 0.0, + "learning_rate": 1.396079766039157e-05, + "loss": 0.3209, + "step": 1517 + }, + { + "epoch": 3.5017301038062283, + "grad_norm": 0.0, + "learning_rate": 1.3953164915081852e-05, + "loss": 0.1835, + "step": 1518 + }, + { + "epoch": 3.5040369088812, + "grad_norm": 0.0, + "learning_rate": 1.3945529439153478e-05, + "loss": 0.393, + "step": 1519 + }, + { + "epoch": 3.506343713956171, + "grad_norm": 0.0, + "learning_rate": 1.3937891237880599e-05, + "loss": 0.2571, + "step": 1520 + }, + { + "epoch": 3.508650519031142, + "grad_norm": 0.0, + "learning_rate": 1.3930250316539237e-05, + "loss": 0.3312, + "step": 1521 + }, + { + "epoch": 3.510957324106113, + "grad_norm": 0.0, + "learning_rate": 1.3922606680407307e-05, + "loss": 0.2452, + "step": 1522 + }, + { + "epoch": 3.513264129181084, + "grad_norm": 0.0, + "learning_rate": 1.3914960334764589e-05, + "loss": 0.2028, + "step": 1523 + }, + { + "epoch": 3.5155709342560555, + "grad_norm": 0.0, + "learning_rate": 1.3907311284892737e-05, + "loss": 0.4266, + "step": 1524 + }, + { + "epoch": 3.5178777393310265, + "grad_norm": 0.0, + "learning_rate": 1.389965953607528e-05, + "loss": 0.2556, + "step": 1525 + }, + { + "epoch": 3.5201845444059976, + "grad_norm": 0.0, + "learning_rate": 1.38920050935976e-05, + "loss": 0.3679, + "step": 1526 + }, + { + "epoch": 3.522491349480969, + "grad_norm": 0.0, + "learning_rate": 1.3884347962746949e-05, + "loss": 0.3175, + "step": 1527 + }, + { + "epoch": 3.52479815455594, + "grad_norm": 0.0, + "learning_rate": 1.3876688148812428e-05, + "loss": 0.4343, + "step": 1528 + }, + { + "epoch": 3.527104959630911, + "grad_norm": 0.0, + "learning_rate": 1.3869025657084996e-05, + "loss": 0.3324, + "step": 1529 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.0, + "learning_rate": 1.3861360492857464e-05, + "loss": 0.3387, + "step": 1530 + }, + { + "epoch": 3.5317185697808533, + "grad_norm": 0.0, + "learning_rate": 1.3853692661424485e-05, + "loss": 0.3322, + "step": 1531 + }, + { + "epoch": 3.534025374855825, + "grad_norm": 0.0, + "learning_rate": 1.3846022168082553e-05, + "loss": 0.3218, + "step": 1532 + }, + { + "epoch": 3.536332179930796, + "grad_norm": 0.0, + "learning_rate": 1.3838349018130007e-05, + "loss": 0.3545, + "step": 1533 + }, + { + "epoch": 3.538638985005767, + "grad_norm": 0.0, + "learning_rate": 1.383067321686701e-05, + "loss": 0.396, + "step": 1534 + }, + { + "epoch": 3.5409457900807384, + "grad_norm": 0.0, + "learning_rate": 1.382299476959557e-05, + "loss": 0.3169, + "step": 1535 + }, + { + "epoch": 3.5432525951557095, + "grad_norm": 0.0, + "learning_rate": 1.3815313681619515e-05, + "loss": 0.4068, + "step": 1536 + }, + { + "epoch": 3.5455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.3807629958244498e-05, + "loss": 0.3084, + "step": 1537 + }, + { + "epoch": 3.5478662053056516, + "grad_norm": 0.0, + "learning_rate": 1.3799943604777993e-05, + "loss": 0.3517, + "step": 1538 + }, + { + "epoch": 3.5501730103806226, + "grad_norm": 0.0, + "learning_rate": 1.3792254626529286e-05, + "loss": 0.2759, + "step": 1539 + }, + { + "epoch": 3.552479815455594, + "grad_norm": 0.0, + "learning_rate": 1.3784563028809485e-05, + "loss": 0.2451, + "step": 1540 + }, + { + "epoch": 3.554786620530565, + "grad_norm": 0.0, + "learning_rate": 1.3776868816931501e-05, + "loss": 0.3406, + "step": 1541 + }, + { + "epoch": 3.557093425605536, + "grad_norm": 0.0, + "learning_rate": 1.3769171996210053e-05, + "loss": 0.296, + "step": 1542 + }, + { + "epoch": 3.5594002306805077, + "grad_norm": 0.0, + "learning_rate": 1.3761472571961664e-05, + "loss": 0.3114, + "step": 1543 + }, + { + "epoch": 3.5617070357554788, + "grad_norm": 0.0, + "learning_rate": 1.375377054950465e-05, + "loss": 0.2946, + "step": 1544 + }, + { + "epoch": 3.56401384083045, + "grad_norm": 0.0, + "learning_rate": 1.3746065934159123e-05, + "loss": 0.2878, + "step": 1545 + }, + { + "epoch": 3.566320645905421, + "grad_norm": 0.0, + "learning_rate": 1.3738358731246988e-05, + "loss": 0.3163, + "step": 1546 + }, + { + "epoch": 3.568627450980392, + "grad_norm": 0.0, + "learning_rate": 1.373064894609194e-05, + "loss": 0.3226, + "step": 1547 + }, + { + "epoch": 3.5709342560553634, + "grad_norm": 0.0, + "learning_rate": 1.3722936584019453e-05, + "loss": 0.2587, + "step": 1548 + }, + { + "epoch": 3.5732410611303345, + "grad_norm": 0.0, + "learning_rate": 1.371522165035678e-05, + "loss": 0.2326, + "step": 1549 + }, + { + "epoch": 3.5755478662053055, + "grad_norm": 0.0, + "learning_rate": 1.370750415043296e-05, + "loss": 0.2866, + "step": 1550 + }, + { + "epoch": 3.577854671280277, + "grad_norm": 0.0, + "learning_rate": 1.3699784089578791e-05, + "loss": 0.3296, + "step": 1551 + }, + { + "epoch": 3.580161476355248, + "grad_norm": 0.0, + "learning_rate": 1.3692061473126845e-05, + "loss": 0.2021, + "step": 1552 + }, + { + "epoch": 3.582468281430219, + "grad_norm": 0.0, + "learning_rate": 1.3684336306411467e-05, + "loss": 0.1763, + "step": 1553 + }, + { + "epoch": 3.58477508650519, + "grad_norm": 0.0, + "learning_rate": 1.3676608594768754e-05, + "loss": 0.3425, + "step": 1554 + }, + { + "epoch": 3.587081891580161, + "grad_norm": 0.0, + "learning_rate": 1.3668878343536562e-05, + "loss": 0.2775, + "step": 1555 + }, + { + "epoch": 3.5893886966551327, + "grad_norm": 0.0, + "learning_rate": 1.366114555805451e-05, + "loss": 0.2184, + "step": 1556 + }, + { + "epoch": 3.5916955017301038, + "grad_norm": 0.0, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.3801, + "step": 1557 + }, + { + "epoch": 3.594002306805075, + "grad_norm": 0.0, + "learning_rate": 1.3645672405708003e-05, + "loss": 0.3479, + "step": 1558 + }, + { + "epoch": 3.5963091118800463, + "grad_norm": 0.0, + "learning_rate": 1.3637932049531517e-05, + "loss": 0.332, + "step": 1559 + }, + { + "epoch": 3.5986159169550174, + "grad_norm": 0.0, + "learning_rate": 1.3630189180481083e-05, + "loss": 0.3743, + "step": 1560 + }, + { + "epoch": 3.6009227220299884, + "grad_norm": 0.0, + "learning_rate": 1.3622443803905028e-05, + "loss": 0.2975, + "step": 1561 + }, + { + "epoch": 3.6032295271049595, + "grad_norm": 0.0, + "learning_rate": 1.361469592515342e-05, + "loss": 0.4326, + "step": 1562 + }, + { + "epoch": 3.6055363321799305, + "grad_norm": 0.0, + "learning_rate": 1.3606945549578039e-05, + "loss": 0.2289, + "step": 1563 + }, + { + "epoch": 3.607843137254902, + "grad_norm": 0.0, + "learning_rate": 1.3599192682532398e-05, + "loss": 0.2092, + "step": 1564 + }, + { + "epoch": 3.610149942329873, + "grad_norm": 0.0, + "learning_rate": 1.3591437329371738e-05, + "loss": 0.2271, + "step": 1565 + }, + { + "epoch": 3.612456747404844, + "grad_norm": 0.0, + "learning_rate": 1.3583679495453e-05, + "loss": 0.2342, + "step": 1566 + }, + { + "epoch": 3.6147635524798156, + "grad_norm": 0.0, + "learning_rate": 1.3575919186134862e-05, + "loss": 0.2539, + "step": 1567 + }, + { + "epoch": 3.6170703575547867, + "grad_norm": 0.0, + "learning_rate": 1.3568156406777693e-05, + "loss": 0.4775, + "step": 1568 + }, + { + "epoch": 3.6193771626297577, + "grad_norm": 0.0, + "learning_rate": 1.356039116274357e-05, + "loss": 0.348, + "step": 1569 + }, + { + "epoch": 3.621683967704729, + "grad_norm": 0.0, + "learning_rate": 1.3552623459396279e-05, + "loss": 0.3697, + "step": 1570 + }, + { + "epoch": 3.6239907727797003, + "grad_norm": 0.0, + "learning_rate": 1.3544853302101302e-05, + "loss": 0.3499, + "step": 1571 + }, + { + "epoch": 3.6262975778546713, + "grad_norm": 0.0, + "learning_rate": 1.3537080696225815e-05, + "loss": 0.3591, + "step": 1572 + }, + { + "epoch": 3.6286043829296424, + "grad_norm": 0.0, + "learning_rate": 1.3529305647138689e-05, + "loss": 0.3687, + "step": 1573 + }, + { + "epoch": 3.630911188004614, + "grad_norm": 0.0, + "learning_rate": 1.3521528160210479e-05, + "loss": 0.3569, + "step": 1574 + }, + { + "epoch": 3.633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.3513748240813429e-05, + "loss": 0.1778, + "step": 1575 + }, + { + "epoch": 3.635524798154556, + "grad_norm": 0.0, + "learning_rate": 1.3505965894321453e-05, + "loss": 0.5023, + "step": 1576 + }, + { + "epoch": 3.637831603229527, + "grad_norm": 0.0, + "learning_rate": 1.349818112611015e-05, + "loss": 0.3319, + "step": 1577 + }, + { + "epoch": 3.640138408304498, + "grad_norm": 0.0, + "learning_rate": 1.3490393941556787e-05, + "loss": 0.3686, + "step": 1578 + }, + { + "epoch": 3.6424452133794696, + "grad_norm": 0.0, + "learning_rate": 1.348260434604031e-05, + "loss": 0.3395, + "step": 1579 + }, + { + "epoch": 3.6447520184544406, + "grad_norm": 0.0, + "learning_rate": 1.3474812344941315e-05, + "loss": 0.2645, + "step": 1580 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.0, + "learning_rate": 1.3467017943642074e-05, + "loss": 0.2106, + "step": 1581 + }, + { + "epoch": 3.649365628604383, + "grad_norm": 0.0, + "learning_rate": 1.3459221147526504e-05, + "loss": 0.3926, + "step": 1582 + }, + { + "epoch": 3.6516724336793542, + "grad_norm": 0.0, + "learning_rate": 1.3451421961980189e-05, + "loss": 0.3171, + "step": 1583 + }, + { + "epoch": 3.6539792387543253, + "grad_norm": 0.0, + "learning_rate": 1.3443620392390352e-05, + "loss": 0.3794, + "step": 1584 + }, + { + "epoch": 3.6562860438292963, + "grad_norm": 0.0, + "learning_rate": 1.3435816444145871e-05, + "loss": 0.3959, + "step": 1585 + }, + { + "epoch": 3.6585928489042674, + "grad_norm": 0.0, + "learning_rate": 1.3428010122637265e-05, + "loss": 0.293, + "step": 1586 + }, + { + "epoch": 3.660899653979239, + "grad_norm": 0.0, + "learning_rate": 1.342020143325669e-05, + "loss": 0.2619, + "step": 1587 + }, + { + "epoch": 3.66320645905421, + "grad_norm": 0.0, + "learning_rate": 1.3412390381397938e-05, + "loss": 0.2685, + "step": 1588 + }, + { + "epoch": 3.665513264129181, + "grad_norm": 0.0, + "learning_rate": 1.340457697245643e-05, + "loss": 0.2823, + "step": 1589 + }, + { + "epoch": 3.6678200692041525, + "grad_norm": 0.0, + "learning_rate": 1.3396761211829229e-05, + "loss": 0.2209, + "step": 1590 + }, + { + "epoch": 3.6701268742791235, + "grad_norm": 0.0, + "learning_rate": 1.3388943104915004e-05, + "loss": 0.2299, + "step": 1591 + }, + { + "epoch": 3.6724336793540946, + "grad_norm": 0.0, + "learning_rate": 1.3381122657114059e-05, + "loss": 0.3839, + "step": 1592 + }, + { + "epoch": 3.6747404844290656, + "grad_norm": 0.0, + "learning_rate": 1.3373299873828303e-05, + "loss": 0.2482, + "step": 1593 + }, + { + "epoch": 3.6770472895040367, + "grad_norm": 0.0, + "learning_rate": 1.3365474760461265e-05, + "loss": 0.285, + "step": 1594 + }, + { + "epoch": 3.679354094579008, + "grad_norm": 0.0, + "learning_rate": 1.3357647322418086e-05, + "loss": 0.323, + "step": 1595 + }, + { + "epoch": 3.6816608996539792, + "grad_norm": 0.0, + "learning_rate": 1.3349817565105507e-05, + "loss": 0.3931, + "step": 1596 + }, + { + "epoch": 3.6839677047289503, + "grad_norm": 0.0, + "learning_rate": 1.3341985493931877e-05, + "loss": 0.3993, + "step": 1597 + }, + { + "epoch": 3.686274509803922, + "grad_norm": 0.0, + "learning_rate": 1.3334151114307136e-05, + "loss": 0.3926, + "step": 1598 + }, + { + "epoch": 3.688581314878893, + "grad_norm": 0.0, + "learning_rate": 1.3326314431642821e-05, + "loss": 0.1268, + "step": 1599 + }, + { + "epoch": 3.690888119953864, + "grad_norm": 0.0, + "learning_rate": 1.3318475451352066e-05, + "loss": 0.334, + "step": 1600 + }, + { + "epoch": 3.693194925028835, + "grad_norm": 0.0, + "learning_rate": 1.3310634178849583e-05, + "loss": 0.3, + "step": 1601 + }, + { + "epoch": 3.695501730103806, + "grad_norm": 0.0, + "learning_rate": 1.3302790619551673e-05, + "loss": 0.1854, + "step": 1602 + }, + { + "epoch": 3.6978085351787775, + "grad_norm": 0.0, + "learning_rate": 1.3294944778876215e-05, + "loss": 0.2424, + "step": 1603 + }, + { + "epoch": 3.7001153402537486, + "grad_norm": 0.0, + "learning_rate": 1.3287096662242665e-05, + "loss": 0.2985, + "step": 1604 + }, + { + "epoch": 3.7024221453287196, + "grad_norm": 0.0, + "learning_rate": 1.3279246275072046e-05, + "loss": 0.2453, + "step": 1605 + }, + { + "epoch": 3.704728950403691, + "grad_norm": 0.0, + "learning_rate": 1.3271393622786957e-05, + "loss": 0.3666, + "step": 1606 + }, + { + "epoch": 3.707035755478662, + "grad_norm": 0.0, + "learning_rate": 1.3263538710811559e-05, + "loss": 0.3422, + "step": 1607 + }, + { + "epoch": 3.709342560553633, + "grad_norm": 0.0, + "learning_rate": 1.3255681544571568e-05, + "loss": 0.2995, + "step": 1608 + }, + { + "epoch": 3.7116493656286043, + "grad_norm": 0.0, + "learning_rate": 1.3247822129494265e-05, + "loss": 0.2882, + "step": 1609 + }, + { + "epoch": 3.7139561707035753, + "grad_norm": 0.0, + "learning_rate": 1.3239960471008484e-05, + "loss": 0.3378, + "step": 1610 + }, + { + "epoch": 3.716262975778547, + "grad_norm": 0.0, + "learning_rate": 1.3232096574544602e-05, + "loss": 0.261, + "step": 1611 + }, + { + "epoch": 3.718569780853518, + "grad_norm": 0.0, + "learning_rate": 1.3224230445534544e-05, + "loss": 0.243, + "step": 1612 + }, + { + "epoch": 3.720876585928489, + "grad_norm": 0.0, + "learning_rate": 1.3216362089411785e-05, + "loss": 0.2325, + "step": 1613 + }, + { + "epoch": 3.7231833910034604, + "grad_norm": 0.0, + "learning_rate": 1.320849151161133e-05, + "loss": 0.2039, + "step": 1614 + }, + { + "epoch": 3.7254901960784315, + "grad_norm": 0.0, + "learning_rate": 1.3200618717569716e-05, + "loss": 0.2406, + "step": 1615 + }, + { + "epoch": 3.7277970011534025, + "grad_norm": 0.0, + "learning_rate": 1.3192743712725022e-05, + "loss": 0.319, + "step": 1616 + }, + { + "epoch": 3.7301038062283736, + "grad_norm": 0.0, + "learning_rate": 1.3184866502516846e-05, + "loss": 0.283, + "step": 1617 + }, + { + "epoch": 3.7324106113033446, + "grad_norm": 0.0, + "learning_rate": 1.317698709238631e-05, + "loss": 0.2611, + "step": 1618 + }, + { + "epoch": 3.734717416378316, + "grad_norm": 0.0, + "learning_rate": 1.3169105487776057e-05, + "loss": 0.1818, + "step": 1619 + }, + { + "epoch": 3.737024221453287, + "grad_norm": 0.0, + "learning_rate": 1.3161221694130247e-05, + "loss": 0.1769, + "step": 1620 + }, + { + "epoch": 3.739331026528258, + "grad_norm": 0.0, + "learning_rate": 1.3153335716894543e-05, + "loss": 0.2327, + "step": 1621 + }, + { + "epoch": 3.7416378316032297, + "grad_norm": 0.0, + "learning_rate": 1.3145447561516138e-05, + "loss": 0.2544, + "step": 1622 + }, + { + "epoch": 3.7439446366782008, + "grad_norm": 0.0, + "learning_rate": 1.3137557233443707e-05, + "loss": 0.3829, + "step": 1623 + }, + { + "epoch": 3.746251441753172, + "grad_norm": 0.0, + "learning_rate": 1.3129664738127431e-05, + "loss": 0.3288, + "step": 1624 + }, + { + "epoch": 3.748558246828143, + "grad_norm": 0.0, + "learning_rate": 1.3121770081018998e-05, + "loss": 0.1589, + "step": 1625 + }, + { + "epoch": 3.750865051903114, + "grad_norm": 0.0, + "learning_rate": 1.3113873267571577e-05, + "loss": 0.2206, + "step": 1626 + }, + { + "epoch": 3.7531718569780854, + "grad_norm": 0.0, + "learning_rate": 1.3105974303239838e-05, + "loss": 0.433, + "step": 1627 + }, + { + "epoch": 3.7554786620530565, + "grad_norm": 0.0, + "learning_rate": 1.3098073193479929e-05, + "loss": 0.3526, + "step": 1628 + }, + { + "epoch": 3.7577854671280275, + "grad_norm": 0.0, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.204, + "step": 1629 + }, + { + "epoch": 3.760092272202999, + "grad_norm": 0.0, + "learning_rate": 1.3082264559507593e-05, + "loss": 0.2303, + "step": 1630 + }, + { + "epoch": 3.76239907727797, + "grad_norm": 0.0, + "learning_rate": 1.3074357046214865e-05, + "loss": 0.3435, + "step": 1631 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.0, + "learning_rate": 1.3066447409333345e-05, + "loss": 0.2011, + "step": 1632 + }, + { + "epoch": 3.767012687427912, + "grad_norm": 0.0, + "learning_rate": 1.3058535654326554e-05, + "loss": 0.4013, + "step": 1633 + }, + { + "epoch": 3.7693194925028837, + "grad_norm": 0.0, + "learning_rate": 1.305062178665948e-05, + "loss": 0.2058, + "step": 1634 + }, + { + "epoch": 3.7716262975778547, + "grad_norm": 0.0, + "learning_rate": 1.3042705811798565e-05, + "loss": 0.3435, + "step": 1635 + }, + { + "epoch": 3.7739331026528258, + "grad_norm": 0.0, + "learning_rate": 1.3034787735211708e-05, + "loss": 0.4149, + "step": 1636 + }, + { + "epoch": 3.7762399077277973, + "grad_norm": 0.0, + "learning_rate": 1.3026867562368262e-05, + "loss": 0.162, + "step": 1637 + }, + { + "epoch": 3.7785467128027683, + "grad_norm": 0.0, + "learning_rate": 1.3018945298739022e-05, + "loss": 0.2817, + "step": 1638 + }, + { + "epoch": 3.7808535178777394, + "grad_norm": 0.0, + "learning_rate": 1.3011020949796236e-05, + "loss": 0.4222, + "step": 1639 + }, + { + "epoch": 3.7831603229527104, + "grad_norm": 0.0, + "learning_rate": 1.3003094521013586e-05, + "loss": 0.3925, + "step": 1640 + }, + { + "epoch": 3.7854671280276815, + "grad_norm": 0.0, + "learning_rate": 1.2995166017866194e-05, + "loss": 0.3864, + "step": 1641 + }, + { + "epoch": 3.787773933102653, + "grad_norm": 0.0, + "learning_rate": 1.2987235445830612e-05, + "loss": 0.5532, + "step": 1642 + }, + { + "epoch": 3.790080738177624, + "grad_norm": 0.0, + "learning_rate": 1.297930281038482e-05, + "loss": 0.3028, + "step": 1643 + }, + { + "epoch": 3.792387543252595, + "grad_norm": 0.0, + "learning_rate": 1.2971368117008232e-05, + "loss": 0.2385, + "step": 1644 + }, + { + "epoch": 3.7946943483275666, + "grad_norm": 0.0, + "learning_rate": 1.2963431371181672e-05, + "loss": 0.3265, + "step": 1645 + }, + { + "epoch": 3.7970011534025376, + "grad_norm": 0.0, + "learning_rate": 1.295549257838739e-05, + "loss": 0.3159, + "step": 1646 + }, + { + "epoch": 3.7993079584775087, + "grad_norm": 0.0, + "learning_rate": 1.2947551744109044e-05, + "loss": 0.2964, + "step": 1647 + }, + { + "epoch": 3.8016147635524797, + "grad_norm": 0.0, + "learning_rate": 1.2939608873831708e-05, + "loss": 0.253, + "step": 1648 + }, + { + "epoch": 3.803921568627451, + "grad_norm": 0.0, + "learning_rate": 1.2931663973041855e-05, + "loss": 0.2996, + "step": 1649 + }, + { + "epoch": 3.8062283737024223, + "grad_norm": 0.0, + "learning_rate": 1.2923717047227368e-05, + "loss": 0.323, + "step": 1650 + }, + { + "epoch": 3.8085351787773933, + "grad_norm": 0.0, + "learning_rate": 1.2915768101877526e-05, + "loss": 0.3638, + "step": 1651 + }, + { + "epoch": 3.8108419838523644, + "grad_norm": 0.0, + "learning_rate": 1.2907817142483002e-05, + "loss": 0.2885, + "step": 1652 + }, + { + "epoch": 3.813148788927336, + "grad_norm": 0.0, + "learning_rate": 1.2899864174535863e-05, + "loss": 0.2691, + "step": 1653 + }, + { + "epoch": 3.815455594002307, + "grad_norm": 0.0, + "learning_rate": 1.2891909203529558e-05, + "loss": 0.2833, + "step": 1654 + }, + { + "epoch": 3.817762399077278, + "grad_norm": 0.0, + "learning_rate": 1.2883952234958921e-05, + "loss": 0.3069, + "step": 1655 + }, + { + "epoch": 3.820069204152249, + "grad_norm": 0.0, + "learning_rate": 1.2875993274320173e-05, + "loss": 0.4663, + "step": 1656 + }, + { + "epoch": 3.82237600922722, + "grad_norm": 0.0, + "learning_rate": 1.2868032327110904e-05, + "loss": 0.2144, + "step": 1657 + }, + { + "epoch": 3.8246828143021916, + "grad_norm": 0.0, + "learning_rate": 1.2860069398830075e-05, + "loss": 0.3179, + "step": 1658 + }, + { + "epoch": 3.8269896193771626, + "grad_norm": 0.0, + "learning_rate": 1.2852104494978024e-05, + "loss": 0.2847, + "step": 1659 + }, + { + "epoch": 3.8292964244521337, + "grad_norm": 0.0, + "learning_rate": 1.284413762105644e-05, + "loss": 0.2572, + "step": 1660 + }, + { + "epoch": 3.831603229527105, + "grad_norm": 0.0, + "learning_rate": 1.2836168782568385e-05, + "loss": 0.4057, + "step": 1661 + }, + { + "epoch": 3.8339100346020762, + "grad_norm": 0.0, + "learning_rate": 1.2828197985018276e-05, + "loss": 0.3066, + "step": 1662 + }, + { + "epoch": 3.8362168396770473, + "grad_norm": 0.0, + "learning_rate": 1.2820225233911877e-05, + "loss": 0.1876, + "step": 1663 + }, + { + "epoch": 3.8385236447520183, + "grad_norm": 0.0, + "learning_rate": 1.2812250534756307e-05, + "loss": 0.3396, + "step": 1664 + }, + { + "epoch": 3.8408304498269894, + "grad_norm": 0.0, + "learning_rate": 1.2804273893060028e-05, + "loss": 0.3979, + "step": 1665 + }, + { + "epoch": 3.843137254901961, + "grad_norm": 0.0, + "learning_rate": 1.2796295314332847e-05, + "loss": 0.273, + "step": 1666 + }, + { + "epoch": 3.845444059976932, + "grad_norm": 0.0, + "learning_rate": 1.2788314804085904e-05, + "loss": 0.1965, + "step": 1667 + }, + { + "epoch": 3.847750865051903, + "grad_norm": 0.0, + "learning_rate": 1.2780332367831678e-05, + "loss": 0.2138, + "step": 1668 + }, + { + "epoch": 3.8500576701268745, + "grad_norm": 0.0, + "learning_rate": 1.2772348011083973e-05, + "loss": 0.3155, + "step": 1669 + }, + { + "epoch": 3.8523644752018456, + "grad_norm": 0.0, + "learning_rate": 1.2764361739357925e-05, + "loss": 0.251, + "step": 1670 + }, + { + "epoch": 3.8546712802768166, + "grad_norm": 0.0, + "learning_rate": 1.2756373558169992e-05, + "loss": 0.3779, + "step": 1671 + }, + { + "epoch": 3.8569780853517877, + "grad_norm": 0.0, + "learning_rate": 1.2748383473037948e-05, + "loss": 0.3273, + "step": 1672 + }, + { + "epoch": 3.8592848904267587, + "grad_norm": 0.0, + "learning_rate": 1.2740391489480885e-05, + "loss": 0.3446, + "step": 1673 + }, + { + "epoch": 3.86159169550173, + "grad_norm": 0.0, + "learning_rate": 1.2732397613019203e-05, + "loss": 0.3339, + "step": 1674 + }, + { + "epoch": 3.8638985005767013, + "grad_norm": 0.0, + "learning_rate": 1.272440184917461e-05, + "loss": 0.401, + "step": 1675 + }, + { + "epoch": 3.8662053056516723, + "grad_norm": 0.0, + "learning_rate": 1.2716404203470121e-05, + "loss": 0.1701, + "step": 1676 + }, + { + "epoch": 3.868512110726644, + "grad_norm": 0.0, + "learning_rate": 1.2708404681430054e-05, + "loss": 0.3131, + "step": 1677 + }, + { + "epoch": 3.870818915801615, + "grad_norm": 0.0, + "learning_rate": 1.270040328858001e-05, + "loss": 0.3271, + "step": 1678 + }, + { + "epoch": 3.873125720876586, + "grad_norm": 0.0, + "learning_rate": 1.2692400030446895e-05, + "loss": 0.2178, + "step": 1679 + }, + { + "epoch": 3.875432525951557, + "grad_norm": 0.0, + "learning_rate": 1.2684394912558898e-05, + "loss": 0.1925, + "step": 1680 + }, + { + "epoch": 3.877739331026528, + "grad_norm": 0.0, + "learning_rate": 1.267638794044549e-05, + "loss": 0.3196, + "step": 1681 + }, + { + "epoch": 3.8800461361014995, + "grad_norm": 0.0, + "learning_rate": 1.266837911963743e-05, + "loss": 0.2587, + "step": 1682 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.0, + "learning_rate": 1.2660368455666752e-05, + "loss": 0.2855, + "step": 1683 + }, + { + "epoch": 3.8846597462514416, + "grad_norm": 0.0, + "learning_rate": 1.265235595406676e-05, + "loss": 0.3071, + "step": 1684 + }, + { + "epoch": 3.886966551326413, + "grad_norm": 0.0, + "learning_rate": 1.2644341620372025e-05, + "loss": 0.1994, + "step": 1685 + }, + { + "epoch": 3.889273356401384, + "grad_norm": 0.0, + "learning_rate": 1.2636325460118388e-05, + "loss": 0.4437, + "step": 1686 + }, + { + "epoch": 3.891580161476355, + "grad_norm": 0.0, + "learning_rate": 1.2628307478842955e-05, + "loss": 0.2628, + "step": 1687 + }, + { + "epoch": 3.8938869665513263, + "grad_norm": 0.0, + "learning_rate": 1.2620287682084082e-05, + "loss": 0.4151, + "step": 1688 + }, + { + "epoch": 3.8961937716262973, + "grad_norm": 0.0, + "learning_rate": 1.2612266075381385e-05, + "loss": 0.3714, + "step": 1689 + }, + { + "epoch": 3.898500576701269, + "grad_norm": 0.0, + "learning_rate": 1.2604242664275728e-05, + "loss": 0.2252, + "step": 1690 + }, + { + "epoch": 3.90080738177624, + "grad_norm": 0.0, + "learning_rate": 1.2596217454309216e-05, + "loss": 0.2019, + "step": 1691 + }, + { + "epoch": 3.903114186851211, + "grad_norm": 0.0, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.2288, + "step": 1692 + }, + { + "epoch": 3.9054209919261824, + "grad_norm": 0.0, + "learning_rate": 1.2580161659968294e-05, + "loss": 0.3035, + "step": 1693 + }, + { + "epoch": 3.9077277970011535, + "grad_norm": 0.0, + "learning_rate": 1.25721310866843e-05, + "loss": 0.2037, + "step": 1694 + }, + { + "epoch": 3.9100346020761245, + "grad_norm": 0.0, + "learning_rate": 1.2564098736720286e-05, + "loss": 0.2536, + "step": 1695 + }, + { + "epoch": 3.9123414071510956, + "grad_norm": 0.0, + "learning_rate": 1.2556064615624532e-05, + "loss": 0.347, + "step": 1696 + }, + { + "epoch": 3.9146482122260666, + "grad_norm": 0.0, + "learning_rate": 1.2548028728946548e-05, + "loss": 0.1558, + "step": 1697 + }, + { + "epoch": 3.916955017301038, + "grad_norm": 0.0, + "learning_rate": 1.2539991082237062e-05, + "loss": 0.28, + "step": 1698 + }, + { + "epoch": 3.919261822376009, + "grad_norm": 0.0, + "learning_rate": 1.253195168104802e-05, + "loss": 0.336, + "step": 1699 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.0, + "learning_rate": 1.2523910530932572e-05, + "loss": 0.3436, + "step": 1700 + }, + { + "epoch": 3.9238754325259517, + "grad_norm": 0.0, + "learning_rate": 1.2515867637445088e-05, + "loss": 0.2987, + "step": 1701 + }, + { + "epoch": 3.9261822376009228, + "grad_norm": 0.0, + "learning_rate": 1.2507823006141128e-05, + "loss": 0.4079, + "step": 1702 + }, + { + "epoch": 3.928489042675894, + "grad_norm": 0.0, + "learning_rate": 1.2499776642577465e-05, + "loss": 0.2848, + "step": 1703 + }, + { + "epoch": 3.930795847750865, + "grad_norm": 0.0, + "learning_rate": 1.2491728552312066e-05, + "loss": 0.2593, + "step": 1704 + }, + { + "epoch": 3.9331026528258364, + "grad_norm": 0.0, + "learning_rate": 1.2483678740904081e-05, + "loss": 0.3382, + "step": 1705 + }, + { + "epoch": 3.9354094579008074, + "grad_norm": 0.0, + "learning_rate": 1.2475627213913861e-05, + "loss": 0.2571, + "step": 1706 + }, + { + "epoch": 3.9377162629757785, + "grad_norm": 0.0, + "learning_rate": 1.2467573976902936e-05, + "loss": 0.2838, + "step": 1707 + }, + { + "epoch": 3.94002306805075, + "grad_norm": 0.0, + "learning_rate": 1.2459519035434023e-05, + "loss": 0.2974, + "step": 1708 + }, + { + "epoch": 3.942329873125721, + "grad_norm": 0.0, + "learning_rate": 1.2451462395071002e-05, + "loss": 0.2798, + "step": 1709 + }, + { + "epoch": 3.944636678200692, + "grad_norm": 0.0, + "learning_rate": 1.2443404061378941e-05, + "loss": 0.3008, + "step": 1710 + }, + { + "epoch": 3.946943483275663, + "grad_norm": 0.0, + "learning_rate": 1.2435344039924076e-05, + "loss": 0.5102, + "step": 1711 + }, + { + "epoch": 3.949250288350634, + "grad_norm": 0.0, + "learning_rate": 1.24272823362738e-05, + "loss": 0.3114, + "step": 1712 + }, + { + "epoch": 3.9515570934256057, + "grad_norm": 0.0, + "learning_rate": 1.2419218955996677e-05, + "loss": 0.3737, + "step": 1713 + }, + { + "epoch": 3.9538638985005767, + "grad_norm": 0.0, + "learning_rate": 1.241115390466243e-05, + "loss": 0.3145, + "step": 1714 + }, + { + "epoch": 3.956170703575548, + "grad_norm": 0.0, + "learning_rate": 1.240308718784192e-05, + "loss": 0.2322, + "step": 1715 + }, + { + "epoch": 3.9584775086505193, + "grad_norm": 0.0, + "learning_rate": 1.239501881110718e-05, + "loss": 0.3182, + "step": 1716 + }, + { + "epoch": 3.9607843137254903, + "grad_norm": 0.0, + "learning_rate": 1.238694878003138e-05, + "loss": 0.3454, + "step": 1717 + }, + { + "epoch": 3.9630911188004614, + "grad_norm": 0.0, + "learning_rate": 1.2378877100188827e-05, + "loss": 0.2624, + "step": 1718 + }, + { + "epoch": 3.9653979238754324, + "grad_norm": 0.0, + "learning_rate": 1.2370803777154976e-05, + "loss": 0.2159, + "step": 1719 + }, + { + "epoch": 3.9677047289504035, + "grad_norm": 0.0, + "learning_rate": 1.2362728816506418e-05, + "loss": 0.308, + "step": 1720 + }, + { + "epoch": 3.970011534025375, + "grad_norm": 0.0, + "learning_rate": 1.2354652223820858e-05, + "loss": 0.2822, + "step": 1721 + }, + { + "epoch": 3.972318339100346, + "grad_norm": 0.0, + "learning_rate": 1.2346574004677154e-05, + "loss": 0.3138, + "step": 1722 + }, + { + "epoch": 3.974625144175317, + "grad_norm": 0.0, + "learning_rate": 1.2338494164655267e-05, + "loss": 0.367, + "step": 1723 + }, + { + "epoch": 3.9769319492502886, + "grad_norm": 0.0, + "learning_rate": 1.233041270933629e-05, + "loss": 0.2529, + "step": 1724 + }, + { + "epoch": 3.9792387543252596, + "grad_norm": 0.0, + "learning_rate": 1.2322329644302426e-05, + "loss": 0.2852, + "step": 1725 + }, + { + "epoch": 3.9815455594002307, + "grad_norm": 0.0, + "learning_rate": 1.2314244975136989e-05, + "loss": 0.2602, + "step": 1726 + }, + { + "epoch": 3.9838523644752017, + "grad_norm": 0.0, + "learning_rate": 1.2306158707424402e-05, + "loss": 0.3142, + "step": 1727 + }, + { + "epoch": 3.986159169550173, + "grad_norm": 0.0, + "learning_rate": 1.2298070846750197e-05, + "loss": 0.2278, + "step": 1728 + }, + { + "epoch": 3.9884659746251443, + "grad_norm": 0.0, + "learning_rate": 1.2289981398700996e-05, + "loss": 0.4661, + "step": 1729 + }, + { + "epoch": 3.9907727797001153, + "grad_norm": 0.0, + "learning_rate": 1.228189036886453e-05, + "loss": 0.2752, + "step": 1730 + }, + { + "epoch": 3.9930795847750864, + "grad_norm": 0.0, + "learning_rate": 1.2273797762829615e-05, + "loss": 0.2883, + "step": 1731 + }, + { + "epoch": 3.995386389850058, + "grad_norm": 0.0, + "learning_rate": 1.2265703586186158e-05, + "loss": 0.2496, + "step": 1732 + }, + { + "epoch": 3.997693194925029, + "grad_norm": 0.0, + "learning_rate": 1.2257607844525145e-05, + "loss": 0.2548, + "step": 1733 + }, + { + "epoch": 4.0, + "grad_norm": 0.0, + "learning_rate": 1.2249510543438652e-05, + "loss": 0.425, + "step": 1734 + }, + { + "epoch": 4.002306805074971, + "grad_norm": 0.0, + "learning_rate": 1.2241411688519826e-05, + "loss": 0.1496, + "step": 1735 + }, + { + "epoch": 4.004613610149942, + "grad_norm": 0.0, + "learning_rate": 1.2233311285362895e-05, + "loss": 0.186, + "step": 1736 + }, + { + "epoch": 4.006920415224913, + "grad_norm": 0.0, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.2156, + "step": 1737 + }, + { + "epoch": 4.009227220299885, + "grad_norm": 0.0, + "learning_rate": 1.2217105856716938e-05, + "loss": 0.3354, + "step": 1738 + }, + { + "epoch": 4.011534025374856, + "grad_norm": 0.0, + "learning_rate": 1.220900084242169e-05, + "loss": 0.1923, + "step": 1739 + }, + { + "epoch": 4.013840830449827, + "grad_norm": 0.0, + "learning_rate": 1.2200894302275878e-05, + "loss": 0.2135, + "step": 1740 + }, + { + "epoch": 4.016147635524798, + "grad_norm": 0.0, + "learning_rate": 1.2192786241879033e-05, + "loss": 0.1802, + "step": 1741 + }, + { + "epoch": 4.018454440599769, + "grad_norm": 0.0, + "learning_rate": 1.2184676666831741e-05, + "loss": 0.1866, + "step": 1742 + }, + { + "epoch": 4.02076124567474, + "grad_norm": 0.0, + "learning_rate": 1.2176565582735624e-05, + "loss": 0.1554, + "step": 1743 + }, + { + "epoch": 4.023068050749711, + "grad_norm": 0.0, + "learning_rate": 1.2168452995193354e-05, + "loss": 0.2091, + "step": 1744 + }, + { + "epoch": 4.0253748558246825, + "grad_norm": 0.0, + "learning_rate": 1.216033890980864e-05, + "loss": 0.2339, + "step": 1745 + }, + { + "epoch": 4.027681660899654, + "grad_norm": 0.0, + "learning_rate": 1.2152223332186222e-05, + "loss": 0.1835, + "step": 1746 + }, + { + "epoch": 4.0299884659746255, + "grad_norm": 0.0, + "learning_rate": 1.2144106267931877e-05, + "loss": 0.1862, + "step": 1747 + }, + { + "epoch": 4.0322952710495965, + "grad_norm": 0.0, + "learning_rate": 1.2135987722652403e-05, + "loss": 0.1711, + "step": 1748 + }, + { + "epoch": 4.034602076124568, + "grad_norm": 0.0, + "learning_rate": 1.2127867701955622e-05, + "loss": 0.153, + "step": 1749 + }, + { + "epoch": 4.036908881199539, + "grad_norm": 0.0, + "learning_rate": 1.2119746211450382e-05, + "loss": 0.1654, + "step": 1750 + }, + { + "epoch": 4.03921568627451, + "grad_norm": 0.0, + "learning_rate": 1.2111623256746539e-05, + "loss": 0.2226, + "step": 1751 + }, + { + "epoch": 4.041522491349481, + "grad_norm": 0.0, + "learning_rate": 1.210349884345496e-05, + "loss": 0.25, + "step": 1752 + }, + { + "epoch": 4.043829296424452, + "grad_norm": 0.0, + "learning_rate": 1.2095372977187521e-05, + "loss": 0.1054, + "step": 1753 + }, + { + "epoch": 4.046136101499424, + "grad_norm": 0.0, + "learning_rate": 1.2087245663557108e-05, + "loss": 0.1807, + "step": 1754 + }, + { + "epoch": 4.048442906574395, + "grad_norm": 0.0, + "learning_rate": 1.2079116908177592e-05, + "loss": 0.1275, + "step": 1755 + }, + { + "epoch": 4.050749711649366, + "grad_norm": 0.0, + "learning_rate": 1.2070986716663864e-05, + "loss": 0.1872, + "step": 1756 + }, + { + "epoch": 4.053056516724337, + "grad_norm": 0.0, + "learning_rate": 1.2062855094631777e-05, + "loss": 0.1397, + "step": 1757 + }, + { + "epoch": 4.055363321799308, + "grad_norm": 0.0, + "learning_rate": 1.2054722047698192e-05, + "loss": 0.1035, + "step": 1758 + }, + { + "epoch": 4.057670126874279, + "grad_norm": 0.0, + "learning_rate": 1.2046587581480953e-05, + "loss": 0.1749, + "step": 1759 + }, + { + "epoch": 4.05997693194925, + "grad_norm": 0.0, + "learning_rate": 1.2038451701598879e-05, + "loss": 0.1698, + "step": 1760 + }, + { + "epoch": 4.062283737024221, + "grad_norm": 0.0, + "learning_rate": 1.2030314413671763e-05, + "loss": 0.1502, + "step": 1761 + }, + { + "epoch": 4.064590542099193, + "grad_norm": 0.0, + "learning_rate": 1.2022175723320382e-05, + "loss": 0.2349, + "step": 1762 + }, + { + "epoch": 4.066897347174164, + "grad_norm": 0.0, + "learning_rate": 1.2014035636166468e-05, + "loss": 0.1458, + "step": 1763 + }, + { + "epoch": 4.069204152249135, + "grad_norm": 0.0, + "learning_rate": 1.200589415783273e-05, + "loss": 0.2033, + "step": 1764 + }, + { + "epoch": 4.071510957324106, + "grad_norm": 0.0, + "learning_rate": 1.1997751293942828e-05, + "loss": 0.1418, + "step": 1765 + }, + { + "epoch": 4.073817762399077, + "grad_norm": 0.0, + "learning_rate": 1.1989607050121383e-05, + "loss": 0.2387, + "step": 1766 + }, + { + "epoch": 4.076124567474048, + "grad_norm": 0.0, + "learning_rate": 1.1981461431993978e-05, + "loss": 0.1648, + "step": 1767 + }, + { + "epoch": 4.078431372549019, + "grad_norm": 0.0, + "learning_rate": 1.1973314445187125e-05, + "loss": 0.1348, + "step": 1768 + }, + { + "epoch": 4.08073817762399, + "grad_norm": 0.0, + "learning_rate": 1.1965166095328302e-05, + "loss": 0.2018, + "step": 1769 + }, + { + "epoch": 4.083044982698962, + "grad_norm": 0.0, + "learning_rate": 1.1957016388045917e-05, + "loss": 0.2664, + "step": 1770 + }, + { + "epoch": 4.085351787773933, + "grad_norm": 0.0, + "learning_rate": 1.1948865328969317e-05, + "loss": 0.2344, + "step": 1771 + }, + { + "epoch": 4.087658592848904, + "grad_norm": 0.0, + "learning_rate": 1.1940712923728784e-05, + "loss": 0.1984, + "step": 1772 + }, + { + "epoch": 4.0899653979238755, + "grad_norm": 0.0, + "learning_rate": 1.1932559177955533e-05, + "loss": 0.1934, + "step": 1773 + }, + { + "epoch": 4.0922722029988465, + "grad_norm": 0.0, + "learning_rate": 1.1924404097281702e-05, + "loss": 0.1533, + "step": 1774 + }, + { + "epoch": 4.094579008073818, + "grad_norm": 0.0, + "learning_rate": 1.1916247687340348e-05, + "loss": 0.1125, + "step": 1775 + }, + { + "epoch": 4.096885813148789, + "grad_norm": 0.0, + "learning_rate": 1.190808995376545e-05, + "loss": 0.247, + "step": 1776 + }, + { + "epoch": 4.09919261822376, + "grad_norm": 0.0, + "learning_rate": 1.1899930902191904e-05, + "loss": 0.3058, + "step": 1777 + }, + { + "epoch": 4.101499423298732, + "grad_norm": 0.0, + "learning_rate": 1.1891770538255506e-05, + "loss": 0.1869, + "step": 1778 + }, + { + "epoch": 4.103806228373703, + "grad_norm": 0.0, + "learning_rate": 1.188360886759297e-05, + "loss": 0.2139, + "step": 1779 + }, + { + "epoch": 4.106113033448674, + "grad_norm": 0.0, + "learning_rate": 1.1875445895841911e-05, + "loss": 0.2488, + "step": 1780 + }, + { + "epoch": 4.108419838523645, + "grad_norm": 0.0, + "learning_rate": 1.1867281628640833e-05, + "loss": 0.1644, + "step": 1781 + }, + { + "epoch": 4.110726643598616, + "grad_norm": 0.0, + "learning_rate": 1.1859116071629148e-05, + "loss": 0.2598, + "step": 1782 + }, + { + "epoch": 4.113033448673587, + "grad_norm": 0.0, + "learning_rate": 1.1850949230447146e-05, + "loss": 0.106, + "step": 1783 + }, + { + "epoch": 4.115340253748558, + "grad_norm": 0.0, + "learning_rate": 1.1842781110736016e-05, + "loss": 0.1477, + "step": 1784 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.0, + "learning_rate": 1.1834611718137825e-05, + "loss": 0.2039, + "step": 1785 + }, + { + "epoch": 4.119953863898501, + "grad_norm": 0.0, + "learning_rate": 1.1826441058295514e-05, + "loss": 0.1712, + "step": 1786 + }, + { + "epoch": 4.122260668973472, + "grad_norm": 0.0, + "learning_rate": 1.181826913685291e-05, + "loss": 0.1901, + "step": 1787 + }, + { + "epoch": 4.124567474048443, + "grad_norm": 0.0, + "learning_rate": 1.18100959594547e-05, + "loss": 0.1155, + "step": 1788 + }, + { + "epoch": 4.126874279123414, + "grad_norm": 0.0, + "learning_rate": 1.1801921531746446e-05, + "loss": 0.2571, + "step": 1789 + }, + { + "epoch": 4.129181084198385, + "grad_norm": 0.0, + "learning_rate": 1.1793745859374575e-05, + "loss": 0.2007, + "step": 1790 + }, + { + "epoch": 4.131487889273356, + "grad_norm": 0.0, + "learning_rate": 1.1785568947986368e-05, + "loss": 0.1472, + "step": 1791 + }, + { + "epoch": 4.133794694348327, + "grad_norm": 0.0, + "learning_rate": 1.1777390803229964e-05, + "loss": 0.2193, + "step": 1792 + }, + { + "epoch": 4.136101499423299, + "grad_norm": 0.0, + "learning_rate": 1.1769211430754357e-05, + "loss": 0.1127, + "step": 1793 + }, + { + "epoch": 4.13840830449827, + "grad_norm": 0.0, + "learning_rate": 1.1761030836209384e-05, + "loss": 0.357, + "step": 1794 + }, + { + "epoch": 4.140715109573241, + "grad_norm": 0.0, + "learning_rate": 1.1752849025245727e-05, + "loss": 0.1703, + "step": 1795 + }, + { + "epoch": 4.143021914648212, + "grad_norm": 0.0, + "learning_rate": 1.1744666003514916e-05, + "loss": 0.2676, + "step": 1796 + }, + { + "epoch": 4.145328719723183, + "grad_norm": 0.0, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.1177, + "step": 1797 + }, + { + "epoch": 4.1476355247981544, + "grad_norm": 0.0, + "learning_rate": 1.172829635036209e-05, + "loss": 0.2378, + "step": 1798 + }, + { + "epoch": 4.1499423298731255, + "grad_norm": 0.0, + "learning_rate": 1.1720109730247291e-05, + "loss": 0.1675, + "step": 1799 + }, + { + "epoch": 4.1522491349480966, + "grad_norm": 0.0, + "learning_rate": 1.1711921921979754e-05, + "loss": 0.1615, + "step": 1800 + }, + { + "epoch": 4.154555940023068, + "grad_norm": 0.0, + "learning_rate": 1.1703732931215141e-05, + "loss": 0.2301, + "step": 1801 + }, + { + "epoch": 4.1568627450980395, + "grad_norm": 0.0, + "learning_rate": 1.1695542763609944e-05, + "loss": 0.1164, + "step": 1802 + }, + { + "epoch": 4.159169550173011, + "grad_norm": 0.0, + "learning_rate": 1.1687351424821448e-05, + "loss": 0.2795, + "step": 1803 + }, + { + "epoch": 4.161476355247982, + "grad_norm": 0.0, + "learning_rate": 1.1679158920507773e-05, + "loss": 0.2225, + "step": 1804 + }, + { + "epoch": 4.163783160322953, + "grad_norm": 0.0, + "learning_rate": 1.1670965256327818e-05, + "loss": 0.1073, + "step": 1805 + }, + { + "epoch": 4.166089965397924, + "grad_norm": 0.0, + "learning_rate": 1.1662770437941293e-05, + "loss": 0.1782, + "step": 1806 + }, + { + "epoch": 4.168396770472895, + "grad_norm": 0.0, + "learning_rate": 1.1654574471008712e-05, + "loss": 0.2662, + "step": 1807 + }, + { + "epoch": 4.170703575547866, + "grad_norm": 0.0, + "learning_rate": 1.1646377361191379e-05, + "loss": 0.2599, + "step": 1808 + }, + { + "epoch": 4.173010380622838, + "grad_norm": 0.0, + "learning_rate": 1.1638179114151378e-05, + "loss": 0.199, + "step": 1809 + }, + { + "epoch": 4.175317185697809, + "grad_norm": 0.0, + "learning_rate": 1.1629979735551592e-05, + "loss": 0.2143, + "step": 1810 + }, + { + "epoch": 4.17762399077278, + "grad_norm": 0.0, + "learning_rate": 1.1621779231055677e-05, + "loss": 0.1818, + "step": 1811 + }, + { + "epoch": 4.179930795847751, + "grad_norm": 0.0, + "learning_rate": 1.1613577606328068e-05, + "loss": 0.1285, + "step": 1812 + }, + { + "epoch": 4.182237600922722, + "grad_norm": 0.0, + "learning_rate": 1.1605374867033978e-05, + "loss": 0.1335, + "step": 1813 + }, + { + "epoch": 4.184544405997693, + "grad_norm": 0.0, + "learning_rate": 1.1597171018839384e-05, + "loss": 0.1636, + "step": 1814 + }, + { + "epoch": 4.186851211072664, + "grad_norm": 0.0, + "learning_rate": 1.1588966067411033e-05, + "loss": 0.1519, + "step": 1815 + }, + { + "epoch": 4.189158016147635, + "grad_norm": 0.0, + "learning_rate": 1.1580760018416434e-05, + "loss": 0.1565, + "step": 1816 + }, + { + "epoch": 4.191464821222607, + "grad_norm": 0.0, + "learning_rate": 1.1572552877523855e-05, + "loss": 0.1605, + "step": 1817 + }, + { + "epoch": 4.193771626297578, + "grad_norm": 0.0, + "learning_rate": 1.156434465040231e-05, + "loss": 0.1823, + "step": 1818 + }, + { + "epoch": 4.196078431372549, + "grad_norm": 0.0, + "learning_rate": 1.1556135342721575e-05, + "loss": 0.1897, + "step": 1819 + }, + { + "epoch": 4.19838523644752, + "grad_norm": 0.0, + "learning_rate": 1.1547924960152162e-05, + "loss": 0.1674, + "step": 1820 + }, + { + "epoch": 4.200692041522491, + "grad_norm": 0.0, + "learning_rate": 1.1539713508365336e-05, + "loss": 0.1819, + "step": 1821 + }, + { + "epoch": 4.202998846597462, + "grad_norm": 0.0, + "learning_rate": 1.1531500993033094e-05, + "loss": 0.2221, + "step": 1822 + }, + { + "epoch": 4.205305651672433, + "grad_norm": 0.0, + "learning_rate": 1.1523287419828165e-05, + "loss": 0.2248, + "step": 1823 + }, + { + "epoch": 4.2076124567474045, + "grad_norm": 0.0, + "learning_rate": 1.1515072794424013e-05, + "loss": 0.1347, + "step": 1824 + }, + { + "epoch": 4.209919261822376, + "grad_norm": 0.0, + "learning_rate": 1.1506857122494832e-05, + "loss": 0.1759, + "step": 1825 + }, + { + "epoch": 4.2122260668973475, + "grad_norm": 0.0, + "learning_rate": 1.1498640409715532e-05, + "loss": 0.1484, + "step": 1826 + }, + { + "epoch": 4.2145328719723185, + "grad_norm": 0.0, + "learning_rate": 1.1490422661761744e-05, + "loss": 0.235, + "step": 1827 + }, + { + "epoch": 4.21683967704729, + "grad_norm": 0.0, + "learning_rate": 1.148220388430982e-05, + "loss": 0.2536, + "step": 1828 + }, + { + "epoch": 4.219146482122261, + "grad_norm": 0.0, + "learning_rate": 1.1473984083036813e-05, + "loss": 0.1628, + "step": 1829 + }, + { + "epoch": 4.221453287197232, + "grad_norm": 0.0, + "learning_rate": 1.146576326362049e-05, + "loss": 0.2838, + "step": 1830 + }, + { + "epoch": 4.223760092272203, + "grad_norm": 0.0, + "learning_rate": 1.1457541431739321e-05, + "loss": 0.1258, + "step": 1831 + }, + { + "epoch": 4.226066897347174, + "grad_norm": 0.0, + "learning_rate": 1.1449318593072468e-05, + "loss": 0.2008, + "step": 1832 + }, + { + "epoch": 4.228373702422146, + "grad_norm": 0.0, + "learning_rate": 1.1441094753299802e-05, + "loss": 0.1424, + "step": 1833 + }, + { + "epoch": 4.230680507497117, + "grad_norm": 0.0, + "learning_rate": 1.1432869918101877e-05, + "loss": 0.2017, + "step": 1834 + }, + { + "epoch": 4.232987312572088, + "grad_norm": 0.0, + "learning_rate": 1.142464409315993e-05, + "loss": 0.1637, + "step": 1835 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.0, + "learning_rate": 1.1416417284155892e-05, + "loss": 0.2572, + "step": 1836 + }, + { + "epoch": 4.23760092272203, + "grad_norm": 0.0, + "learning_rate": 1.1408189496772369e-05, + "loss": 0.123, + "step": 1837 + }, + { + "epoch": 4.239907727797001, + "grad_norm": 0.0, + "learning_rate": 1.1399960736692637e-05, + "loss": 0.1639, + "step": 1838 + }, + { + "epoch": 4.242214532871972, + "grad_norm": 0.0, + "learning_rate": 1.1391731009600655e-05, + "loss": 0.1802, + "step": 1839 + }, + { + "epoch": 4.244521337946943, + "grad_norm": 0.0, + "learning_rate": 1.1383500321181045e-05, + "loss": 0.1952, + "step": 1840 + }, + { + "epoch": 4.246828143021915, + "grad_norm": 0.0, + "learning_rate": 1.1375268677119089e-05, + "loss": 0.1144, + "step": 1841 + }, + { + "epoch": 4.249134948096886, + "grad_norm": 0.0, + "learning_rate": 1.1367036083100735e-05, + "loss": 0.1814, + "step": 1842 + }, + { + "epoch": 4.251441753171857, + "grad_norm": 0.0, + "learning_rate": 1.1358802544812584e-05, + "loss": 0.1483, + "step": 1843 + }, + { + "epoch": 4.253748558246828, + "grad_norm": 0.0, + "learning_rate": 1.135056806794189e-05, + "loss": 0.171, + "step": 1844 + }, + { + "epoch": 4.256055363321799, + "grad_norm": 0.0, + "learning_rate": 1.1342332658176556e-05, + "loss": 0.2132, + "step": 1845 + }, + { + "epoch": 4.25836216839677, + "grad_norm": 0.0, + "learning_rate": 1.1334096321205129e-05, + "loss": 0.2036, + "step": 1846 + }, + { + "epoch": 4.260668973471741, + "grad_norm": 0.0, + "learning_rate": 1.1325859062716795e-05, + "loss": 0.1242, + "step": 1847 + }, + { + "epoch": 4.262975778546712, + "grad_norm": 0.0, + "learning_rate": 1.1317620888401379e-05, + "loss": 0.1525, + "step": 1848 + }, + { + "epoch": 4.265282583621684, + "grad_norm": 0.0, + "learning_rate": 1.1309381803949333e-05, + "loss": 0.1445, + "step": 1849 + }, + { + "epoch": 4.267589388696655, + "grad_norm": 0.0, + "learning_rate": 1.1301141815051751e-05, + "loss": 0.1457, + "step": 1850 + }, + { + "epoch": 4.269896193771626, + "grad_norm": 0.0, + "learning_rate": 1.1292900927400334e-05, + "loss": 0.1906, + "step": 1851 + }, + { + "epoch": 4.2722029988465975, + "grad_norm": 0.0, + "learning_rate": 1.1284659146687416e-05, + "loss": 0.1612, + "step": 1852 + }, + { + "epoch": 4.2745098039215685, + "grad_norm": 0.0, + "learning_rate": 1.127641647860595e-05, + "loss": 0.162, + "step": 1853 + }, + { + "epoch": 4.27681660899654, + "grad_norm": 0.0, + "learning_rate": 1.1268172928849486e-05, + "loss": 0.1714, + "step": 1854 + }, + { + "epoch": 4.279123414071511, + "grad_norm": 0.0, + "learning_rate": 1.1259928503112199e-05, + "loss": 0.2213, + "step": 1855 + }, + { + "epoch": 4.281430219146483, + "grad_norm": 0.0, + "learning_rate": 1.1251683207088862e-05, + "loss": 0.2243, + "step": 1856 + }, + { + "epoch": 4.283737024221454, + "grad_norm": 0.0, + "learning_rate": 1.1243437046474854e-05, + "loss": 0.3105, + "step": 1857 + }, + { + "epoch": 4.286043829296425, + "grad_norm": 0.0, + "learning_rate": 1.1235190026966142e-05, + "loss": 0.1343, + "step": 1858 + }, + { + "epoch": 4.288350634371396, + "grad_norm": 0.0, + "learning_rate": 1.1226942154259302e-05, + "loss": 0.1594, + "step": 1859 + }, + { + "epoch": 4.290657439446367, + "grad_norm": 0.0, + "learning_rate": 1.1218693434051475e-05, + "loss": 0.1459, + "step": 1860 + }, + { + "epoch": 4.292964244521338, + "grad_norm": 0.0, + "learning_rate": 1.1210443872040416e-05, + "loss": 0.2067, + "step": 1861 + }, + { + "epoch": 4.295271049596309, + "grad_norm": 0.0, + "learning_rate": 1.120219347392444e-05, + "loss": 0.2437, + "step": 1862 + }, + { + "epoch": 4.29757785467128, + "grad_norm": 0.0, + "learning_rate": 1.1193942245402443e-05, + "loss": 0.1786, + "step": 1863 + }, + { + "epoch": 4.299884659746251, + "grad_norm": 0.0, + "learning_rate": 1.1185690192173908e-05, + "loss": 0.2092, + "step": 1864 + }, + { + "epoch": 4.302191464821223, + "grad_norm": 0.0, + "learning_rate": 1.1177437319938874e-05, + "loss": 0.2423, + "step": 1865 + }, + { + "epoch": 4.304498269896194, + "grad_norm": 0.0, + "learning_rate": 1.1169183634397948e-05, + "loss": 0.2105, + "step": 1866 + }, + { + "epoch": 4.306805074971165, + "grad_norm": 0.0, + "learning_rate": 1.1160929141252303e-05, + "loss": 0.26, + "step": 1867 + }, + { + "epoch": 4.309111880046136, + "grad_norm": 0.0, + "learning_rate": 1.1152673846203668e-05, + "loss": 0.2862, + "step": 1868 + }, + { + "epoch": 4.311418685121107, + "grad_norm": 0.0, + "learning_rate": 1.114441775495432e-05, + "loss": 0.1387, + "step": 1869 + }, + { + "epoch": 4.313725490196078, + "grad_norm": 0.0, + "learning_rate": 1.1136160873207098e-05, + "loss": 0.1655, + "step": 1870 + }, + { + "epoch": 4.316032295271049, + "grad_norm": 0.0, + "learning_rate": 1.1127903206665379e-05, + "loss": 0.1731, + "step": 1871 + }, + { + "epoch": 4.318339100346021, + "grad_norm": 0.0, + "learning_rate": 1.1119644761033079e-05, + "loss": 0.1818, + "step": 1872 + }, + { + "epoch": 4.320645905420992, + "grad_norm": 0.0, + "learning_rate": 1.1111385542014662e-05, + "loss": 0.2117, + "step": 1873 + }, + { + "epoch": 4.322952710495963, + "grad_norm": 0.0, + "learning_rate": 1.110312555531512e-05, + "loss": 0.1936, + "step": 1874 + }, + { + "epoch": 4.325259515570934, + "grad_norm": 0.0, + "learning_rate": 1.1094864806639971e-05, + "loss": 0.1514, + "step": 1875 + }, + { + "epoch": 4.327566320645905, + "grad_norm": 0.0, + "learning_rate": 1.1086603301695268e-05, + "loss": 0.2077, + "step": 1876 + }, + { + "epoch": 4.3298731257208765, + "grad_norm": 0.0, + "learning_rate": 1.1078341046187588e-05, + "loss": 0.2069, + "step": 1877 + }, + { + "epoch": 4.3321799307958475, + "grad_norm": 0.0, + "learning_rate": 1.1070078045824014e-05, + "loss": 0.2248, + "step": 1878 + }, + { + "epoch": 4.334486735870819, + "grad_norm": 0.0, + "learning_rate": 1.1061814306312153e-05, + "loss": 0.1682, + "step": 1879 + }, + { + "epoch": 4.33679354094579, + "grad_norm": 0.0, + "learning_rate": 1.1053549833360117e-05, + "loss": 0.1939, + "step": 1880 + }, + { + "epoch": 4.339100346020762, + "grad_norm": 0.0, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.134, + "step": 1881 + }, + { + "epoch": 4.341407151095733, + "grad_norm": 0.0, + "learning_rate": 1.1037018709970528e-05, + "loss": 0.2189, + "step": 1882 + }, + { + "epoch": 4.343713956170704, + "grad_norm": 0.0, + "learning_rate": 1.102875207095172e-05, + "loss": 0.2714, + "step": 1883 + }, + { + "epoch": 4.346020761245675, + "grad_norm": 0.0, + "learning_rate": 1.1020484721330227e-05, + "loss": 0.1596, + "step": 1884 + }, + { + "epoch": 4.348327566320646, + "grad_norm": 0.0, + "learning_rate": 1.101221666681666e-05, + "loss": 0.2115, + "step": 1885 + }, + { + "epoch": 4.350634371395617, + "grad_norm": 0.0, + "learning_rate": 1.1003947913122112e-05, + "loss": 0.2636, + "step": 1886 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.0, + "learning_rate": 1.0995678465958168e-05, + "loss": 0.2805, + "step": 1887 + }, + { + "epoch": 4.35524798154556, + "grad_norm": 0.0, + "learning_rate": 1.0987408331036879e-05, + "loss": 0.1807, + "step": 1888 + }, + { + "epoch": 4.357554786620531, + "grad_norm": 0.0, + "learning_rate": 1.0979137514070783e-05, + "loss": 0.1868, + "step": 1889 + }, + { + "epoch": 4.359861591695502, + "grad_norm": 0.0, + "learning_rate": 1.0970866020772884e-05, + "loss": 0.1891, + "step": 1890 + }, + { + "epoch": 4.362168396770473, + "grad_norm": 0.0, + "learning_rate": 1.0962593856856649e-05, + "loss": 0.1667, + "step": 1891 + }, + { + "epoch": 4.364475201845444, + "grad_norm": 0.0, + "learning_rate": 1.0954321028036013e-05, + "loss": 0.1216, + "step": 1892 + }, + { + "epoch": 4.366782006920415, + "grad_norm": 0.0, + "learning_rate": 1.0946047540025373e-05, + "loss": 0.0782, + "step": 1893 + }, + { + "epoch": 4.369088811995386, + "grad_norm": 0.0, + "learning_rate": 1.0937773398539578e-05, + "loss": 0.1923, + "step": 1894 + }, + { + "epoch": 4.371395617070357, + "grad_norm": 0.0, + "learning_rate": 1.0929498609293925e-05, + "loss": 0.0838, + "step": 1895 + }, + { + "epoch": 4.373702422145329, + "grad_norm": 0.0, + "learning_rate": 1.0921223178004163e-05, + "loss": 0.1566, + "step": 1896 + }, + { + "epoch": 4.3760092272203, + "grad_norm": 0.0, + "learning_rate": 1.0912947110386484e-05, + "loss": 0.1836, + "step": 1897 + }, + { + "epoch": 4.378316032295271, + "grad_norm": 0.0, + "learning_rate": 1.0904670412157522e-05, + "loss": 0.2738, + "step": 1898 + }, + { + "epoch": 4.380622837370242, + "grad_norm": 0.0, + "learning_rate": 1.0896393089034336e-05, + "loss": 0.1616, + "step": 1899 + }, + { + "epoch": 4.382929642445213, + "grad_norm": 0.0, + "learning_rate": 1.088811514673443e-05, + "loss": 0.1479, + "step": 1900 + }, + { + "epoch": 4.385236447520184, + "grad_norm": 0.0, + "learning_rate": 1.0879836590975732e-05, + "loss": 0.203, + "step": 1901 + }, + { + "epoch": 4.387543252595155, + "grad_norm": 0.0, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.2499, + "step": 1902 + }, + { + "epoch": 4.3898500576701265, + "grad_norm": 0.0, + "learning_rate": 1.0863277661955757e-05, + "loss": 0.1802, + "step": 1903 + }, + { + "epoch": 4.392156862745098, + "grad_norm": 0.0, + "learning_rate": 1.0854997300132444e-05, + "loss": 0.2076, + "step": 1904 + }, + { + "epoch": 4.3944636678200695, + "grad_norm": 0.0, + "learning_rate": 1.0846716347726233e-05, + "loss": 0.2188, + "step": 1905 + }, + { + "epoch": 4.3967704728950405, + "grad_norm": 0.0, + "learning_rate": 1.0838434810457132e-05, + "loss": 0.3145, + "step": 1906 + }, + { + "epoch": 4.399077277970012, + "grad_norm": 0.0, + "learning_rate": 1.0830152694045553e-05, + "loss": 0.1724, + "step": 1907 + }, + { + "epoch": 4.401384083044983, + "grad_norm": 0.0, + "learning_rate": 1.0821870004212305e-05, + "loss": 0.2174, + "step": 1908 + }, + { + "epoch": 4.403690888119954, + "grad_norm": 0.0, + "learning_rate": 1.0813586746678584e-05, + "loss": 0.1869, + "step": 1909 + }, + { + "epoch": 4.405997693194925, + "grad_norm": 0.0, + "learning_rate": 1.0805302927165996e-05, + "loss": 0.1213, + "step": 1910 + }, + { + "epoch": 4.408304498269896, + "grad_norm": 0.0, + "learning_rate": 1.0797018551396527e-05, + "loss": 0.1203, + "step": 1911 + }, + { + "epoch": 4.410611303344868, + "grad_norm": 0.0, + "learning_rate": 1.078873362509254e-05, + "loss": 0.1391, + "step": 1912 + }, + { + "epoch": 4.412918108419839, + "grad_norm": 0.0, + "learning_rate": 1.0780448153976792e-05, + "loss": 0.1427, + "step": 1913 + }, + { + "epoch": 4.41522491349481, + "grad_norm": 0.0, + "learning_rate": 1.0772162143772407e-05, + "loss": 0.1715, + "step": 1914 + }, + { + "epoch": 4.417531718569781, + "grad_norm": 0.0, + "learning_rate": 1.076387560020288e-05, + "loss": 0.262, + "step": 1915 + }, + { + "epoch": 4.419838523644752, + "grad_norm": 0.0, + "learning_rate": 1.0755588528992082e-05, + "loss": 0.1749, + "step": 1916 + }, + { + "epoch": 4.422145328719723, + "grad_norm": 0.0, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.1609, + "step": 1917 + }, + { + "epoch": 4.424452133794694, + "grad_norm": 0.0, + "learning_rate": 1.0739012826543955e-05, + "loss": 0.1175, + "step": 1918 + }, + { + "epoch": 4.426758938869666, + "grad_norm": 0.0, + "learning_rate": 1.073072420675617e-05, + "loss": 0.2036, + "step": 1919 + }, + { + "epoch": 4.429065743944637, + "grad_norm": 0.0, + "learning_rate": 1.0722435082226186e-05, + "loss": 0.1697, + "step": 1920 + }, + { + "epoch": 4.431372549019608, + "grad_norm": 0.0, + "learning_rate": 1.071414545867965e-05, + "loss": 0.1323, + "step": 1921 + }, + { + "epoch": 4.433679354094579, + "grad_norm": 0.0, + "learning_rate": 1.0705855341842564e-05, + "loss": 0.2009, + "step": 1922 + }, + { + "epoch": 4.43598615916955, + "grad_norm": 0.0, + "learning_rate": 1.0697564737441254e-05, + "loss": 0.2231, + "step": 1923 + }, + { + "epoch": 4.438292964244521, + "grad_norm": 0.0, + "learning_rate": 1.0689273651202398e-05, + "loss": 0.184, + "step": 1924 + }, + { + "epoch": 4.440599769319492, + "grad_norm": 0.0, + "learning_rate": 1.0680982088853003e-05, + "loss": 0.2049, + "step": 1925 + }, + { + "epoch": 4.442906574394463, + "grad_norm": 0.0, + "learning_rate": 1.0672690056120398e-05, + "loss": 0.2426, + "step": 1926 + }, + { + "epoch": 4.445213379469434, + "grad_norm": 0.0, + "learning_rate": 1.0664397558732245e-05, + "loss": 0.1243, + "step": 1927 + }, + { + "epoch": 4.447520184544406, + "grad_norm": 0.0, + "learning_rate": 1.0656104602416519e-05, + "loss": 0.2568, + "step": 1928 + }, + { + "epoch": 4.449826989619377, + "grad_norm": 0.0, + "learning_rate": 1.0647811192901518e-05, + "loss": 0.1157, + "step": 1929 + }, + { + "epoch": 4.4521337946943484, + "grad_norm": 0.0, + "learning_rate": 1.0639517335915857e-05, + "loss": 0.2157, + "step": 1930 + }, + { + "epoch": 4.4544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.063122303718845e-05, + "loss": 0.1577, + "step": 1931 + }, + { + "epoch": 4.4567474048442905, + "grad_norm": 0.0, + "learning_rate": 1.0622928302448523e-05, + "loss": 0.1248, + "step": 1932 + }, + { + "epoch": 4.459054209919262, + "grad_norm": 0.0, + "learning_rate": 1.0614633137425599e-05, + "loss": 0.236, + "step": 1933 + }, + { + "epoch": 4.461361014994233, + "grad_norm": 0.0, + "learning_rate": 1.06063375478495e-05, + "loss": 0.138, + "step": 1934 + }, + { + "epoch": 4.463667820069205, + "grad_norm": 0.0, + "learning_rate": 1.0598041539450344e-05, + "loss": 0.1874, + "step": 1935 + }, + { + "epoch": 4.465974625144176, + "grad_norm": 0.0, + "learning_rate": 1.0589745117958533e-05, + "loss": 0.15, + "step": 1936 + }, + { + "epoch": 4.468281430219147, + "grad_norm": 0.0, + "learning_rate": 1.0581448289104759e-05, + "loss": 0.3162, + "step": 1937 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.0, + "learning_rate": 1.0573151058619994e-05, + "loss": 0.2448, + "step": 1938 + }, + { + "epoch": 4.472895040369089, + "grad_norm": 0.0, + "learning_rate": 1.0564853432235486e-05, + "loss": 0.1195, + "step": 1939 + }, + { + "epoch": 4.47520184544406, + "grad_norm": 0.0, + "learning_rate": 1.0556555415682757e-05, + "loss": 0.2061, + "step": 1940 + }, + { + "epoch": 4.477508650519031, + "grad_norm": 0.0, + "learning_rate": 1.0548257014693602e-05, + "loss": 0.2662, + "step": 1941 + }, + { + "epoch": 4.479815455594002, + "grad_norm": 0.0, + "learning_rate": 1.0539958235000075e-05, + "loss": 0.1564, + "step": 1942 + }, + { + "epoch": 4.482122260668973, + "grad_norm": 0.0, + "learning_rate": 1.0531659082334495e-05, + "loss": 0.2876, + "step": 1943 + }, + { + "epoch": 4.484429065743945, + "grad_norm": 0.0, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.2131, + "step": 1944 + }, + { + "epoch": 4.486735870818916, + "grad_norm": 0.0, + "learning_rate": 1.051505968101774e-05, + "loss": 0.1728, + "step": 1945 + }, + { + "epoch": 4.489042675893887, + "grad_norm": 0.0, + "learning_rate": 1.0506759443832474e-05, + "loss": 0.1007, + "step": 1946 + }, + { + "epoch": 4.491349480968858, + "grad_norm": 0.0, + "learning_rate": 1.0498458856606972e-05, + "loss": 0.2064, + "step": 1947 + }, + { + "epoch": 4.493656286043829, + "grad_norm": 0.0, + "learning_rate": 1.04901579250748e-05, + "loss": 0.2708, + "step": 1948 + }, + { + "epoch": 4.4959630911188, + "grad_norm": 0.0, + "learning_rate": 1.0481856654969758e-05, + "loss": 0.2141, + "step": 1949 + }, + { + "epoch": 4.498269896193771, + "grad_norm": 0.0, + "learning_rate": 1.0473555052025893e-05, + "loss": 0.1775, + "step": 1950 + }, + { + "epoch": 4.500576701268743, + "grad_norm": 0.0, + "learning_rate": 1.046525312197747e-05, + "loss": 0.2152, + "step": 1951 + }, + { + "epoch": 4.502883506343714, + "grad_norm": 0.0, + "learning_rate": 1.0456950870558982e-05, + "loss": 0.2013, + "step": 1952 + }, + { + "epoch": 4.505190311418685, + "grad_norm": 0.0, + "learning_rate": 1.044864830350515e-05, + "loss": 0.1709, + "step": 1953 + }, + { + "epoch": 4.507497116493656, + "grad_norm": 0.0, + "learning_rate": 1.044034542655091e-05, + "loss": 0.2135, + "step": 1954 + }, + { + "epoch": 4.509803921568627, + "grad_norm": 0.0, + "learning_rate": 1.0432042245431406e-05, + "loss": 0.1422, + "step": 1955 + }, + { + "epoch": 4.5121107266435985, + "grad_norm": 0.0, + "learning_rate": 1.0423738765882006e-05, + "loss": 0.1892, + "step": 1956 + }, + { + "epoch": 4.5144175317185695, + "grad_norm": 0.0, + "learning_rate": 1.0415434993638269e-05, + "loss": 0.239, + "step": 1957 + }, + { + "epoch": 4.516724336793541, + "grad_norm": 0.0, + "learning_rate": 1.040713093443596e-05, + "loss": 0.2019, + "step": 1958 + }, + { + "epoch": 4.519031141868512, + "grad_norm": 0.0, + "learning_rate": 1.039882659401105e-05, + "loss": 0.1672, + "step": 1959 + }, + { + "epoch": 4.521337946943484, + "grad_norm": 0.0, + "learning_rate": 1.0390521978099697e-05, + "loss": 0.144, + "step": 1960 + }, + { + "epoch": 4.523644752018455, + "grad_norm": 0.0, + "learning_rate": 1.0382217092438256e-05, + "loss": 0.139, + "step": 1961 + }, + { + "epoch": 4.525951557093426, + "grad_norm": 0.0, + "learning_rate": 1.037391194276326e-05, + "loss": 0.2045, + "step": 1962 + }, + { + "epoch": 4.528258362168397, + "grad_norm": 0.0, + "learning_rate": 1.0365606534811423e-05, + "loss": 0.1251, + "step": 1963 + }, + { + "epoch": 4.530565167243368, + "grad_norm": 0.0, + "learning_rate": 1.0357300874319651e-05, + "loss": 0.2713, + "step": 1964 + }, + { + "epoch": 4.532871972318339, + "grad_norm": 0.0, + "learning_rate": 1.0348994967025012e-05, + "loss": 0.2667, + "step": 1965 + }, + { + "epoch": 4.53517877739331, + "grad_norm": 0.0, + "learning_rate": 1.0340688818664746e-05, + "loss": 0.1185, + "step": 1966 + }, + { + "epoch": 4.537485582468282, + "grad_norm": 0.0, + "learning_rate": 1.0332382434976267e-05, + "loss": 0.2311, + "step": 1967 + }, + { + "epoch": 4.539792387543253, + "grad_norm": 0.0, + "learning_rate": 1.0324075821697146e-05, + "loss": 0.2394, + "step": 1968 + }, + { + "epoch": 4.542099192618224, + "grad_norm": 0.0, + "learning_rate": 1.031576898456511e-05, + "loss": 0.1736, + "step": 1969 + }, + { + "epoch": 4.544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.0307461929318045e-05, + "loss": 0.1617, + "step": 1970 + }, + { + "epoch": 4.546712802768166, + "grad_norm": 0.0, + "learning_rate": 1.0299154661693987e-05, + "loss": 0.1496, + "step": 1971 + }, + { + "epoch": 4.549019607843137, + "grad_norm": 0.0, + "learning_rate": 1.0290847187431115e-05, + "loss": 0.1768, + "step": 1972 + }, + { + "epoch": 4.551326412918108, + "grad_norm": 0.0, + "learning_rate": 1.0282539512267758e-05, + "loss": 0.18, + "step": 1973 + }, + { + "epoch": 4.553633217993079, + "grad_norm": 0.0, + "learning_rate": 1.0274231641942378e-05, + "loss": 0.1832, + "step": 1974 + }, + { + "epoch": 4.555940023068051, + "grad_norm": 0.0, + "learning_rate": 1.0265923582193574e-05, + "loss": 0.1633, + "step": 1975 + }, + { + "epoch": 4.558246828143022, + "grad_norm": 0.0, + "learning_rate": 1.0257615338760073e-05, + "loss": 0.247, + "step": 1976 + }, + { + "epoch": 4.560553633217993, + "grad_norm": 0.0, + "learning_rate": 1.0249306917380731e-05, + "loss": 0.0854, + "step": 1977 + }, + { + "epoch": 4.562860438292964, + "grad_norm": 0.0, + "learning_rate": 1.024099832379453e-05, + "loss": 0.2056, + "step": 1978 + }, + { + "epoch": 4.565167243367935, + "grad_norm": 0.0, + "learning_rate": 1.0232689563740563e-05, + "loss": 0.2498, + "step": 1979 + }, + { + "epoch": 4.567474048442906, + "grad_norm": 0.0, + "learning_rate": 1.0224380642958052e-05, + "loss": 0.1238, + "step": 1980 + }, + { + "epoch": 4.569780853517877, + "grad_norm": 0.0, + "learning_rate": 1.0216071567186312e-05, + "loss": 0.2554, + "step": 1981 + }, + { + "epoch": 4.572087658592849, + "grad_norm": 0.0, + "learning_rate": 1.0207762342164778e-05, + "loss": 0.2142, + "step": 1982 + }, + { + "epoch": 4.57439446366782, + "grad_norm": 0.0, + "learning_rate": 1.0199452973632982e-05, + "loss": 0.1608, + "step": 1983 + }, + { + "epoch": 4.5767012687427915, + "grad_norm": 0.0, + "learning_rate": 1.0191143467330558e-05, + "loss": 0.1009, + "step": 1984 + }, + { + "epoch": 4.5790080738177625, + "grad_norm": 0.0, + "learning_rate": 1.0182833828997238e-05, + "loss": 0.0881, + "step": 1985 + }, + { + "epoch": 4.581314878892734, + "grad_norm": 0.0, + "learning_rate": 1.0174524064372837e-05, + "loss": 0.3117, + "step": 1986 + }, + { + "epoch": 4.583621683967705, + "grad_norm": 0.0, + "learning_rate": 1.0166214179197265e-05, + "loss": 0.1858, + "step": 1987 + }, + { + "epoch": 4.585928489042676, + "grad_norm": 0.0, + "learning_rate": 1.0157904179210507e-05, + "loss": 0.1414, + "step": 1988 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 0.0, + "learning_rate": 1.0149594070152638e-05, + "loss": 0.2033, + "step": 1989 + }, + { + "epoch": 4.590542099192618, + "grad_norm": 0.0, + "learning_rate": 1.01412838577638e-05, + "loss": 0.186, + "step": 1990 + }, + { + "epoch": 4.59284890426759, + "grad_norm": 0.0, + "learning_rate": 1.013297354778421e-05, + "loss": 0.212, + "step": 1991 + }, + { + "epoch": 4.595155709342561, + "grad_norm": 0.0, + "learning_rate": 1.0124663145954152e-05, + "loss": 0.153, + "step": 1992 + }, + { + "epoch": 4.597462514417532, + "grad_norm": 0.0, + "learning_rate": 1.0116352658013973e-05, + "loss": 0.1393, + "step": 1993 + }, + { + "epoch": 4.599769319492503, + "grad_norm": 0.0, + "learning_rate": 1.0108042089704078e-05, + "loss": 0.2088, + "step": 1994 + }, + { + "epoch": 4.602076124567474, + "grad_norm": 0.0, + "learning_rate": 1.0099731446764927e-05, + "loss": 0.2114, + "step": 1995 + }, + { + "epoch": 4.604382929642445, + "grad_norm": 0.0, + "learning_rate": 1.0091420734937038e-05, + "loss": 0.1025, + "step": 1996 + }, + { + "epoch": 4.606689734717416, + "grad_norm": 0.0, + "learning_rate": 1.0083109959960974e-05, + "loss": 0.2248, + "step": 1997 + }, + { + "epoch": 4.608996539792388, + "grad_norm": 0.0, + "learning_rate": 1.007479912757733e-05, + "loss": 0.2169, + "step": 1998 + }, + { + "epoch": 4.611303344867359, + "grad_norm": 0.0, + "learning_rate": 1.0066488243526761e-05, + "loss": 0.1651, + "step": 1999 + }, + { + "epoch": 4.61361014994233, + "grad_norm": 0.0, + "learning_rate": 1.005817731354994e-05, + "loss": 0.104, + "step": 2000 + }, + { + "epoch": 4.615916955017301, + "grad_norm": 0.0, + "learning_rate": 1.0049866343387582e-05, + "loss": 0.2001, + "step": 2001 + }, + { + "epoch": 4.618223760092272, + "grad_norm": 0.0, + "learning_rate": 1.0041555338780427e-05, + "loss": 0.1955, + "step": 2002 + }, + { + "epoch": 4.620530565167243, + "grad_norm": 0.0, + "learning_rate": 1.0033244305469233e-05, + "loss": 0.1821, + "step": 2003 + }, + { + "epoch": 4.622837370242214, + "grad_norm": 0.0, + "learning_rate": 1.0024933249194792e-05, + "loss": 0.2507, + "step": 2004 + }, + { + "epoch": 4.625144175317185, + "grad_norm": 0.0, + "learning_rate": 1.0016622175697898e-05, + "loss": 0.2247, + "step": 2005 + }, + { + "epoch": 4.627450980392156, + "grad_norm": 0.0, + "learning_rate": 1.000831109071936e-05, + "loss": 0.2912, + "step": 2006 + }, + { + "epoch": 4.629757785467128, + "grad_norm": 0.0, + "learning_rate": 1e-05, + "loss": 0.3337, + "step": 2007 + }, + { + "epoch": 4.632064590542099, + "grad_norm": 0.0, + "learning_rate": 9.99168890928064e-06, + "loss": 0.1868, + "step": 2008 + }, + { + "epoch": 4.6343713956170705, + "grad_norm": 0.0, + "learning_rate": 9.983377824302107e-06, + "loss": 0.154, + "step": 2009 + }, + { + "epoch": 4.6366782006920415, + "grad_norm": 0.0, + "learning_rate": 9.97506675080521e-06, + "loss": 0.2047, + "step": 2010 + }, + { + "epoch": 4.638985005767013, + "grad_norm": 0.0, + "learning_rate": 9.966755694530768e-06, + "loss": 0.2307, + "step": 2011 + }, + { + "epoch": 4.641291810841984, + "grad_norm": 0.0, + "learning_rate": 9.958444661219578e-06, + "loss": 0.1706, + "step": 2012 + }, + { + "epoch": 4.643598615916955, + "grad_norm": 0.0, + "learning_rate": 9.950133656612421e-06, + "loss": 0.1953, + "step": 2013 + }, + { + "epoch": 4.645905420991927, + "grad_norm": 0.0, + "learning_rate": 9.941822686450061e-06, + "loss": 0.1385, + "step": 2014 + }, + { + "epoch": 4.648212226066898, + "grad_norm": 0.0, + "learning_rate": 9.933511756473244e-06, + "loss": 0.1783, + "step": 2015 + }, + { + "epoch": 4.650519031141869, + "grad_norm": 0.0, + "learning_rate": 9.925200872422671e-06, + "loss": 0.2209, + "step": 2016 + }, + { + "epoch": 4.65282583621684, + "grad_norm": 0.0, + "learning_rate": 9.916890040039031e-06, + "loss": 0.1452, + "step": 2017 + }, + { + "epoch": 4.655132641291811, + "grad_norm": 0.0, + "learning_rate": 9.908579265062967e-06, + "loss": 0.2519, + "step": 2018 + }, + { + "epoch": 4.657439446366782, + "grad_norm": 0.0, + "learning_rate": 9.900268553235077e-06, + "loss": 0.2668, + "step": 2019 + }, + { + "epoch": 4.659746251441753, + "grad_norm": 0.0, + "learning_rate": 9.891957910295926e-06, + "loss": 0.2084, + "step": 2020 + }, + { + "epoch": 4.662053056516724, + "grad_norm": 0.0, + "learning_rate": 9.883647341986032e-06, + "loss": 0.1009, + "step": 2021 + }, + { + "epoch": 4.664359861591695, + "grad_norm": 0.0, + "learning_rate": 9.87533685404585e-06, + "loss": 0.1283, + "step": 2022 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.0, + "learning_rate": 9.867026452215791e-06, + "loss": 0.2079, + "step": 2023 + }, + { + "epoch": 4.668973471741638, + "grad_norm": 0.0, + "learning_rate": 9.858716142236205e-06, + "loss": 0.2246, + "step": 2024 + }, + { + "epoch": 4.671280276816609, + "grad_norm": 0.0, + "learning_rate": 9.850405929847367e-06, + "loss": 0.2779, + "step": 2025 + }, + { + "epoch": 4.67358708189158, + "grad_norm": 0.0, + "learning_rate": 9.842095820789495e-06, + "loss": 0.1413, + "step": 2026 + }, + { + "epoch": 4.675893886966551, + "grad_norm": 0.0, + "learning_rate": 9.833785820802739e-06, + "loss": 0.1801, + "step": 2027 + }, + { + "epoch": 4.678200692041522, + "grad_norm": 0.0, + "learning_rate": 9.825475935627165e-06, + "loss": 0.2998, + "step": 2028 + }, + { + "epoch": 4.680507497116493, + "grad_norm": 0.0, + "learning_rate": 9.817166171002766e-06, + "loss": 0.1455, + "step": 2029 + }, + { + "epoch": 4.682814302191465, + "grad_norm": 0.0, + "learning_rate": 9.808856532669442e-06, + "loss": 0.1643, + "step": 2030 + }, + { + "epoch": 4.685121107266436, + "grad_norm": 0.0, + "learning_rate": 9.800547026367022e-06, + "loss": 0.2727, + "step": 2031 + }, + { + "epoch": 4.687427912341407, + "grad_norm": 0.0, + "learning_rate": 9.792237657835225e-06, + "loss": 0.1746, + "step": 2032 + }, + { + "epoch": 4.689734717416378, + "grad_norm": 0.0, + "learning_rate": 9.783928432813688e-06, + "loss": 0.1468, + "step": 2033 + }, + { + "epoch": 4.692041522491349, + "grad_norm": 0.0, + "learning_rate": 9.775619357041952e-06, + "loss": 0.1795, + "step": 2034 + }, + { + "epoch": 4.6943483275663205, + "grad_norm": 0.0, + "learning_rate": 9.767310436259438e-06, + "loss": 0.1787, + "step": 2035 + }, + { + "epoch": 4.6966551326412915, + "grad_norm": 0.0, + "learning_rate": 9.759001676205472e-06, + "loss": 0.1027, + "step": 2036 + }, + { + "epoch": 4.698961937716263, + "grad_norm": 0.0, + "learning_rate": 9.750693082619274e-06, + "loss": 0.1319, + "step": 2037 + }, + { + "epoch": 4.7012687427912345, + "grad_norm": 0.0, + "learning_rate": 9.74238466123993e-06, + "loss": 0.2192, + "step": 2038 + }, + { + "epoch": 4.703575547866206, + "grad_norm": 0.0, + "learning_rate": 9.734076417806428e-06, + "loss": 0.1089, + "step": 2039 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.0, + "learning_rate": 9.725768358057625e-06, + "loss": 0.2057, + "step": 2040 + }, + { + "epoch": 4.708189158016148, + "grad_norm": 0.0, + "learning_rate": 9.717460487732246e-06, + "loss": 0.1373, + "step": 2041 + }, + { + "epoch": 4.710495963091119, + "grad_norm": 0.0, + "learning_rate": 9.709152812568886e-06, + "loss": 0.2446, + "step": 2042 + }, + { + "epoch": 4.71280276816609, + "grad_norm": 0.0, + "learning_rate": 9.700845338306018e-06, + "loss": 0.1814, + "step": 2043 + }, + { + "epoch": 4.715109573241061, + "grad_norm": 0.0, + "learning_rate": 9.692538070681957e-06, + "loss": 0.2648, + "step": 2044 + }, + { + "epoch": 4.717416378316033, + "grad_norm": 0.0, + "learning_rate": 9.684231015434891e-06, + "loss": 0.1349, + "step": 2045 + }, + { + "epoch": 4.719723183391004, + "grad_norm": 0.0, + "learning_rate": 9.675924178302857e-06, + "loss": 0.2206, + "step": 2046 + }, + { + "epoch": 4.722029988465975, + "grad_norm": 0.0, + "learning_rate": 9.667617565023734e-06, + "loss": 0.2443, + "step": 2047 + }, + { + "epoch": 4.724336793540946, + "grad_norm": 0.0, + "learning_rate": 9.659311181335255e-06, + "loss": 0.2422, + "step": 2048 + }, + { + "epoch": 4.726643598615917, + "grad_norm": 0.0, + "learning_rate": 9.651005032974994e-06, + "loss": 0.1193, + "step": 2049 + }, + { + "epoch": 4.728950403690888, + "grad_norm": 0.0, + "learning_rate": 9.642699125680352e-06, + "loss": 0.1496, + "step": 2050 + }, + { + "epoch": 4.731257208765859, + "grad_norm": 0.0, + "learning_rate": 9.634393465188577e-06, + "loss": 0.1183, + "step": 2051 + }, + { + "epoch": 4.73356401384083, + "grad_norm": 0.0, + "learning_rate": 9.626088057236745e-06, + "loss": 0.2147, + "step": 2052 + }, + { + "epoch": 4.735870818915801, + "grad_norm": 0.0, + "learning_rate": 9.617782907561748e-06, + "loss": 0.1133, + "step": 2053 + }, + { + "epoch": 4.738177623990773, + "grad_norm": 0.0, + "learning_rate": 9.609478021900303e-06, + "loss": 0.0983, + "step": 2054 + }, + { + "epoch": 4.740484429065744, + "grad_norm": 0.0, + "learning_rate": 9.601173405988955e-06, + "loss": 0.1283, + "step": 2055 + }, + { + "epoch": 4.742791234140715, + "grad_norm": 0.0, + "learning_rate": 9.592869065564043e-06, + "loss": 0.2352, + "step": 2056 + }, + { + "epoch": 4.745098039215686, + "grad_norm": 0.0, + "learning_rate": 9.584565006361735e-06, + "loss": 0.1008, + "step": 2057 + }, + { + "epoch": 4.747404844290657, + "grad_norm": 0.0, + "learning_rate": 9.576261234117998e-06, + "loss": 0.1621, + "step": 2058 + }, + { + "epoch": 4.749711649365628, + "grad_norm": 0.0, + "learning_rate": 9.567957754568596e-06, + "loss": 0.1246, + "step": 2059 + }, + { + "epoch": 4.7520184544405994, + "grad_norm": 0.0, + "learning_rate": 9.559654573449093e-06, + "loss": 0.2057, + "step": 2060 + }, + { + "epoch": 4.754325259515571, + "grad_norm": 0.0, + "learning_rate": 9.551351696494854e-06, + "loss": 0.2435, + "step": 2061 + }, + { + "epoch": 4.756632064590542, + "grad_norm": 0.0, + "learning_rate": 9.543049129441021e-06, + "loss": 0.2536, + "step": 2062 + }, + { + "epoch": 4.7589388696655135, + "grad_norm": 0.0, + "learning_rate": 9.534746878022533e-06, + "loss": 0.1903, + "step": 2063 + }, + { + "epoch": 4.7612456747404845, + "grad_norm": 0.0, + "learning_rate": 9.526444947974112e-06, + "loss": 0.2132, + "step": 2064 + }, + { + "epoch": 4.763552479815456, + "grad_norm": 0.0, + "learning_rate": 9.518143345030247e-06, + "loss": 0.175, + "step": 2065 + }, + { + "epoch": 4.765859284890427, + "grad_norm": 0.0, + "learning_rate": 9.509842074925204e-06, + "loss": 0.1628, + "step": 2066 + }, + { + "epoch": 4.768166089965398, + "grad_norm": 0.0, + "learning_rate": 9.501541143393028e-06, + "loss": 0.1632, + "step": 2067 + }, + { + "epoch": 4.770472895040369, + "grad_norm": 0.0, + "learning_rate": 9.493240556167527e-06, + "loss": 0.1342, + "step": 2068 + }, + { + "epoch": 4.77277970011534, + "grad_norm": 0.0, + "learning_rate": 9.484940318982261e-06, + "loss": 0.1732, + "step": 2069 + }, + { + "epoch": 4.775086505190312, + "grad_norm": 0.0, + "learning_rate": 9.476640437570562e-06, + "loss": 0.248, + "step": 2070 + }, + { + "epoch": 4.777393310265283, + "grad_norm": 0.0, + "learning_rate": 9.468340917665508e-06, + "loss": 0.1878, + "step": 2071 + }, + { + "epoch": 4.779700115340254, + "grad_norm": 0.0, + "learning_rate": 9.460041764999929e-06, + "loss": 0.144, + "step": 2072 + }, + { + "epoch": 4.782006920415225, + "grad_norm": 0.0, + "learning_rate": 9.4517429853064e-06, + "loss": 0.1364, + "step": 2073 + }, + { + "epoch": 4.784313725490196, + "grad_norm": 0.0, + "learning_rate": 9.443444584317244e-06, + "loss": 0.1836, + "step": 2074 + }, + { + "epoch": 4.786620530565167, + "grad_norm": 0.0, + "learning_rate": 9.435146567764516e-06, + "loss": 0.2614, + "step": 2075 + }, + { + "epoch": 4.788927335640138, + "grad_norm": 0.0, + "learning_rate": 9.426848941380007e-06, + "loss": 0.1042, + "step": 2076 + }, + { + "epoch": 4.79123414071511, + "grad_norm": 0.0, + "learning_rate": 9.418551710895243e-06, + "loss": 0.229, + "step": 2077 + }, + { + "epoch": 4.793540945790081, + "grad_norm": 0.0, + "learning_rate": 9.410254882041469e-06, + "loss": 0.1581, + "step": 2078 + }, + { + "epoch": 4.795847750865052, + "grad_norm": 0.0, + "learning_rate": 9.401958460549658e-06, + "loss": 0.2721, + "step": 2079 + }, + { + "epoch": 4.798154555940023, + "grad_norm": 0.0, + "learning_rate": 9.393662452150504e-06, + "loss": 0.1733, + "step": 2080 + }, + { + "epoch": 4.800461361014994, + "grad_norm": 0.0, + "learning_rate": 9.385366862574405e-06, + "loss": 0.098, + "step": 2081 + }, + { + "epoch": 4.802768166089965, + "grad_norm": 0.0, + "learning_rate": 9.377071697551479e-06, + "loss": 0.2411, + "step": 2082 + }, + { + "epoch": 4.805074971164936, + "grad_norm": 0.0, + "learning_rate": 9.368776962811552e-06, + "loss": 0.3038, + "step": 2083 + }, + { + "epoch": 4.807381776239907, + "grad_norm": 0.0, + "learning_rate": 9.360482664084144e-06, + "loss": 0.2478, + "step": 2084 + }, + { + "epoch": 4.809688581314878, + "grad_norm": 0.0, + "learning_rate": 9.352188807098482e-06, + "loss": 0.2386, + "step": 2085 + }, + { + "epoch": 4.81199538638985, + "grad_norm": 0.0, + "learning_rate": 9.343895397583486e-06, + "loss": 0.1666, + "step": 2086 + }, + { + "epoch": 4.814302191464821, + "grad_norm": 0.0, + "learning_rate": 9.33560244126776e-06, + "loss": 0.1843, + "step": 2087 + }, + { + "epoch": 4.8166089965397925, + "grad_norm": 0.0, + "learning_rate": 9.327309943879604e-06, + "loss": 0.1863, + "step": 2088 + }, + { + "epoch": 4.8189158016147635, + "grad_norm": 0.0, + "learning_rate": 9.319017911147e-06, + "loss": 0.1974, + "step": 2089 + }, + { + "epoch": 4.821222606689735, + "grad_norm": 0.0, + "learning_rate": 9.310726348797603e-06, + "loss": 0.2245, + "step": 2090 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 0.0, + "learning_rate": 9.302435262558748e-06, + "loss": 0.1434, + "step": 2091 + }, + { + "epoch": 4.825836216839677, + "grad_norm": 0.0, + "learning_rate": 9.294144658157443e-06, + "loss": 0.3041, + "step": 2092 + }, + { + "epoch": 4.828143021914649, + "grad_norm": 0.0, + "learning_rate": 9.285854541320352e-06, + "loss": 0.2306, + "step": 2093 + }, + { + "epoch": 4.83044982698962, + "grad_norm": 0.0, + "learning_rate": 9.277564917773816e-06, + "loss": 0.1256, + "step": 2094 + }, + { + "epoch": 4.832756632064591, + "grad_norm": 0.0, + "learning_rate": 9.269275793243832e-06, + "loss": 0.2251, + "step": 2095 + }, + { + "epoch": 4.835063437139562, + "grad_norm": 0.0, + "learning_rate": 9.260987173456047e-06, + "loss": 0.1819, + "step": 2096 + }, + { + "epoch": 4.837370242214533, + "grad_norm": 0.0, + "learning_rate": 9.252699064135759e-06, + "loss": 0.1368, + "step": 2097 + }, + { + "epoch": 4.839677047289504, + "grad_norm": 0.0, + "learning_rate": 9.244411471007923e-06, + "loss": 0.2171, + "step": 2098 + }, + { + "epoch": 4.841983852364475, + "grad_norm": 0.0, + "learning_rate": 9.236124399797122e-06, + "loss": 0.2526, + "step": 2099 + }, + { + "epoch": 4.844290657439446, + "grad_norm": 0.0, + "learning_rate": 9.227837856227594e-06, + "loss": 0.2098, + "step": 2100 + }, + { + "epoch": 4.846597462514418, + "grad_norm": 0.0, + "learning_rate": 9.219551846023211e-06, + "loss": 0.297, + "step": 2101 + }, + { + "epoch": 4.848904267589389, + "grad_norm": 0.0, + "learning_rate": 9.211266374907463e-06, + "loss": 0.1249, + "step": 2102 + }, + { + "epoch": 4.85121107266436, + "grad_norm": 0.0, + "learning_rate": 9.202981448603477e-06, + "loss": 0.1494, + "step": 2103 + }, + { + "epoch": 4.853517877739331, + "grad_norm": 0.0, + "learning_rate": 9.194697072834009e-06, + "loss": 0.1136, + "step": 2104 + }, + { + "epoch": 4.855824682814302, + "grad_norm": 0.0, + "learning_rate": 9.18641325332142e-06, + "loss": 0.1963, + "step": 2105 + }, + { + "epoch": 4.858131487889273, + "grad_norm": 0.0, + "learning_rate": 9.178129995787698e-06, + "loss": 0.2581, + "step": 2106 + }, + { + "epoch": 4.860438292964244, + "grad_norm": 0.0, + "learning_rate": 9.169847305954448e-06, + "loss": 0.1883, + "step": 2107 + }, + { + "epoch": 4.862745098039216, + "grad_norm": 0.0, + "learning_rate": 9.16156518954287e-06, + "loss": 0.1476, + "step": 2108 + }, + { + "epoch": 4.865051903114187, + "grad_norm": 0.0, + "learning_rate": 9.153283652273768e-06, + "loss": 0.2835, + "step": 2109 + }, + { + "epoch": 4.867358708189158, + "grad_norm": 0.0, + "learning_rate": 9.145002699867556e-06, + "loss": 0.1969, + "step": 2110 + }, + { + "epoch": 4.869665513264129, + "grad_norm": 0.0, + "learning_rate": 9.136722338044244e-06, + "loss": 0.1918, + "step": 2111 + }, + { + "epoch": 4.8719723183391, + "grad_norm": 0.0, + "learning_rate": 9.128442572523418e-06, + "loss": 0.2292, + "step": 2112 + }, + { + "epoch": 4.874279123414071, + "grad_norm": 0.0, + "learning_rate": 9.120163409024272e-06, + "loss": 0.1756, + "step": 2113 + }, + { + "epoch": 4.8765859284890425, + "grad_norm": 0.0, + "learning_rate": 9.111884853265573e-06, + "loss": 0.2545, + "step": 2114 + }, + { + "epoch": 4.8788927335640135, + "grad_norm": 0.0, + "learning_rate": 9.103606910965666e-06, + "loss": 0.1474, + "step": 2115 + }, + { + "epoch": 4.881199538638985, + "grad_norm": 0.0, + "learning_rate": 9.09532958784248e-06, + "loss": 0.263, + "step": 2116 + }, + { + "epoch": 4.8835063437139565, + "grad_norm": 0.0, + "learning_rate": 9.087052889613519e-06, + "loss": 0.1743, + "step": 2117 + }, + { + "epoch": 4.885813148788928, + "grad_norm": 0.0, + "learning_rate": 9.078776821995839e-06, + "loss": 0.135, + "step": 2118 + }, + { + "epoch": 4.888119953863899, + "grad_norm": 0.0, + "learning_rate": 9.07050139070608e-06, + "loss": 0.1813, + "step": 2119 + }, + { + "epoch": 4.89042675893887, + "grad_norm": 0.0, + "learning_rate": 9.062226601460429e-06, + "loss": 0.1624, + "step": 2120 + }, + { + "epoch": 4.892733564013841, + "grad_norm": 0.0, + "learning_rate": 9.05395245997463e-06, + "loss": 0.1605, + "step": 2121 + }, + { + "epoch": 4.895040369088812, + "grad_norm": 0.0, + "learning_rate": 9.045678971963988e-06, + "loss": 0.2081, + "step": 2122 + }, + { + "epoch": 4.897347174163783, + "grad_norm": 0.0, + "learning_rate": 9.037406143143356e-06, + "loss": 0.1657, + "step": 2123 + }, + { + "epoch": 4.899653979238755, + "grad_norm": 0.0, + "learning_rate": 9.02913397922712e-06, + "loss": 0.2662, + "step": 2124 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 0.0, + "learning_rate": 9.020862485929219e-06, + "loss": 0.2152, + "step": 2125 + }, + { + "epoch": 4.904267589388697, + "grad_norm": 0.0, + "learning_rate": 9.012591668963123e-06, + "loss": 0.1353, + "step": 2126 + }, + { + "epoch": 4.906574394463668, + "grad_norm": 0.0, + "learning_rate": 9.004321534041836e-06, + "loss": 0.1237, + "step": 2127 + }, + { + "epoch": 4.908881199538639, + "grad_norm": 0.0, + "learning_rate": 8.996052086877888e-06, + "loss": 0.1727, + "step": 2128 + }, + { + "epoch": 4.91118800461361, + "grad_norm": 0.0, + "learning_rate": 8.987783333183345e-06, + "loss": 0.2387, + "step": 2129 + }, + { + "epoch": 4.913494809688581, + "grad_norm": 0.0, + "learning_rate": 8.979515278669776e-06, + "loss": 0.1998, + "step": 2130 + }, + { + "epoch": 4.915801614763552, + "grad_norm": 0.0, + "learning_rate": 8.971247929048283e-06, + "loss": 0.2162, + "step": 2131 + }, + { + "epoch": 4.918108419838523, + "grad_norm": 0.0, + "learning_rate": 8.962981290029475e-06, + "loss": 0.2075, + "step": 2132 + }, + { + "epoch": 4.920415224913495, + "grad_norm": 0.0, + "learning_rate": 8.954715367323468e-06, + "loss": 0.1986, + "step": 2133 + }, + { + "epoch": 4.922722029988466, + "grad_norm": 0.0, + "learning_rate": 8.946450166639883e-06, + "loss": 0.1778, + "step": 2134 + }, + { + "epoch": 4.925028835063437, + "grad_norm": 0.0, + "learning_rate": 8.938185693687853e-06, + "loss": 0.1975, + "step": 2135 + }, + { + "epoch": 4.927335640138408, + "grad_norm": 0.0, + "learning_rate": 8.92992195417599e-06, + "loss": 0.2094, + "step": 2136 + }, + { + "epoch": 4.929642445213379, + "grad_norm": 0.0, + "learning_rate": 8.921658953812416e-06, + "loss": 0.1985, + "step": 2137 + }, + { + "epoch": 4.93194925028835, + "grad_norm": 0.0, + "learning_rate": 8.913396698304733e-06, + "loss": 0.1639, + "step": 2138 + }, + { + "epoch": 4.9342560553633215, + "grad_norm": 0.0, + "learning_rate": 8.905135193360032e-06, + "loss": 0.1222, + "step": 2139 + }, + { + "epoch": 4.936562860438293, + "grad_norm": 0.0, + "learning_rate": 8.896874444684882e-06, + "loss": 0.1553, + "step": 2140 + }, + { + "epoch": 4.9388696655132645, + "grad_norm": 0.0, + "learning_rate": 8.888614457985343e-06, + "loss": 0.2064, + "step": 2141 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.0, + "learning_rate": 8.880355238966923e-06, + "loss": 0.2544, + "step": 2142 + }, + { + "epoch": 4.9434832756632066, + "grad_norm": 0.0, + "learning_rate": 8.872096793334624e-06, + "loss": 0.1528, + "step": 2143 + }, + { + "epoch": 4.945790080738178, + "grad_norm": 0.0, + "learning_rate": 8.863839126792905e-06, + "loss": 0.2023, + "step": 2144 + }, + { + "epoch": 4.948096885813149, + "grad_norm": 0.0, + "learning_rate": 8.855582245045682e-06, + "loss": 0.12, + "step": 2145 + }, + { + "epoch": 4.95040369088812, + "grad_norm": 0.0, + "learning_rate": 8.847326153796335e-06, + "loss": 0.1589, + "step": 2146 + }, + { + "epoch": 4.952710495963091, + "grad_norm": 0.0, + "learning_rate": 8.839070858747697e-06, + "loss": 0.2069, + "step": 2147 + }, + { + "epoch": 4.955017301038062, + "grad_norm": 0.0, + "learning_rate": 8.830816365602053e-06, + "loss": 0.1201, + "step": 2148 + }, + { + "epoch": 4.957324106113034, + "grad_norm": 0.0, + "learning_rate": 8.822562680061127e-06, + "loss": 0.2688, + "step": 2149 + }, + { + "epoch": 4.959630911188005, + "grad_norm": 0.0, + "learning_rate": 8.814309807826092e-06, + "loss": 0.188, + "step": 2150 + }, + { + "epoch": 4.961937716262976, + "grad_norm": 0.0, + "learning_rate": 8.806057754597559e-06, + "loss": 0.178, + "step": 2151 + }, + { + "epoch": 4.964244521337947, + "grad_norm": 0.0, + "learning_rate": 8.797806526075566e-06, + "loss": 0.2219, + "step": 2152 + }, + { + "epoch": 4.966551326412918, + "grad_norm": 0.0, + "learning_rate": 8.789556127959586e-06, + "loss": 0.1495, + "step": 2153 + }, + { + "epoch": 4.968858131487889, + "grad_norm": 0.0, + "learning_rate": 8.781306565948528e-06, + "loss": 0.1692, + "step": 2154 + }, + { + "epoch": 4.97116493656286, + "grad_norm": 0.0, + "learning_rate": 8.773057845740702e-06, + "loss": 0.1442, + "step": 2155 + }, + { + "epoch": 4.973471741637832, + "grad_norm": 0.0, + "learning_rate": 8.76480997303386e-06, + "loss": 0.1999, + "step": 2156 + }, + { + "epoch": 4.975778546712803, + "grad_norm": 0.0, + "learning_rate": 8.756562953525151e-06, + "loss": 0.1109, + "step": 2157 + }, + { + "epoch": 4.978085351787774, + "grad_norm": 0.0, + "learning_rate": 8.74831679291114e-06, + "loss": 0.1679, + "step": 2158 + }, + { + "epoch": 4.980392156862745, + "grad_norm": 0.0, + "learning_rate": 8.740071496887803e-06, + "loss": 0.2638, + "step": 2159 + }, + { + "epoch": 4.982698961937716, + "grad_norm": 0.0, + "learning_rate": 8.731827071150519e-06, + "loss": 0.1411, + "step": 2160 + }, + { + "epoch": 4.985005767012687, + "grad_norm": 0.0, + "learning_rate": 8.723583521394054e-06, + "loss": 0.1695, + "step": 2161 + }, + { + "epoch": 4.987312572087658, + "grad_norm": 0.0, + "learning_rate": 8.715340853312586e-06, + "loss": 0.1538, + "step": 2162 + }, + { + "epoch": 4.989619377162629, + "grad_norm": 0.0, + "learning_rate": 8.70709907259967e-06, + "loss": 0.1721, + "step": 2163 + }, + { + "epoch": 4.9919261822376, + "grad_norm": 0.0, + "learning_rate": 8.698858184948254e-06, + "loss": 0.1933, + "step": 2164 + }, + { + "epoch": 4.994232987312572, + "grad_norm": 0.0, + "learning_rate": 8.690618196050667e-06, + "loss": 0.1941, + "step": 2165 + }, + { + "epoch": 4.996539792387543, + "grad_norm": 0.0, + "learning_rate": 8.682379111598626e-06, + "loss": 0.2005, + "step": 2166 + }, + { + "epoch": 4.9988465974625145, + "grad_norm": 0.0, + "learning_rate": 8.674140937283208e-06, + "loss": 0.2144, + "step": 2167 + }, + { + "epoch": 5.0011534025374855, + "grad_norm": 0.0, + "learning_rate": 8.665903678794873e-06, + "loss": 0.1495, + "step": 2168 + }, + { + "epoch": 5.003460207612457, + "grad_norm": 0.0, + "learning_rate": 8.657667341823449e-06, + "loss": 0.1516, + "step": 2169 + }, + { + "epoch": 5.005767012687428, + "grad_norm": 0.0, + "learning_rate": 8.649431932058111e-06, + "loss": 0.1195, + "step": 2170 + }, + { + "epoch": 5.008073817762399, + "grad_norm": 0.0, + "learning_rate": 8.641197455187418e-06, + "loss": 0.0936, + "step": 2171 + }, + { + "epoch": 5.010380622837371, + "grad_norm": 0.0, + "learning_rate": 8.632963916899268e-06, + "loss": 0.1459, + "step": 2172 + }, + { + "epoch": 5.012687427912342, + "grad_norm": 0.0, + "learning_rate": 8.624731322880913e-06, + "loss": 0.1414, + "step": 2173 + }, + { + "epoch": 5.014994232987313, + "grad_norm": 0.0, + "learning_rate": 8.616499678818958e-06, + "loss": 0.1322, + "step": 2174 + }, + { + "epoch": 5.017301038062284, + "grad_norm": 0.0, + "learning_rate": 8.60826899039935e-06, + "loss": 0.124, + "step": 2175 + }, + { + "epoch": 5.019607843137255, + "grad_norm": 0.0, + "learning_rate": 8.600039263307367e-06, + "loss": 0.1516, + "step": 2176 + }, + { + "epoch": 5.021914648212226, + "grad_norm": 0.0, + "learning_rate": 8.591810503227634e-06, + "loss": 0.1479, + "step": 2177 + }, + { + "epoch": 5.024221453287197, + "grad_norm": 0.0, + "learning_rate": 8.583582715844113e-06, + "loss": 0.0904, + "step": 2178 + }, + { + "epoch": 5.026528258362168, + "grad_norm": 0.0, + "learning_rate": 8.575355906840073e-06, + "loss": 0.1361, + "step": 2179 + }, + { + "epoch": 5.02883506343714, + "grad_norm": 0.0, + "learning_rate": 8.567130081898127e-06, + "loss": 0.0867, + "step": 2180 + }, + { + "epoch": 5.031141868512111, + "grad_norm": 0.0, + "learning_rate": 8.558905246700202e-06, + "loss": 0.0681, + "step": 2181 + }, + { + "epoch": 5.033448673587082, + "grad_norm": 0.0, + "learning_rate": 8.550681406927534e-06, + "loss": 0.1284, + "step": 2182 + }, + { + "epoch": 5.035755478662053, + "grad_norm": 0.0, + "learning_rate": 8.542458568260682e-06, + "loss": 0.1254, + "step": 2183 + }, + { + "epoch": 5.038062283737024, + "grad_norm": 0.0, + "learning_rate": 8.534236736379515e-06, + "loss": 0.1262, + "step": 2184 + }, + { + "epoch": 5.040369088811995, + "grad_norm": 0.0, + "learning_rate": 8.52601591696319e-06, + "loss": 0.1241, + "step": 2185 + }, + { + "epoch": 5.042675893886966, + "grad_norm": 0.0, + "learning_rate": 8.517796115690183e-06, + "loss": 0.08, + "step": 2186 + }, + { + "epoch": 5.044982698961937, + "grad_norm": 0.0, + "learning_rate": 8.509577338238255e-06, + "loss": 0.1439, + "step": 2187 + }, + { + "epoch": 5.047289504036909, + "grad_norm": 0.0, + "learning_rate": 8.501359590284472e-06, + "loss": 0.1756, + "step": 2188 + }, + { + "epoch": 5.04959630911188, + "grad_norm": 0.0, + "learning_rate": 8.49314287750517e-06, + "loss": 0.1842, + "step": 2189 + }, + { + "epoch": 5.051903114186851, + "grad_norm": 0.0, + "learning_rate": 8.484927205575985e-06, + "loss": 0.1437, + "step": 2190 + }, + { + "epoch": 5.054209919261822, + "grad_norm": 0.0, + "learning_rate": 8.476712580171838e-06, + "loss": 0.0606, + "step": 2191 + }, + { + "epoch": 5.0565167243367934, + "grad_norm": 0.0, + "learning_rate": 8.46849900696691e-06, + "loss": 0.121, + "step": 2192 + }, + { + "epoch": 5.0588235294117645, + "grad_norm": 0.0, + "learning_rate": 8.460286491634664e-06, + "loss": 0.1937, + "step": 2193 + }, + { + "epoch": 5.0611303344867355, + "grad_norm": 0.0, + "learning_rate": 8.45207503984784e-06, + "loss": 0.0853, + "step": 2194 + }, + { + "epoch": 5.063437139561707, + "grad_norm": 0.0, + "learning_rate": 8.443864657278428e-06, + "loss": 0.0855, + "step": 2195 + }, + { + "epoch": 5.0657439446366785, + "grad_norm": 0.0, + "learning_rate": 8.43565534959769e-06, + "loss": 0.1329, + "step": 2196 + }, + { + "epoch": 5.06805074971165, + "grad_norm": 0.0, + "learning_rate": 8.427447122476148e-06, + "loss": 0.1022, + "step": 2197 + }, + { + "epoch": 5.070357554786621, + "grad_norm": 0.0, + "learning_rate": 8.419239981583567e-06, + "loss": 0.179, + "step": 2198 + }, + { + "epoch": 5.072664359861592, + "grad_norm": 0.0, + "learning_rate": 8.411033932588969e-06, + "loss": 0.1191, + "step": 2199 + }, + { + "epoch": 5.074971164936563, + "grad_norm": 0.0, + "learning_rate": 8.40282898116062e-06, + "loss": 0.0543, + "step": 2200 + }, + { + "epoch": 5.077277970011534, + "grad_norm": 0.0, + "learning_rate": 8.394625132966025e-06, + "loss": 0.1193, + "step": 2201 + }, + { + "epoch": 5.079584775086505, + "grad_norm": 0.0, + "learning_rate": 8.386422393671934e-06, + "loss": 0.1245, + "step": 2202 + }, + { + "epoch": 5.081891580161477, + "grad_norm": 0.0, + "learning_rate": 8.378220768944328e-06, + "loss": 0.1485, + "step": 2203 + }, + { + "epoch": 5.084198385236448, + "grad_norm": 0.0, + "learning_rate": 8.370020264448413e-06, + "loss": 0.0965, + "step": 2204 + }, + { + "epoch": 5.086505190311419, + "grad_norm": 0.0, + "learning_rate": 8.361820885848623e-06, + "loss": 0.0803, + "step": 2205 + }, + { + "epoch": 5.08881199538639, + "grad_norm": 0.0, + "learning_rate": 8.353622638808628e-06, + "loss": 0.0971, + "step": 2206 + }, + { + "epoch": 5.091118800461361, + "grad_norm": 0.0, + "learning_rate": 8.34542552899129e-06, + "loss": 0.1029, + "step": 2207 + }, + { + "epoch": 5.093425605536332, + "grad_norm": 0.0, + "learning_rate": 8.337229562058707e-06, + "loss": 0.1029, + "step": 2208 + }, + { + "epoch": 5.095732410611303, + "grad_norm": 0.0, + "learning_rate": 8.329034743672187e-06, + "loss": 0.1178, + "step": 2209 + }, + { + "epoch": 5.098039215686274, + "grad_norm": 0.0, + "learning_rate": 8.32084107949223e-06, + "loss": 0.1704, + "step": 2210 + }, + { + "epoch": 5.100346020761246, + "grad_norm": 0.0, + "learning_rate": 8.312648575178552e-06, + "loss": 0.1339, + "step": 2211 + }, + { + "epoch": 5.102652825836217, + "grad_norm": 0.0, + "learning_rate": 8.304457236390062e-06, + "loss": 0.1364, + "step": 2212 + }, + { + "epoch": 5.104959630911188, + "grad_norm": 0.0, + "learning_rate": 8.296267068784862e-06, + "loss": 0.1252, + "step": 2213 + }, + { + "epoch": 5.107266435986159, + "grad_norm": 0.0, + "learning_rate": 8.28807807802025e-06, + "loss": 0.099, + "step": 2214 + }, + { + "epoch": 5.10957324106113, + "grad_norm": 0.0, + "learning_rate": 8.279890269752715e-06, + "loss": 0.1126, + "step": 2215 + }, + { + "epoch": 5.111880046136101, + "grad_norm": 0.0, + "learning_rate": 8.271703649637911e-06, + "loss": 0.1939, + "step": 2216 + }, + { + "epoch": 5.114186851211072, + "grad_norm": 0.0, + "learning_rate": 8.263518223330698e-06, + "loss": 0.0793, + "step": 2217 + }, + { + "epoch": 5.1164936562860435, + "grad_norm": 0.0, + "learning_rate": 8.25533399648509e-06, + "loss": 0.1413, + "step": 2218 + }, + { + "epoch": 5.118800461361015, + "grad_norm": 0.0, + "learning_rate": 8.247150974754275e-06, + "loss": 0.1041, + "step": 2219 + }, + { + "epoch": 5.1211072664359865, + "grad_norm": 0.0, + "learning_rate": 8.238969163790617e-06, + "loss": 0.102, + "step": 2220 + }, + { + "epoch": 5.1234140715109575, + "grad_norm": 0.0, + "learning_rate": 8.230788569245648e-06, + "loss": 0.1206, + "step": 2221 + }, + { + "epoch": 5.125720876585929, + "grad_norm": 0.0, + "learning_rate": 8.222609196770037e-06, + "loss": 0.0987, + "step": 2222 + }, + { + "epoch": 5.1280276816609, + "grad_norm": 0.0, + "learning_rate": 8.214431052013636e-06, + "loss": 0.0742, + "step": 2223 + }, + { + "epoch": 5.130334486735871, + "grad_norm": 0.0, + "learning_rate": 8.206254140625425e-06, + "loss": 0.1438, + "step": 2224 + }, + { + "epoch": 5.132641291810842, + "grad_norm": 0.0, + "learning_rate": 8.198078468253556e-06, + "loss": 0.1136, + "step": 2225 + }, + { + "epoch": 5.134948096885813, + "grad_norm": 0.0, + "learning_rate": 8.189904040545302e-06, + "loss": 0.1128, + "step": 2226 + }, + { + "epoch": 5.137254901960785, + "grad_norm": 0.0, + "learning_rate": 8.181730863147094e-06, + "loss": 0.1252, + "step": 2227 + }, + { + "epoch": 5.139561707035756, + "grad_norm": 0.0, + "learning_rate": 8.173558941704487e-06, + "loss": 0.1318, + "step": 2228 + }, + { + "epoch": 5.141868512110727, + "grad_norm": 0.0, + "learning_rate": 8.165388281862177e-06, + "loss": 0.1683, + "step": 2229 + }, + { + "epoch": 5.144175317185698, + "grad_norm": 0.0, + "learning_rate": 8.157218889263984e-06, + "loss": 0.0917, + "step": 2230 + }, + { + "epoch": 5.146482122260669, + "grad_norm": 0.0, + "learning_rate": 8.149050769552856e-06, + "loss": 0.1561, + "step": 2231 + }, + { + "epoch": 5.14878892733564, + "grad_norm": 0.0, + "learning_rate": 8.140883928370855e-06, + "loss": 0.1362, + "step": 2232 + }, + { + "epoch": 5.151095732410611, + "grad_norm": 0.0, + "learning_rate": 8.132718371359168e-06, + "loss": 0.1303, + "step": 2233 + }, + { + "epoch": 5.153402537485582, + "grad_norm": 0.0, + "learning_rate": 8.124554104158094e-06, + "loss": 0.096, + "step": 2234 + }, + { + "epoch": 5.155709342560554, + "grad_norm": 0.0, + "learning_rate": 8.116391132407033e-06, + "loss": 0.1079, + "step": 2235 + }, + { + "epoch": 5.158016147635525, + "grad_norm": 0.0, + "learning_rate": 8.108229461744496e-06, + "loss": 0.0833, + "step": 2236 + }, + { + "epoch": 5.160322952710496, + "grad_norm": 0.0, + "learning_rate": 8.100069097808103e-06, + "loss": 0.1309, + "step": 2237 + }, + { + "epoch": 5.162629757785467, + "grad_norm": 0.0, + "learning_rate": 8.091910046234552e-06, + "loss": 0.1182, + "step": 2238 + }, + { + "epoch": 5.164936562860438, + "grad_norm": 0.0, + "learning_rate": 8.083752312659653e-06, + "loss": 0.1312, + "step": 2239 + }, + { + "epoch": 5.167243367935409, + "grad_norm": 0.0, + "learning_rate": 8.075595902718302e-06, + "loss": 0.1302, + "step": 2240 + }, + { + "epoch": 5.16955017301038, + "grad_norm": 0.0, + "learning_rate": 8.06744082204447e-06, + "loss": 0.17, + "step": 2241 + }, + { + "epoch": 5.171856978085351, + "grad_norm": 0.0, + "learning_rate": 8.059287076271216e-06, + "loss": 0.1281, + "step": 2242 + }, + { + "epoch": 5.174163783160323, + "grad_norm": 0.0, + "learning_rate": 8.051134671030686e-06, + "loss": 0.0902, + "step": 2243 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 0.0, + "learning_rate": 8.042983611954087e-06, + "loss": 0.1183, + "step": 2244 + }, + { + "epoch": 5.178777393310265, + "grad_norm": 0.0, + "learning_rate": 8.034833904671698e-06, + "loss": 0.079, + "step": 2245 + }, + { + "epoch": 5.1810841983852365, + "grad_norm": 0.0, + "learning_rate": 8.026685554812877e-06, + "loss": 0.1056, + "step": 2246 + }, + { + "epoch": 5.1833910034602075, + "grad_norm": 0.0, + "learning_rate": 8.018538568006027e-06, + "loss": 0.1229, + "step": 2247 + }, + { + "epoch": 5.185697808535179, + "grad_norm": 0.0, + "learning_rate": 8.010392949878616e-06, + "loss": 0.0956, + "step": 2248 + }, + { + "epoch": 5.18800461361015, + "grad_norm": 0.0, + "learning_rate": 8.002248706057177e-06, + "loss": 0.101, + "step": 2249 + }, + { + "epoch": 5.190311418685121, + "grad_norm": 0.0, + "learning_rate": 7.994105842167274e-06, + "loss": 0.1429, + "step": 2250 + }, + { + "epoch": 5.192618223760093, + "grad_norm": 0.0, + "learning_rate": 7.985964363833532e-06, + "loss": 0.2251, + "step": 2251 + }, + { + "epoch": 5.194925028835064, + "grad_norm": 0.0, + "learning_rate": 7.977824276679623e-06, + "loss": 0.1238, + "step": 2252 + }, + { + "epoch": 5.197231833910035, + "grad_norm": 0.0, + "learning_rate": 7.96968558632824e-06, + "loss": 0.1255, + "step": 2253 + }, + { + "epoch": 5.199538638985006, + "grad_norm": 0.0, + "learning_rate": 7.961548298401125e-06, + "loss": 0.0941, + "step": 2254 + }, + { + "epoch": 5.201845444059977, + "grad_norm": 0.0, + "learning_rate": 7.953412418519052e-06, + "loss": 0.0794, + "step": 2255 + }, + { + "epoch": 5.204152249134948, + "grad_norm": 0.0, + "learning_rate": 7.945277952301811e-06, + "loss": 0.1098, + "step": 2256 + }, + { + "epoch": 5.206459054209919, + "grad_norm": 0.0, + "learning_rate": 7.937144905368226e-06, + "loss": 0.1163, + "step": 2257 + }, + { + "epoch": 5.20876585928489, + "grad_norm": 0.0, + "learning_rate": 7.929013283336141e-06, + "loss": 0.119, + "step": 2258 + }, + { + "epoch": 5.211072664359862, + "grad_norm": 0.0, + "learning_rate": 7.92088309182241e-06, + "loss": 0.094, + "step": 2259 + }, + { + "epoch": 5.213379469434833, + "grad_norm": 0.0, + "learning_rate": 7.912754336442897e-06, + "loss": 0.1079, + "step": 2260 + }, + { + "epoch": 5.215686274509804, + "grad_norm": 0.0, + "learning_rate": 7.904627022812484e-06, + "loss": 0.1326, + "step": 2261 + }, + { + "epoch": 5.217993079584775, + "grad_norm": 0.0, + "learning_rate": 7.896501156545044e-06, + "loss": 0.151, + "step": 2262 + }, + { + "epoch": 5.220299884659746, + "grad_norm": 0.0, + "learning_rate": 7.888376743253462e-06, + "loss": 0.1152, + "step": 2263 + }, + { + "epoch": 5.222606689734717, + "grad_norm": 0.0, + "learning_rate": 7.88025378854962e-06, + "loss": 0.1926, + "step": 2264 + }, + { + "epoch": 5.224913494809688, + "grad_norm": 0.0, + "learning_rate": 7.872132298044382e-06, + "loss": 0.1437, + "step": 2265 + }, + { + "epoch": 5.22722029988466, + "grad_norm": 0.0, + "learning_rate": 7.864012277347602e-06, + "loss": 0.0927, + "step": 2266 + }, + { + "epoch": 5.229527104959631, + "grad_norm": 0.0, + "learning_rate": 7.855893732068124e-06, + "loss": 0.0738, + "step": 2267 + }, + { + "epoch": 5.231833910034602, + "grad_norm": 0.0, + "learning_rate": 7.847776667813782e-06, + "loss": 0.055, + "step": 2268 + }, + { + "epoch": 5.234140715109573, + "grad_norm": 0.0, + "learning_rate": 7.839661090191362e-06, + "loss": 0.0677, + "step": 2269 + }, + { + "epoch": 5.236447520184544, + "grad_norm": 0.0, + "learning_rate": 7.831547004806647e-06, + "loss": 0.1054, + "step": 2270 + }, + { + "epoch": 5.2387543252595155, + "grad_norm": 0.0, + "learning_rate": 7.823434417264378e-06, + "loss": 0.1636, + "step": 2271 + }, + { + "epoch": 5.2410611303344865, + "grad_norm": 0.0, + "learning_rate": 7.815323333168262e-06, + "loss": 0.099, + "step": 2272 + }, + { + "epoch": 5.243367935409458, + "grad_norm": 0.0, + "learning_rate": 7.807213758120965e-06, + "loss": 0.0882, + "step": 2273 + }, + { + "epoch": 5.245674740484429, + "grad_norm": 0.0, + "learning_rate": 7.799105697724127e-06, + "loss": 0.1198, + "step": 2274 + }, + { + "epoch": 5.2479815455594006, + "grad_norm": 0.0, + "learning_rate": 7.790999157578314e-06, + "loss": 0.0729, + "step": 2275 + }, + { + "epoch": 5.250288350634372, + "grad_norm": 0.0, + "learning_rate": 7.782894143283065e-06, + "loss": 0.1131, + "step": 2276 + }, + { + "epoch": 5.252595155709343, + "grad_norm": 0.0, + "learning_rate": 7.774790660436857e-06, + "loss": 0.0498, + "step": 2277 + }, + { + "epoch": 5.254901960784314, + "grad_norm": 0.0, + "learning_rate": 7.766688714637109e-06, + "loss": 0.1083, + "step": 2278 + }, + { + "epoch": 5.257208765859285, + "grad_norm": 0.0, + "learning_rate": 7.758588311480174e-06, + "loss": 0.0884, + "step": 2279 + }, + { + "epoch": 5.259515570934256, + "grad_norm": 0.0, + "learning_rate": 7.750489456561351e-06, + "loss": 0.1061, + "step": 2280 + }, + { + "epoch": 5.261822376009227, + "grad_norm": 0.0, + "learning_rate": 7.742392155474858e-06, + "loss": 0.1311, + "step": 2281 + }, + { + "epoch": 5.264129181084199, + "grad_norm": 0.0, + "learning_rate": 7.734296413813847e-06, + "loss": 0.0854, + "step": 2282 + }, + { + "epoch": 5.26643598615917, + "grad_norm": 0.0, + "learning_rate": 7.726202237170387e-06, + "loss": 0.1119, + "step": 2283 + }, + { + "epoch": 5.268742791234141, + "grad_norm": 0.0, + "learning_rate": 7.718109631135472e-06, + "loss": 0.156, + "step": 2284 + }, + { + "epoch": 5.271049596309112, + "grad_norm": 0.0, + "learning_rate": 7.710018601299004e-06, + "loss": 0.0985, + "step": 2285 + }, + { + "epoch": 5.273356401384083, + "grad_norm": 0.0, + "learning_rate": 7.701929153249808e-06, + "loss": 0.1588, + "step": 2286 + }, + { + "epoch": 5.275663206459054, + "grad_norm": 0.0, + "learning_rate": 7.6938412925756e-06, + "loss": 0.1022, + "step": 2287 + }, + { + "epoch": 5.277970011534025, + "grad_norm": 0.0, + "learning_rate": 7.685755024863013e-06, + "loss": 0.1446, + "step": 2288 + }, + { + "epoch": 5.280276816608996, + "grad_norm": 0.0, + "learning_rate": 7.677670355697577e-06, + "loss": 0.1797, + "step": 2289 + }, + { + "epoch": 5.282583621683968, + "grad_norm": 0.0, + "learning_rate": 7.669587290663711e-06, + "loss": 0.1068, + "step": 2290 + }, + { + "epoch": 5.284890426758939, + "grad_norm": 0.0, + "learning_rate": 7.661505835344733e-06, + "loss": 0.1554, + "step": 2291 + }, + { + "epoch": 5.28719723183391, + "grad_norm": 0.0, + "learning_rate": 7.653425995322852e-06, + "loss": 0.151, + "step": 2292 + }, + { + "epoch": 5.289504036908881, + "grad_norm": 0.0, + "learning_rate": 7.645347776179144e-06, + "loss": 0.1269, + "step": 2293 + }, + { + "epoch": 5.291810841983852, + "grad_norm": 0.0, + "learning_rate": 7.637271183493587e-06, + "loss": 0.1839, + "step": 2294 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 0.0, + "learning_rate": 7.629196222845027e-06, + "loss": 0.1341, + "step": 2295 + }, + { + "epoch": 5.296424452133794, + "grad_norm": 0.0, + "learning_rate": 7.621122899811177e-06, + "loss": 0.0932, + "step": 2296 + }, + { + "epoch": 5.2987312572087655, + "grad_norm": 0.0, + "learning_rate": 7.613051219968624e-06, + "loss": 0.1643, + "step": 2297 + }, + { + "epoch": 5.301038062283737, + "grad_norm": 0.0, + "learning_rate": 7.6049811888928235e-06, + "loss": 0.1563, + "step": 2298 + }, + { + "epoch": 5.3033448673587085, + "grad_norm": 0.0, + "learning_rate": 7.596912812158083e-06, + "loss": 0.162, + "step": 2299 + }, + { + "epoch": 5.3056516724336795, + "grad_norm": 0.0, + "learning_rate": 7.588846095337574e-06, + "loss": 0.1715, + "step": 2300 + }, + { + "epoch": 5.307958477508651, + "grad_norm": 0.0, + "learning_rate": 7.580781044003324e-06, + "loss": 0.143, + "step": 2301 + }, + { + "epoch": 5.310265282583622, + "grad_norm": 0.0, + "learning_rate": 7.5727176637262034e-06, + "loss": 0.1004, + "step": 2302 + }, + { + "epoch": 5.312572087658593, + "grad_norm": 0.0, + "learning_rate": 7.564655960075927e-06, + "loss": 0.1638, + "step": 2303 + }, + { + "epoch": 5.314878892733564, + "grad_norm": 0.0, + "learning_rate": 7.556595938621058e-06, + "loss": 0.1063, + "step": 2304 + }, + { + "epoch": 5.317185697808535, + "grad_norm": 0.0, + "learning_rate": 7.5485376049290014e-06, + "loss": 0.0884, + "step": 2305 + }, + { + "epoch": 5.319492502883507, + "grad_norm": 0.0, + "learning_rate": 7.540480964565981e-06, + "loss": 0.1348, + "step": 2306 + }, + { + "epoch": 5.321799307958478, + "grad_norm": 0.0, + "learning_rate": 7.532426023097063e-06, + "loss": 0.0931, + "step": 2307 + }, + { + "epoch": 5.324106113033449, + "grad_norm": 0.0, + "learning_rate": 7.524372786086143e-06, + "loss": 0.1618, + "step": 2308 + }, + { + "epoch": 5.32641291810842, + "grad_norm": 0.0, + "learning_rate": 7.516321259095921e-06, + "loss": 0.1083, + "step": 2309 + }, + { + "epoch": 5.328719723183391, + "grad_norm": 0.0, + "learning_rate": 7.508271447687936e-06, + "loss": 0.1479, + "step": 2310 + }, + { + "epoch": 5.331026528258362, + "grad_norm": 0.0, + "learning_rate": 7.500223357422537e-06, + "loss": 0.1094, + "step": 2311 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.0, + "learning_rate": 7.492176993858873e-06, + "loss": 0.1584, + "step": 2312 + }, + { + "epoch": 5.335640138408304, + "grad_norm": 0.0, + "learning_rate": 7.484132362554915e-06, + "loss": 0.1474, + "step": 2313 + }, + { + "epoch": 5.337946943483276, + "grad_norm": 0.0, + "learning_rate": 7.476089469067432e-06, + "loss": 0.1665, + "step": 2314 + }, + { + "epoch": 5.340253748558247, + "grad_norm": 0.0, + "learning_rate": 7.468048318951983e-06, + "loss": 0.1302, + "step": 2315 + }, + { + "epoch": 5.342560553633218, + "grad_norm": 0.0, + "learning_rate": 7.4600089177629384e-06, + "loss": 0.1754, + "step": 2316 + }, + { + "epoch": 5.344867358708189, + "grad_norm": 0.0, + "learning_rate": 7.451971271053455e-06, + "loss": 0.1228, + "step": 2317 + }, + { + "epoch": 5.34717416378316, + "grad_norm": 0.0, + "learning_rate": 7.4439353843754715e-06, + "loss": 0.1626, + "step": 2318 + }, + { + "epoch": 5.349480968858131, + "grad_norm": 0.0, + "learning_rate": 7.435901263279717e-06, + "loss": 0.0944, + "step": 2319 + }, + { + "epoch": 5.351787773933102, + "grad_norm": 0.0, + "learning_rate": 7.4278689133157034e-06, + "loss": 0.1013, + "step": 2320 + }, + { + "epoch": 5.354094579008073, + "grad_norm": 0.0, + "learning_rate": 7.419838340031709e-06, + "loss": 0.1058, + "step": 2321 + }, + { + "epoch": 5.356401384083045, + "grad_norm": 0.0, + "learning_rate": 7.411809548974792e-06, + "loss": 0.1163, + "step": 2322 + }, + { + "epoch": 5.358708189158016, + "grad_norm": 0.0, + "learning_rate": 7.403782545690787e-06, + "loss": 0.1901, + "step": 2323 + }, + { + "epoch": 5.361014994232987, + "grad_norm": 0.0, + "learning_rate": 7.395757335724276e-06, + "loss": 0.1381, + "step": 2324 + }, + { + "epoch": 5.3633217993079585, + "grad_norm": 0.0, + "learning_rate": 7.387733924618617e-06, + "loss": 0.1357, + "step": 2325 + }, + { + "epoch": 5.3656286043829295, + "grad_norm": 0.0, + "learning_rate": 7.3797123179159225e-06, + "loss": 0.0372, + "step": 2326 + }, + { + "epoch": 5.367935409457901, + "grad_norm": 0.0, + "learning_rate": 7.371692521157048e-06, + "loss": 0.06, + "step": 2327 + }, + { + "epoch": 5.370242214532872, + "grad_norm": 0.0, + "learning_rate": 7.3636745398816135e-06, + "loss": 0.1012, + "step": 2328 + }, + { + "epoch": 5.372549019607844, + "grad_norm": 0.0, + "learning_rate": 7.355658379627981e-06, + "loss": 0.1568, + "step": 2329 + }, + { + "epoch": 5.374855824682815, + "grad_norm": 0.0, + "learning_rate": 7.347644045933244e-06, + "loss": 0.1249, + "step": 2330 + }, + { + "epoch": 5.377162629757786, + "grad_norm": 0.0, + "learning_rate": 7.33963154433325e-06, + "loss": 0.1521, + "step": 2331 + }, + { + "epoch": 5.379469434832757, + "grad_norm": 0.0, + "learning_rate": 7.331620880362571e-06, + "loss": 0.0785, + "step": 2332 + }, + { + "epoch": 5.381776239907728, + "grad_norm": 0.0, + "learning_rate": 7.323612059554514e-06, + "loss": 0.1795, + "step": 2333 + }, + { + "epoch": 5.384083044982699, + "grad_norm": 0.0, + "learning_rate": 7.315605087441107e-06, + "loss": 0.1333, + "step": 2334 + }, + { + "epoch": 5.38638985005767, + "grad_norm": 0.0, + "learning_rate": 7.307599969553111e-06, + "loss": 0.1032, + "step": 2335 + }, + { + "epoch": 5.388696655132641, + "grad_norm": 0.0, + "learning_rate": 7.299596711419994e-06, + "loss": 0.1417, + "step": 2336 + }, + { + "epoch": 5.391003460207612, + "grad_norm": 0.0, + "learning_rate": 7.291595318569951e-06, + "loss": 0.173, + "step": 2337 + }, + { + "epoch": 5.393310265282584, + "grad_norm": 0.0, + "learning_rate": 7.2835957965298805e-06, + "loss": 0.0894, + "step": 2338 + }, + { + "epoch": 5.395617070357555, + "grad_norm": 0.0, + "learning_rate": 7.2755981508253935e-06, + "loss": 0.1691, + "step": 2339 + }, + { + "epoch": 5.397923875432526, + "grad_norm": 0.0, + "learning_rate": 7.267602386980801e-06, + "loss": 0.1301, + "step": 2340 + }, + { + "epoch": 5.400230680507497, + "grad_norm": 0.0, + "learning_rate": 7.259608510519121e-06, + "loss": 0.0999, + "step": 2341 + }, + { + "epoch": 5.402537485582468, + "grad_norm": 0.0, + "learning_rate": 7.2516165269620534e-06, + "loss": 0.1334, + "step": 2342 + }, + { + "epoch": 5.404844290657439, + "grad_norm": 0.0, + "learning_rate": 7.243626441830009e-06, + "loss": 0.2163, + "step": 2343 + }, + { + "epoch": 5.40715109573241, + "grad_norm": 0.0, + "learning_rate": 7.235638260642075e-06, + "loss": 0.129, + "step": 2344 + }, + { + "epoch": 5.409457900807382, + "grad_norm": 0.0, + "learning_rate": 7.227651988916032e-06, + "loss": 0.149, + "step": 2345 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 0.0, + "learning_rate": 7.219667632168326e-06, + "loss": 0.1241, + "step": 2346 + }, + { + "epoch": 5.414071510957324, + "grad_norm": 0.0, + "learning_rate": 7.2116851959140965e-06, + "loss": 0.1807, + "step": 2347 + }, + { + "epoch": 5.416378316032295, + "grad_norm": 0.0, + "learning_rate": 7.203704685667156e-06, + "loss": 0.0846, + "step": 2348 + }, + { + "epoch": 5.418685121107266, + "grad_norm": 0.0, + "learning_rate": 7.1957261069399745e-06, + "loss": 0.1518, + "step": 2349 + }, + { + "epoch": 5.4209919261822375, + "grad_norm": 0.0, + "learning_rate": 7.187749465243694e-06, + "loss": 0.1478, + "step": 2350 + }, + { + "epoch": 5.4232987312572085, + "grad_norm": 0.0, + "learning_rate": 7.179774766088127e-06, + "loss": 0.1471, + "step": 2351 + }, + { + "epoch": 5.42560553633218, + "grad_norm": 0.0, + "learning_rate": 7.171802014981726e-06, + "loss": 0.1405, + "step": 2352 + }, + { + "epoch": 5.4279123414071515, + "grad_norm": 0.0, + "learning_rate": 7.163831217431615e-06, + "loss": 0.1103, + "step": 2353 + }, + { + "epoch": 5.430219146482123, + "grad_norm": 0.0, + "learning_rate": 7.1558623789435634e-06, + "loss": 0.0927, + "step": 2354 + }, + { + "epoch": 5.432525951557094, + "grad_norm": 0.0, + "learning_rate": 7.14789550502198e-06, + "loss": 0.1815, + "step": 2355 + }, + { + "epoch": 5.434832756632065, + "grad_norm": 0.0, + "learning_rate": 7.139930601169926e-06, + "loss": 0.1077, + "step": 2356 + }, + { + "epoch": 5.437139561707036, + "grad_norm": 0.0, + "learning_rate": 7.131967672889101e-06, + "loss": 0.144, + "step": 2357 + }, + { + "epoch": 5.439446366782007, + "grad_norm": 0.0, + "learning_rate": 7.124006725679828e-06, + "loss": 0.1472, + "step": 2358 + }, + { + "epoch": 5.441753171856978, + "grad_norm": 0.0, + "learning_rate": 7.116047765041078e-06, + "loss": 0.1114, + "step": 2359 + }, + { + "epoch": 5.444059976931949, + "grad_norm": 0.0, + "learning_rate": 7.108090796470446e-06, + "loss": 0.1308, + "step": 2360 + }, + { + "epoch": 5.446366782006921, + "grad_norm": 0.0, + "learning_rate": 7.100135825464138e-06, + "loss": 0.1317, + "step": 2361 + }, + { + "epoch": 5.448673587081892, + "grad_norm": 0.0, + "learning_rate": 7.092182857516998e-06, + "loss": 0.1821, + "step": 2362 + }, + { + "epoch": 5.450980392156863, + "grad_norm": 0.0, + "learning_rate": 7.084231898122478e-06, + "loss": 0.1191, + "step": 2363 + }, + { + "epoch": 5.453287197231834, + "grad_norm": 0.0, + "learning_rate": 7.076282952772634e-06, + "loss": 0.1682, + "step": 2364 + }, + { + "epoch": 5.455594002306805, + "grad_norm": 0.0, + "learning_rate": 7.0683360269581465e-06, + "loss": 0.1238, + "step": 2365 + }, + { + "epoch": 5.457900807381776, + "grad_norm": 0.0, + "learning_rate": 7.060391126168297e-06, + "loss": 0.1263, + "step": 2366 + }, + { + "epoch": 5.460207612456747, + "grad_norm": 0.0, + "learning_rate": 7.052448255890958e-06, + "loss": 0.12, + "step": 2367 + }, + { + "epoch": 5.462514417531718, + "grad_norm": 0.0, + "learning_rate": 7.044507421612613e-06, + "loss": 0.094, + "step": 2368 + }, + { + "epoch": 5.46482122260669, + "grad_norm": 0.0, + "learning_rate": 7.036568628818332e-06, + "loss": 0.1063, + "step": 2369 + }, + { + "epoch": 5.467128027681661, + "grad_norm": 0.0, + "learning_rate": 7.028631882991771e-06, + "loss": 0.158, + "step": 2370 + }, + { + "epoch": 5.469434832756632, + "grad_norm": 0.0, + "learning_rate": 7.02069718961518e-06, + "loss": 0.1242, + "step": 2371 + }, + { + "epoch": 5.471741637831603, + "grad_norm": 0.0, + "learning_rate": 7.012764554169393e-06, + "loss": 0.1032, + "step": 2372 + }, + { + "epoch": 5.474048442906574, + "grad_norm": 0.0, + "learning_rate": 7.004833982133808e-06, + "loss": 0.1063, + "step": 2373 + }, + { + "epoch": 5.476355247981545, + "grad_norm": 0.0, + "learning_rate": 6.996905478986415e-06, + "loss": 0.145, + "step": 2374 + }, + { + "epoch": 5.478662053056516, + "grad_norm": 0.0, + "learning_rate": 6.988979050203769e-06, + "loss": 0.0888, + "step": 2375 + }, + { + "epoch": 5.4809688581314875, + "grad_norm": 0.0, + "learning_rate": 6.981054701260981e-06, + "loss": 0.1449, + "step": 2376 + }, + { + "epoch": 5.483275663206459, + "grad_norm": 0.0, + "learning_rate": 6.973132437631743e-06, + "loss": 0.193, + "step": 2377 + }, + { + "epoch": 5.4855824682814305, + "grad_norm": 0.0, + "learning_rate": 6.9652122647882966e-06, + "loss": 0.2291, + "step": 2378 + }, + { + "epoch": 5.4878892733564015, + "grad_norm": 0.0, + "learning_rate": 6.957294188201438e-06, + "loss": 0.1084, + "step": 2379 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 0.0, + "learning_rate": 6.949378213340522e-06, + "loss": 0.1257, + "step": 2380 + }, + { + "epoch": 5.492502883506344, + "grad_norm": 0.0, + "learning_rate": 6.94146434567345e-06, + "loss": 0.1278, + "step": 2381 + }, + { + "epoch": 5.494809688581315, + "grad_norm": 0.0, + "learning_rate": 6.933552590666659e-06, + "loss": 0.0772, + "step": 2382 + }, + { + "epoch": 5.497116493656286, + "grad_norm": 0.0, + "learning_rate": 6.9256429537851365e-06, + "loss": 0.1118, + "step": 2383 + }, + { + "epoch": 5.499423298731257, + "grad_norm": 0.0, + "learning_rate": 6.917735440492407e-06, + "loss": 0.1075, + "step": 2384 + }, + { + "epoch": 5.501730103806229, + "grad_norm": 0.0, + "learning_rate": 6.909830056250527e-06, + "loss": 0.143, + "step": 2385 + }, + { + "epoch": 5.5040369088812, + "grad_norm": 0.0, + "learning_rate": 6.9019268065200765e-06, + "loss": 0.0998, + "step": 2386 + }, + { + "epoch": 5.506343713956171, + "grad_norm": 0.0, + "learning_rate": 6.8940256967601625e-06, + "loss": 0.1115, + "step": 2387 + }, + { + "epoch": 5.508650519031142, + "grad_norm": 0.0, + "learning_rate": 6.886126732428424e-06, + "loss": 0.1694, + "step": 2388 + }, + { + "epoch": 5.510957324106113, + "grad_norm": 0.0, + "learning_rate": 6.878229918981003e-06, + "loss": 0.0988, + "step": 2389 + }, + { + "epoch": 5.513264129181084, + "grad_norm": 0.0, + "learning_rate": 6.870335261872569e-06, + "loss": 0.2149, + "step": 2390 + }, + { + "epoch": 5.515570934256055, + "grad_norm": 0.0, + "learning_rate": 6.862442766556297e-06, + "loss": 0.1279, + "step": 2391 + }, + { + "epoch": 5.517877739331027, + "grad_norm": 0.0, + "learning_rate": 6.854552438483866e-06, + "loss": 0.1479, + "step": 2392 + }, + { + "epoch": 5.520184544405998, + "grad_norm": 0.0, + "learning_rate": 6.846664283105455e-06, + "loss": 0.0917, + "step": 2393 + }, + { + "epoch": 5.522491349480969, + "grad_norm": 0.0, + "learning_rate": 6.83877830586976e-06, + "loss": 0.117, + "step": 2394 + }, + { + "epoch": 5.52479815455594, + "grad_norm": 0.0, + "learning_rate": 6.830894512223947e-06, + "loss": 0.1405, + "step": 2395 + }, + { + "epoch": 5.527104959630911, + "grad_norm": 0.0, + "learning_rate": 6.823012907613691e-06, + "loss": 0.1338, + "step": 2396 + }, + { + "epoch": 5.529411764705882, + "grad_norm": 0.0, + "learning_rate": 6.815133497483157e-06, + "loss": 0.0751, + "step": 2397 + }, + { + "epoch": 5.531718569780853, + "grad_norm": 0.0, + "learning_rate": 6.807256287274981e-06, + "loss": 0.1442, + "step": 2398 + }, + { + "epoch": 5.534025374855824, + "grad_norm": 0.0, + "learning_rate": 6.799381282430284e-06, + "loss": 0.0848, + "step": 2399 + }, + { + "epoch": 5.536332179930795, + "grad_norm": 0.0, + "learning_rate": 6.791508488388675e-06, + "loss": 0.1538, + "step": 2400 + }, + { + "epoch": 5.538638985005767, + "grad_norm": 0.0, + "learning_rate": 6.783637910588216e-06, + "loss": 0.1288, + "step": 2401 + }, + { + "epoch": 5.540945790080738, + "grad_norm": 0.0, + "learning_rate": 6.775769554465455e-06, + "loss": 0.0907, + "step": 2402 + }, + { + "epoch": 5.5432525951557095, + "grad_norm": 0.0, + "learning_rate": 6.767903425455402e-06, + "loss": 0.1576, + "step": 2403 + }, + { + "epoch": 5.5455594002306805, + "grad_norm": 0.0, + "learning_rate": 6.76003952899152e-06, + "loss": 0.142, + "step": 2404 + }, + { + "epoch": 5.5478662053056516, + "grad_norm": 0.0, + "learning_rate": 6.752177870505736e-06, + "loss": 0.1113, + "step": 2405 + }, + { + "epoch": 5.550173010380623, + "grad_norm": 0.0, + "learning_rate": 6.744318455428436e-06, + "loss": 0.0818, + "step": 2406 + }, + { + "epoch": 5.552479815455594, + "grad_norm": 0.0, + "learning_rate": 6.736461289188445e-06, + "loss": 0.1264, + "step": 2407 + }, + { + "epoch": 5.554786620530566, + "grad_norm": 0.0, + "learning_rate": 6.728606377213045e-06, + "loss": 0.0775, + "step": 2408 + }, + { + "epoch": 5.557093425605537, + "grad_norm": 0.0, + "learning_rate": 6.720753724927957e-06, + "loss": 0.1695, + "step": 2409 + }, + { + "epoch": 5.559400230680508, + "grad_norm": 0.0, + "learning_rate": 6.712903337757339e-06, + "loss": 0.0953, + "step": 2410 + }, + { + "epoch": 5.561707035755479, + "grad_norm": 0.0, + "learning_rate": 6.705055221123788e-06, + "loss": 0.1091, + "step": 2411 + }, + { + "epoch": 5.56401384083045, + "grad_norm": 0.0, + "learning_rate": 6.697209380448333e-06, + "loss": 0.1006, + "step": 2412 + }, + { + "epoch": 5.566320645905421, + "grad_norm": 0.0, + "learning_rate": 6.689365821150421e-06, + "loss": 0.1257, + "step": 2413 + }, + { + "epoch": 5.568627450980392, + "grad_norm": 0.0, + "learning_rate": 6.681524548647936e-06, + "loss": 0.1336, + "step": 2414 + }, + { + "epoch": 5.570934256055363, + "grad_norm": 0.0, + "learning_rate": 6.673685568357182e-06, + "loss": 0.111, + "step": 2415 + }, + { + "epoch": 5.573241061130334, + "grad_norm": 0.0, + "learning_rate": 6.665848885692867e-06, + "loss": 0.1131, + "step": 2416 + }, + { + "epoch": 5.575547866205306, + "grad_norm": 0.0, + "learning_rate": 6.6580145060681255e-06, + "loss": 0.152, + "step": 2417 + }, + { + "epoch": 5.577854671280277, + "grad_norm": 0.0, + "learning_rate": 6.650182434894496e-06, + "loss": 0.1047, + "step": 2418 + }, + { + "epoch": 5.580161476355248, + "grad_norm": 0.0, + "learning_rate": 6.642352677581917e-06, + "loss": 0.1094, + "step": 2419 + }, + { + "epoch": 5.582468281430219, + "grad_norm": 0.0, + "learning_rate": 6.634525239538736e-06, + "loss": 0.1242, + "step": 2420 + }, + { + "epoch": 5.58477508650519, + "grad_norm": 0.0, + "learning_rate": 6.6267001261717015e-06, + "loss": 0.1919, + "step": 2421 + }, + { + "epoch": 5.587081891580161, + "grad_norm": 0.0, + "learning_rate": 6.618877342885945e-06, + "loss": 0.1693, + "step": 2422 + }, + { + "epoch": 5.589388696655132, + "grad_norm": 0.0, + "learning_rate": 6.611056895084997e-06, + "loss": 0.2294, + "step": 2423 + }, + { + "epoch": 5.591695501730104, + "grad_norm": 0.0, + "learning_rate": 6.603238788170771e-06, + "loss": 0.1217, + "step": 2424 + }, + { + "epoch": 5.594002306805075, + "grad_norm": 0.0, + "learning_rate": 6.595423027543572e-06, + "loss": 0.1884, + "step": 2425 + }, + { + "epoch": 5.596309111880046, + "grad_norm": 0.0, + "learning_rate": 6.587609618602065e-06, + "loss": 0.1357, + "step": 2426 + }, + { + "epoch": 5.598615916955017, + "grad_norm": 0.0, + "learning_rate": 6.579798566743314e-06, + "loss": 0.1135, + "step": 2427 + }, + { + "epoch": 5.600922722029988, + "grad_norm": 0.0, + "learning_rate": 6.571989877362738e-06, + "loss": 0.1454, + "step": 2428 + }, + { + "epoch": 5.6032295271049595, + "grad_norm": 0.0, + "learning_rate": 6.5641835558541314e-06, + "loss": 0.0846, + "step": 2429 + }, + { + "epoch": 5.6055363321799305, + "grad_norm": 0.0, + "learning_rate": 6.5563796076096484e-06, + "loss": 0.1248, + "step": 2430 + }, + { + "epoch": 5.607843137254902, + "grad_norm": 0.0, + "learning_rate": 6.548578038019815e-06, + "loss": 0.1146, + "step": 2431 + }, + { + "epoch": 5.610149942329873, + "grad_norm": 0.0, + "learning_rate": 6.540778852473497e-06, + "loss": 0.1195, + "step": 2432 + }, + { + "epoch": 5.612456747404845, + "grad_norm": 0.0, + "learning_rate": 6.532982056357928e-06, + "loss": 0.1321, + "step": 2433 + }, + { + "epoch": 5.614763552479816, + "grad_norm": 0.0, + "learning_rate": 6.525187655058687e-06, + "loss": 0.1762, + "step": 2434 + }, + { + "epoch": 5.617070357554787, + "grad_norm": 0.0, + "learning_rate": 6.517395653959694e-06, + "loss": 0.1277, + "step": 2435 + }, + { + "epoch": 5.619377162629758, + "grad_norm": 0.0, + "learning_rate": 6.5096060584432134e-06, + "loss": 0.0624, + "step": 2436 + }, + { + "epoch": 5.621683967704729, + "grad_norm": 0.0, + "learning_rate": 6.501818873889856e-06, + "loss": 0.092, + "step": 2437 + }, + { + "epoch": 5.6239907727797, + "grad_norm": 0.0, + "learning_rate": 6.494034105678551e-06, + "loss": 0.1774, + "step": 2438 + }, + { + "epoch": 5.626297577854672, + "grad_norm": 0.0, + "learning_rate": 6.486251759186573e-06, + "loss": 0.074, + "step": 2439 + }, + { + "epoch": 5.628604382929643, + "grad_norm": 0.0, + "learning_rate": 6.478471839789522e-06, + "loss": 0.1198, + "step": 2440 + }, + { + "epoch": 5.630911188004614, + "grad_norm": 0.0, + "learning_rate": 6.4706943528613135e-06, + "loss": 0.1509, + "step": 2441 + }, + { + "epoch": 5.633217993079585, + "grad_norm": 0.0, + "learning_rate": 6.462919303774186e-06, + "loss": 0.1143, + "step": 2442 + }, + { + "epoch": 5.635524798154556, + "grad_norm": 0.0, + "learning_rate": 6.455146697898703e-06, + "loss": 0.079, + "step": 2443 + }, + { + "epoch": 5.637831603229527, + "grad_norm": 0.0, + "learning_rate": 6.447376540603725e-06, + "loss": 0.1958, + "step": 2444 + }, + { + "epoch": 5.640138408304498, + "grad_norm": 0.0, + "learning_rate": 6.439608837256432e-06, + "loss": 0.1881, + "step": 2445 + }, + { + "epoch": 5.642445213379469, + "grad_norm": 0.0, + "learning_rate": 6.4318435932223115e-06, + "loss": 0.1026, + "step": 2446 + }, + { + "epoch": 5.64475201845444, + "grad_norm": 0.0, + "learning_rate": 6.424080813865139e-06, + "loss": 0.0794, + "step": 2447 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 0.0, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.0691, + "step": 2448 + }, + { + "epoch": 5.649365628604383, + "grad_norm": 0.0, + "learning_rate": 6.408562670628267e-06, + "loss": 0.1581, + "step": 2449 + }, + { + "epoch": 5.651672433679354, + "grad_norm": 0.0, + "learning_rate": 6.400807317467604e-06, + "loss": 0.0775, + "step": 2450 + }, + { + "epoch": 5.653979238754325, + "grad_norm": 0.0, + "learning_rate": 6.393054450421963e-06, + "loss": 0.1166, + "step": 2451 + }, + { + "epoch": 5.656286043829296, + "grad_norm": 0.0, + "learning_rate": 6.3853040748465855e-06, + "loss": 0.1114, + "step": 2452 + }, + { + "epoch": 5.658592848904267, + "grad_norm": 0.0, + "learning_rate": 6.377556196094974e-06, + "loss": 0.0991, + "step": 2453 + }, + { + "epoch": 5.660899653979238, + "grad_norm": 0.0, + "learning_rate": 6.36981081951892e-06, + "loss": 0.1253, + "step": 2454 + }, + { + "epoch": 5.66320645905421, + "grad_norm": 0.0, + "learning_rate": 6.362067950468489e-06, + "loss": 0.1908, + "step": 2455 + }, + { + "epoch": 5.665513264129181, + "grad_norm": 0.0, + "learning_rate": 6.3543275942920004e-06, + "loss": 0.1327, + "step": 2456 + }, + { + "epoch": 5.6678200692041525, + "grad_norm": 0.0, + "learning_rate": 6.34658975633605e-06, + "loss": 0.1737, + "step": 2457 + }, + { + "epoch": 5.6701268742791235, + "grad_norm": 0.0, + "learning_rate": 6.338854441945495e-06, + "loss": 0.0993, + "step": 2458 + }, + { + "epoch": 5.672433679354095, + "grad_norm": 0.0, + "learning_rate": 6.331121656463441e-06, + "loss": 0.1192, + "step": 2459 + }, + { + "epoch": 5.674740484429066, + "grad_norm": 0.0, + "learning_rate": 6.32339140523125e-06, + "loss": 0.1421, + "step": 2460 + }, + { + "epoch": 5.677047289504037, + "grad_norm": 0.0, + "learning_rate": 6.3156636935885344e-06, + "loss": 0.0705, + "step": 2461 + }, + { + "epoch": 5.679354094579008, + "grad_norm": 0.0, + "learning_rate": 6.3079385268731575e-06, + "loss": 0.1405, + "step": 2462 + }, + { + "epoch": 5.681660899653979, + "grad_norm": 0.0, + "learning_rate": 6.300215910421212e-06, + "loss": 0.1079, + "step": 2463 + }, + { + "epoch": 5.683967704728951, + "grad_norm": 0.0, + "learning_rate": 6.292495849567042e-06, + "loss": 0.0928, + "step": 2464 + }, + { + "epoch": 5.686274509803922, + "grad_norm": 0.0, + "learning_rate": 6.284778349643221e-06, + "loss": 0.1646, + "step": 2465 + }, + { + "epoch": 5.688581314878893, + "grad_norm": 0.0, + "learning_rate": 6.277063415980549e-06, + "loss": 0.1286, + "step": 2466 + }, + { + "epoch": 5.690888119953864, + "grad_norm": 0.0, + "learning_rate": 6.269351053908061e-06, + "loss": 0.1082, + "step": 2467 + }, + { + "epoch": 5.693194925028835, + "grad_norm": 0.0, + "learning_rate": 6.2616412687530145e-06, + "loss": 0.147, + "step": 2468 + }, + { + "epoch": 5.695501730103806, + "grad_norm": 0.0, + "learning_rate": 6.25393406584088e-06, + "loss": 0.1253, + "step": 2469 + }, + { + "epoch": 5.697808535178777, + "grad_norm": 0.0, + "learning_rate": 6.246229450495354e-06, + "loss": 0.1705, + "step": 2470 + }, + { + "epoch": 5.700115340253749, + "grad_norm": 0.0, + "learning_rate": 6.238527428038339e-06, + "loss": 0.1485, + "step": 2471 + }, + { + "epoch": 5.70242214532872, + "grad_norm": 0.0, + "learning_rate": 6.230828003789949e-06, + "loss": 0.132, + "step": 2472 + }, + { + "epoch": 5.704728950403691, + "grad_norm": 0.0, + "learning_rate": 6.2231311830684995e-06, + "loss": 0.1727, + "step": 2473 + }, + { + "epoch": 5.707035755478662, + "grad_norm": 0.0, + "learning_rate": 6.215436971190518e-06, + "loss": 0.1436, + "step": 2474 + }, + { + "epoch": 5.709342560553633, + "grad_norm": 0.0, + "learning_rate": 6.207745373470717e-06, + "loss": 0.1542, + "step": 2475 + }, + { + "epoch": 5.711649365628604, + "grad_norm": 0.0, + "learning_rate": 6.200056395222012e-06, + "loss": 0.1702, + "step": 2476 + }, + { + "epoch": 5.713956170703575, + "grad_norm": 0.0, + "learning_rate": 6.192370041755505e-06, + "loss": 0.0644, + "step": 2477 + }, + { + "epoch": 5.716262975778546, + "grad_norm": 0.0, + "learning_rate": 6.184686318380488e-06, + "loss": 0.2113, + "step": 2478 + }, + { + "epoch": 5.718569780853517, + "grad_norm": 0.0, + "learning_rate": 6.177005230404431e-06, + "loss": 0.1338, + "step": 2479 + }, + { + "epoch": 5.720876585928489, + "grad_norm": 0.0, + "learning_rate": 6.169326783132994e-06, + "loss": 0.1048, + "step": 2480 + }, + { + "epoch": 5.72318339100346, + "grad_norm": 0.0, + "learning_rate": 6.1616509818699975e-06, + "loss": 0.1198, + "step": 2481 + }, + { + "epoch": 5.7254901960784315, + "grad_norm": 0.0, + "learning_rate": 6.153977831917451e-06, + "loss": 0.159, + "step": 2482 + }, + { + "epoch": 5.7277970011534025, + "grad_norm": 0.0, + "learning_rate": 6.146307338575519e-06, + "loss": 0.1333, + "step": 2483 + }, + { + "epoch": 5.730103806228374, + "grad_norm": 0.0, + "learning_rate": 6.138639507142539e-06, + "loss": 0.1079, + "step": 2484 + }, + { + "epoch": 5.732410611303345, + "grad_norm": 0.0, + "learning_rate": 6.1309743429150045e-06, + "loss": 0.1, + "step": 2485 + }, + { + "epoch": 5.734717416378316, + "grad_norm": 0.0, + "learning_rate": 6.1233118511875765e-06, + "loss": 0.1193, + "step": 2486 + }, + { + "epoch": 5.737024221453288, + "grad_norm": 0.0, + "learning_rate": 6.115652037253054e-06, + "loss": 0.0936, + "step": 2487 + }, + { + "epoch": 5.739331026528259, + "grad_norm": 0.0, + "learning_rate": 6.107994906402401e-06, + "loss": 0.1722, + "step": 2488 + }, + { + "epoch": 5.74163783160323, + "grad_norm": 0.0, + "learning_rate": 6.1003404639247234e-06, + "loss": 0.1396, + "step": 2489 + }, + { + "epoch": 5.743944636678201, + "grad_norm": 0.0, + "learning_rate": 6.092688715107265e-06, + "loss": 0.129, + "step": 2490 + }, + { + "epoch": 5.746251441753172, + "grad_norm": 0.0, + "learning_rate": 6.085039665235413e-06, + "loss": 0.1404, + "step": 2491 + }, + { + "epoch": 5.748558246828143, + "grad_norm": 0.0, + "learning_rate": 6.077393319592697e-06, + "loss": 0.0678, + "step": 2492 + }, + { + "epoch": 5.750865051903114, + "grad_norm": 0.0, + "learning_rate": 6.069749683460765e-06, + "loss": 0.2005, + "step": 2493 + }, + { + "epoch": 5.753171856978085, + "grad_norm": 0.0, + "learning_rate": 6.062108762119403e-06, + "loss": 0.1355, + "step": 2494 + }, + { + "epoch": 5.755478662053056, + "grad_norm": 0.0, + "learning_rate": 6.054470560846524e-06, + "loss": 0.1392, + "step": 2495 + }, + { + "epoch": 5.757785467128028, + "grad_norm": 0.0, + "learning_rate": 6.046835084918152e-06, + "loss": 0.0526, + "step": 2496 + }, + { + "epoch": 5.760092272202999, + "grad_norm": 0.0, + "learning_rate": 6.039202339608432e-06, + "loss": 0.124, + "step": 2497 + }, + { + "epoch": 5.76239907727797, + "grad_norm": 0.0, + "learning_rate": 6.031572330189635e-06, + "loss": 0.0895, + "step": 2498 + }, + { + "epoch": 5.764705882352941, + "grad_norm": 0.0, + "learning_rate": 6.023945061932119e-06, + "loss": 0.135, + "step": 2499 + }, + { + "epoch": 5.767012687427912, + "grad_norm": 0.0, + "learning_rate": 6.016320540104369e-06, + "loss": 0.1245, + "step": 2500 + }, + { + "epoch": 5.769319492502883, + "grad_norm": 0.0, + "learning_rate": 6.008698769972967e-06, + "loss": 0.0985, + "step": 2501 + }, + { + "epoch": 5.771626297577855, + "grad_norm": 0.0, + "learning_rate": 6.001079756802592e-06, + "loss": 0.0832, + "step": 2502 + }, + { + "epoch": 5.773933102652826, + "grad_norm": 0.0, + "learning_rate": 5.993463505856015e-06, + "loss": 0.1478, + "step": 2503 + }, + { + "epoch": 5.776239907727797, + "grad_norm": 0.0, + "learning_rate": 5.9858500223941066e-06, + "loss": 0.1114, + "step": 2504 + }, + { + "epoch": 5.778546712802768, + "grad_norm": 0.0, + "learning_rate": 5.978239311675826e-06, + "loss": 0.1913, + "step": 2505 + }, + { + "epoch": 5.780853517877739, + "grad_norm": 0.0, + "learning_rate": 5.970631378958208e-06, + "loss": 0.1255, + "step": 2506 + }, + { + "epoch": 5.78316032295271, + "grad_norm": 0.0, + "learning_rate": 5.963026229496378e-06, + "loss": 0.1287, + "step": 2507 + }, + { + "epoch": 5.7854671280276815, + "grad_norm": 0.0, + "learning_rate": 5.955423868543537e-06, + "loss": 0.1836, + "step": 2508 + }, + { + "epoch": 5.7877739331026525, + "grad_norm": 0.0, + "learning_rate": 5.94782430135095e-06, + "loss": 0.1376, + "step": 2509 + }, + { + "epoch": 5.790080738177624, + "grad_norm": 0.0, + "learning_rate": 5.940227533167966e-06, + "loss": 0.1178, + "step": 2510 + }, + { + "epoch": 5.7923875432525955, + "grad_norm": 0.0, + "learning_rate": 5.932633569242e-06, + "loss": 0.161, + "step": 2511 + }, + { + "epoch": 5.794694348327567, + "grad_norm": 0.0, + "learning_rate": 5.925042414818514e-06, + "loss": 0.1216, + "step": 2512 + }, + { + "epoch": 5.797001153402538, + "grad_norm": 0.0, + "learning_rate": 5.917454075141049e-06, + "loss": 0.0988, + "step": 2513 + }, + { + "epoch": 5.799307958477509, + "grad_norm": 0.0, + "learning_rate": 5.909868555451191e-06, + "loss": 0.1342, + "step": 2514 + }, + { + "epoch": 5.80161476355248, + "grad_norm": 0.0, + "learning_rate": 5.902285860988576e-06, + "loss": 0.1284, + "step": 2515 + }, + { + "epoch": 5.803921568627451, + "grad_norm": 0.0, + "learning_rate": 5.8947059969908945e-06, + "loss": 0.102, + "step": 2516 + }, + { + "epoch": 5.806228373702422, + "grad_norm": 0.0, + "learning_rate": 5.887128968693887e-06, + "loss": 0.1243, + "step": 2517 + }, + { + "epoch": 5.808535178777394, + "grad_norm": 0.0, + "learning_rate": 5.879554781331317e-06, + "loss": 0.1588, + "step": 2518 + }, + { + "epoch": 5.810841983852365, + "grad_norm": 0.0, + "learning_rate": 5.871983440135005e-06, + "loss": 0.1552, + "step": 2519 + }, + { + "epoch": 5.813148788927336, + "grad_norm": 0.0, + "learning_rate": 5.864414950334796e-06, + "loss": 0.1465, + "step": 2520 + }, + { + "epoch": 5.815455594002307, + "grad_norm": 0.0, + "learning_rate": 5.8568493171585625e-06, + "loss": 0.1216, + "step": 2521 + }, + { + "epoch": 5.817762399077278, + "grad_norm": 0.0, + "learning_rate": 5.849286545832211e-06, + "loss": 0.1127, + "step": 2522 + }, + { + "epoch": 5.820069204152249, + "grad_norm": 0.0, + "learning_rate": 5.8417266415796745e-06, + "loss": 0.1564, + "step": 2523 + }, + { + "epoch": 5.82237600922722, + "grad_norm": 0.0, + "learning_rate": 5.83416960962289e-06, + "loss": 0.1462, + "step": 2524 + }, + { + "epoch": 5.824682814302191, + "grad_norm": 0.0, + "learning_rate": 5.8266154551818225e-06, + "loss": 0.1546, + "step": 2525 + }, + { + "epoch": 5.826989619377162, + "grad_norm": 0.0, + "learning_rate": 5.819064183474451e-06, + "loss": 0.138, + "step": 2526 + }, + { + "epoch": 5.829296424452134, + "grad_norm": 0.0, + "learning_rate": 5.811515799716754e-06, + "loss": 0.1563, + "step": 2527 + }, + { + "epoch": 5.831603229527105, + "grad_norm": 0.0, + "learning_rate": 5.80397030912272e-06, + "loss": 0.096, + "step": 2528 + }, + { + "epoch": 5.833910034602076, + "grad_norm": 0.0, + "learning_rate": 5.796427716904347e-06, + "loss": 0.1868, + "step": 2529 + }, + { + "epoch": 5.836216839677047, + "grad_norm": 0.0, + "learning_rate": 5.7888880282716155e-06, + "loss": 0.1338, + "step": 2530 + }, + { + "epoch": 5.838523644752018, + "grad_norm": 0.0, + "learning_rate": 5.78135124843251e-06, + "loss": 0.1738, + "step": 2531 + }, + { + "epoch": 5.840830449826989, + "grad_norm": 0.0, + "learning_rate": 5.773817382593008e-06, + "loss": 0.1124, + "step": 2532 + }, + { + "epoch": 5.8431372549019605, + "grad_norm": 0.0, + "learning_rate": 5.766286435957063e-06, + "loss": 0.1062, + "step": 2533 + }, + { + "epoch": 5.845444059976932, + "grad_norm": 0.0, + "learning_rate": 5.758758413726626e-06, + "loss": 0.1081, + "step": 2534 + }, + { + "epoch": 5.8477508650519034, + "grad_norm": 0.0, + "learning_rate": 5.751233321101617e-06, + "loss": 0.0829, + "step": 2535 + }, + { + "epoch": 5.8500576701268745, + "grad_norm": 0.0, + "learning_rate": 5.743711163279941e-06, + "loss": 0.1177, + "step": 2536 + }, + { + "epoch": 5.8523644752018456, + "grad_norm": 0.0, + "learning_rate": 5.736191945457463e-06, + "loss": 0.0796, + "step": 2537 + }, + { + "epoch": 5.854671280276817, + "grad_norm": 0.0, + "learning_rate": 5.728675672828037e-06, + "loss": 0.1223, + "step": 2538 + }, + { + "epoch": 5.856978085351788, + "grad_norm": 0.0, + "learning_rate": 5.72116235058346e-06, + "loss": 0.1488, + "step": 2539 + }, + { + "epoch": 5.859284890426759, + "grad_norm": 0.0, + "learning_rate": 5.713651983913506e-06, + "loss": 0.1481, + "step": 2540 + }, + { + "epoch": 5.86159169550173, + "grad_norm": 0.0, + "learning_rate": 5.706144578005908e-06, + "loss": 0.1741, + "step": 2541 + }, + { + "epoch": 5.863898500576701, + "grad_norm": 0.0, + "learning_rate": 5.698640138046349e-06, + "loss": 0.0745, + "step": 2542 + }, + { + "epoch": 5.866205305651673, + "grad_norm": 0.0, + "learning_rate": 5.69113866921846e-06, + "loss": 0.1262, + "step": 2543 + }, + { + "epoch": 5.868512110726644, + "grad_norm": 0.0, + "learning_rate": 5.683640176703824e-06, + "loss": 0.1171, + "step": 2544 + }, + { + "epoch": 5.870818915801615, + "grad_norm": 0.0, + "learning_rate": 5.6761446656819745e-06, + "loss": 0.0939, + "step": 2545 + }, + { + "epoch": 5.873125720876586, + "grad_norm": 0.0, + "learning_rate": 5.668652141330373e-06, + "loss": 0.1156, + "step": 2546 + }, + { + "epoch": 5.875432525951557, + "grad_norm": 0.0, + "learning_rate": 5.66116260882442e-06, + "loss": 0.1162, + "step": 2547 + }, + { + "epoch": 5.877739331026528, + "grad_norm": 0.0, + "learning_rate": 5.653676073337462e-06, + "loss": 0.1024, + "step": 2548 + }, + { + "epoch": 5.880046136101499, + "grad_norm": 0.0, + "learning_rate": 5.646192540040758e-06, + "loss": 0.0845, + "step": 2549 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 0.0, + "learning_rate": 5.638712014103507e-06, + "loss": 0.1197, + "step": 2550 + }, + { + "epoch": 5.884659746251442, + "grad_norm": 0.0, + "learning_rate": 5.631234500692828e-06, + "loss": 0.0753, + "step": 2551 + }, + { + "epoch": 5.886966551326413, + "grad_norm": 0.0, + "learning_rate": 5.623760004973749e-06, + "loss": 0.1028, + "step": 2552 + }, + { + "epoch": 5.889273356401384, + "grad_norm": 0.0, + "learning_rate": 5.616288532109225e-06, + "loss": 0.0449, + "step": 2553 + }, + { + "epoch": 5.891580161476355, + "grad_norm": 0.0, + "learning_rate": 5.608820087260125e-06, + "loss": 0.1727, + "step": 2554 + }, + { + "epoch": 5.893886966551326, + "grad_norm": 0.0, + "learning_rate": 5.6013546755852086e-06, + "loss": 0.1287, + "step": 2555 + }, + { + "epoch": 5.896193771626297, + "grad_norm": 0.0, + "learning_rate": 5.5938923022411615e-06, + "loss": 0.1475, + "step": 2556 + }, + { + "epoch": 5.898500576701268, + "grad_norm": 0.0, + "learning_rate": 5.586432972382561e-06, + "loss": 0.114, + "step": 2557 + }, + { + "epoch": 5.900807381776239, + "grad_norm": 0.0, + "learning_rate": 5.578976691161877e-06, + "loss": 0.1175, + "step": 2558 + }, + { + "epoch": 5.903114186851211, + "grad_norm": 0.0, + "learning_rate": 5.571523463729487e-06, + "loss": 0.1512, + "step": 2559 + }, + { + "epoch": 5.905420991926182, + "grad_norm": 0.0, + "learning_rate": 5.564073295233645e-06, + "loss": 0.1316, + "step": 2560 + }, + { + "epoch": 5.9077277970011535, + "grad_norm": 0.0, + "learning_rate": 5.556626190820497e-06, + "loss": 0.0906, + "step": 2561 + }, + { + "epoch": 5.9100346020761245, + "grad_norm": 0.0, + "learning_rate": 5.549182155634076e-06, + "loss": 0.0759, + "step": 2562 + }, + { + "epoch": 5.912341407151096, + "grad_norm": 0.0, + "learning_rate": 5.541741194816299e-06, + "loss": 0.1043, + "step": 2563 + }, + { + "epoch": 5.914648212226067, + "grad_norm": 0.0, + "learning_rate": 5.5343033135069434e-06, + "loss": 0.0509, + "step": 2564 + }, + { + "epoch": 5.916955017301038, + "grad_norm": 0.0, + "learning_rate": 5.526868516843673e-06, + "loss": 0.076, + "step": 2565 + }, + { + "epoch": 5.91926182237601, + "grad_norm": 0.0, + "learning_rate": 5.519436809962024e-06, + "loss": 0.1838, + "step": 2566 + }, + { + "epoch": 5.921568627450981, + "grad_norm": 0.0, + "learning_rate": 5.512008197995379e-06, + "loss": 0.2291, + "step": 2567 + }, + { + "epoch": 5.923875432525952, + "grad_norm": 0.0, + "learning_rate": 5.504582686075002e-06, + "loss": 0.1515, + "step": 2568 + }, + { + "epoch": 5.926182237600923, + "grad_norm": 0.0, + "learning_rate": 5.4971602793300134e-06, + "loss": 0.1934, + "step": 2569 + }, + { + "epoch": 5.928489042675894, + "grad_norm": 0.0, + "learning_rate": 5.4897409828873745e-06, + "loss": 0.1412, + "step": 2570 + }, + { + "epoch": 5.930795847750865, + "grad_norm": 0.0, + "learning_rate": 5.4823248018719184e-06, + "loss": 0.085, + "step": 2571 + }, + { + "epoch": 5.933102652825836, + "grad_norm": 0.0, + "learning_rate": 5.47491174140631e-06, + "loss": 0.092, + "step": 2572 + }, + { + "epoch": 5.935409457900807, + "grad_norm": 0.0, + "learning_rate": 5.467501806611062e-06, + "loss": 0.1537, + "step": 2573 + }, + { + "epoch": 5.937716262975779, + "grad_norm": 0.0, + "learning_rate": 5.460095002604533e-06, + "loss": 0.1386, + "step": 2574 + }, + { + "epoch": 5.94002306805075, + "grad_norm": 0.0, + "learning_rate": 5.452691334502922e-06, + "loss": 0.1335, + "step": 2575 + }, + { + "epoch": 5.942329873125721, + "grad_norm": 0.0, + "learning_rate": 5.445290807420247e-06, + "loss": 0.1451, + "step": 2576 + }, + { + "epoch": 5.944636678200692, + "grad_norm": 0.0, + "learning_rate": 5.43789342646837e-06, + "loss": 0.123, + "step": 2577 + }, + { + "epoch": 5.946943483275663, + "grad_norm": 0.0, + "learning_rate": 5.430499196756977e-06, + "loss": 0.1452, + "step": 2578 + }, + { + "epoch": 5.949250288350634, + "grad_norm": 0.0, + "learning_rate": 5.423108123393581e-06, + "loss": 0.0929, + "step": 2579 + }, + { + "epoch": 5.951557093425605, + "grad_norm": 0.0, + "learning_rate": 5.415720211483499e-06, + "loss": 0.1697, + "step": 2580 + }, + { + "epoch": 5.953863898500577, + "grad_norm": 0.0, + "learning_rate": 5.4083354661298816e-06, + "loss": 0.1125, + "step": 2581 + }, + { + "epoch": 5.956170703575548, + "grad_norm": 0.0, + "learning_rate": 5.4009538924336864e-06, + "loss": 0.0718, + "step": 2582 + }, + { + "epoch": 5.958477508650519, + "grad_norm": 0.0, + "learning_rate": 5.393575495493679e-06, + "loss": 0.0888, + "step": 2583 + }, + { + "epoch": 5.96078431372549, + "grad_norm": 0.0, + "learning_rate": 5.386200280406426e-06, + "loss": 0.1225, + "step": 2584 + }, + { + "epoch": 5.963091118800461, + "grad_norm": 0.0, + "learning_rate": 5.3788282522663085e-06, + "loss": 0.1321, + "step": 2585 + }, + { + "epoch": 5.965397923875432, + "grad_norm": 0.0, + "learning_rate": 5.37145941616549e-06, + "loss": 0.1045, + "step": 2586 + }, + { + "epoch": 5.9677047289504035, + "grad_norm": 0.0, + "learning_rate": 5.364093777193944e-06, + "loss": 0.1374, + "step": 2587 + }, + { + "epoch": 5.9700115340253745, + "grad_norm": 0.0, + "learning_rate": 5.356731340439432e-06, + "loss": 0.093, + "step": 2588 + }, + { + "epoch": 5.972318339100346, + "grad_norm": 0.0, + "learning_rate": 5.349372110987496e-06, + "loss": 0.1599, + "step": 2589 + }, + { + "epoch": 5.9746251441753175, + "grad_norm": 0.0, + "learning_rate": 5.342016093921469e-06, + "loss": 0.0876, + "step": 2590 + }, + { + "epoch": 5.976931949250289, + "grad_norm": 0.0, + "learning_rate": 5.33466329432247e-06, + "loss": 0.1862, + "step": 2591 + }, + { + "epoch": 5.97923875432526, + "grad_norm": 0.0, + "learning_rate": 5.32731371726938e-06, + "loss": 0.0925, + "step": 2592 + }, + { + "epoch": 5.981545559400231, + "grad_norm": 0.0, + "learning_rate": 5.319967367838868e-06, + "loss": 0.1385, + "step": 2593 + }, + { + "epoch": 5.983852364475202, + "grad_norm": 0.0, + "learning_rate": 5.312624251105374e-06, + "loss": 0.1227, + "step": 2594 + }, + { + "epoch": 5.986159169550173, + "grad_norm": 0.0, + "learning_rate": 5.305284372141095e-06, + "loss": 0.1392, + "step": 2595 + }, + { + "epoch": 5.988465974625144, + "grad_norm": 0.0, + "learning_rate": 5.297947736015994e-06, + "loss": 0.1249, + "step": 2596 + }, + { + "epoch": 5.990772779700116, + "grad_norm": 0.0, + "learning_rate": 5.290614347797802e-06, + "loss": 0.2109, + "step": 2597 + }, + { + "epoch": 5.993079584775087, + "grad_norm": 0.0, + "learning_rate": 5.283284212551997e-06, + "loss": 0.0994, + "step": 2598 + }, + { + "epoch": 5.995386389850058, + "grad_norm": 0.0, + "learning_rate": 5.275957335341815e-06, + "loss": 0.0851, + "step": 2599 + }, + { + "epoch": 5.997693194925029, + "grad_norm": 0.0, + "learning_rate": 5.268633721228247e-06, + "loss": 0.0957, + "step": 2600 + }, + { + "epoch": 6.0, + "grad_norm": 0.0, + "learning_rate": 5.2613133752700145e-06, + "loss": 0.1511, + "step": 2601 + }, + { + "epoch": 6.002306805074971, + "grad_norm": 0.0, + "learning_rate": 5.253996302523596e-06, + "loss": 0.1146, + "step": 2602 + }, + { + "epoch": 6.004613610149942, + "grad_norm": 0.0, + "learning_rate": 5.246682508043206e-06, + "loss": 0.1119, + "step": 2603 + }, + { + "epoch": 6.006920415224913, + "grad_norm": 0.0, + "learning_rate": 5.239371996880786e-06, + "loss": 0.0804, + "step": 2604 + }, + { + "epoch": 6.009227220299885, + "grad_norm": 0.0, + "learning_rate": 5.232064774086022e-06, + "loss": 0.0861, + "step": 2605 + }, + { + "epoch": 6.011534025374856, + "grad_norm": 0.0, + "learning_rate": 5.224760844706324e-06, + "loss": 0.0499, + "step": 2606 + }, + { + "epoch": 6.013840830449827, + "grad_norm": 0.0, + "learning_rate": 5.217460213786822e-06, + "loss": 0.0922, + "step": 2607 + }, + { + "epoch": 6.016147635524798, + "grad_norm": 0.0, + "learning_rate": 5.210162886370367e-06, + "loss": 0.0639, + "step": 2608 + }, + { + "epoch": 6.018454440599769, + "grad_norm": 0.0, + "learning_rate": 5.202868867497542e-06, + "loss": 0.0369, + "step": 2609 + }, + { + "epoch": 6.02076124567474, + "grad_norm": 0.0, + "learning_rate": 5.195578162206627e-06, + "loss": 0.0579, + "step": 2610 + }, + { + "epoch": 6.023068050749711, + "grad_norm": 0.0, + "learning_rate": 5.188290775533624e-06, + "loss": 0.065, + "step": 2611 + }, + { + "epoch": 6.0253748558246825, + "grad_norm": 0.0, + "learning_rate": 5.181006712512245e-06, + "loss": 0.0972, + "step": 2612 + }, + { + "epoch": 6.027681660899654, + "grad_norm": 0.0, + "learning_rate": 5.1737259781738934e-06, + "loss": 0.0671, + "step": 2613 + }, + { + "epoch": 6.0299884659746255, + "grad_norm": 0.0, + "learning_rate": 5.1664485775476844e-06, + "loss": 0.0759, + "step": 2614 + }, + { + "epoch": 6.0322952710495965, + "grad_norm": 0.0, + "learning_rate": 5.159174515660432e-06, + "loss": 0.1878, + "step": 2615 + }, + { + "epoch": 6.034602076124568, + "grad_norm": 0.0, + "learning_rate": 5.151903797536631e-06, + "loss": 0.0727, + "step": 2616 + }, + { + "epoch": 6.036908881199539, + "grad_norm": 0.0, + "learning_rate": 5.144636428198477e-06, + "loss": 0.0616, + "step": 2617 + }, + { + "epoch": 6.03921568627451, + "grad_norm": 0.0, + "learning_rate": 5.137372412665857e-06, + "loss": 0.0811, + "step": 2618 + }, + { + "epoch": 6.041522491349481, + "grad_norm": 0.0, + "learning_rate": 5.130111755956327e-06, + "loss": 0.0916, + "step": 2619 + }, + { + "epoch": 6.043829296424452, + "grad_norm": 0.0, + "learning_rate": 5.122854463085136e-06, + "loss": 0.1091, + "step": 2620 + }, + { + "epoch": 6.046136101499424, + "grad_norm": 0.0, + "learning_rate": 5.115600539065197e-06, + "loss": 0.1055, + "step": 2621 + }, + { + "epoch": 6.048442906574395, + "grad_norm": 0.0, + "learning_rate": 5.108349988907111e-06, + "loss": 0.0608, + "step": 2622 + }, + { + "epoch": 6.050749711649366, + "grad_norm": 0.0, + "learning_rate": 5.101102817619132e-06, + "loss": 0.0828, + "step": 2623 + }, + { + "epoch": 6.053056516724337, + "grad_norm": 0.0, + "learning_rate": 5.093859030207192e-06, + "loss": 0.0786, + "step": 2624 + }, + { + "epoch": 6.055363321799308, + "grad_norm": 0.0, + "learning_rate": 5.086618631674888e-06, + "loss": 0.0395, + "step": 2625 + }, + { + "epoch": 6.057670126874279, + "grad_norm": 0.0, + "learning_rate": 5.079381627023461e-06, + "loss": 0.1019, + "step": 2626 + }, + { + "epoch": 6.05997693194925, + "grad_norm": 0.0, + "learning_rate": 5.072148021251822e-06, + "loss": 0.0662, + "step": 2627 + }, + { + "epoch": 6.062283737024221, + "grad_norm": 0.0, + "learning_rate": 5.064917819356532e-06, + "loss": 0.0503, + "step": 2628 + }, + { + "epoch": 6.064590542099193, + "grad_norm": 0.0, + "learning_rate": 5.057691026331792e-06, + "loss": 0.0785, + "step": 2629 + }, + { + "epoch": 6.066897347174164, + "grad_norm": 0.0, + "learning_rate": 5.05046764716946e-06, + "loss": 0.1151, + "step": 2630 + }, + { + "epoch": 6.069204152249135, + "grad_norm": 0.0, + "learning_rate": 5.043247686859024e-06, + "loss": 0.0487, + "step": 2631 + }, + { + "epoch": 6.071510957324106, + "grad_norm": 0.0, + "learning_rate": 5.036031150387624e-06, + "loss": 0.0943, + "step": 2632 + }, + { + "epoch": 6.073817762399077, + "grad_norm": 0.0, + "learning_rate": 5.0288180427400205e-06, + "loss": 0.0708, + "step": 2633 + }, + { + "epoch": 6.076124567474048, + "grad_norm": 0.0, + "learning_rate": 5.021608368898621e-06, + "loss": 0.1047, + "step": 2634 + }, + { + "epoch": 6.078431372549019, + "grad_norm": 0.0, + "learning_rate": 5.014402133843443e-06, + "loss": 0.0962, + "step": 2635 + }, + { + "epoch": 6.08073817762399, + "grad_norm": 0.0, + "learning_rate": 5.007199342552145e-06, + "loss": 0.053, + "step": 2636 + }, + { + "epoch": 6.083044982698962, + "grad_norm": 0.0, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0772, + "step": 2637 + }, + { + "epoch": 6.085351787773933, + "grad_norm": 0.0, + "learning_rate": 4.9928041111599e-06, + "loss": 0.094, + "step": 2638 + }, + { + "epoch": 6.087658592848904, + "grad_norm": 0.0, + "learning_rate": 4.985611681002347e-06, + "loss": 0.072, + "step": 2639 + }, + { + "epoch": 6.0899653979238755, + "grad_norm": 0.0, + "learning_rate": 4.978422714495465e-06, + "loss": 0.0748, + "step": 2640 + }, + { + "epoch": 6.0922722029988465, + "grad_norm": 0.0, + "learning_rate": 4.971237216604967e-06, + "loss": 0.085, + "step": 2641 + }, + { + "epoch": 6.094579008073818, + "grad_norm": 0.0, + "learning_rate": 4.964055192294187e-06, + "loss": 0.0766, + "step": 2642 + }, + { + "epoch": 6.096885813148789, + "grad_norm": 0.0, + "learning_rate": 4.956876646524059e-06, + "loss": 0.0768, + "step": 2643 + }, + { + "epoch": 6.09919261822376, + "grad_norm": 0.0, + "learning_rate": 4.949701584253103e-06, + "loss": 0.1084, + "step": 2644 + }, + { + "epoch": 6.101499423298732, + "grad_norm": 0.0, + "learning_rate": 4.942530010437435e-06, + "loss": 0.1116, + "step": 2645 + }, + { + "epoch": 6.103806228373703, + "grad_norm": 0.0, + "learning_rate": 4.935361930030774e-06, + "loss": 0.0626, + "step": 2646 + }, + { + "epoch": 6.106113033448674, + "grad_norm": 0.0, + "learning_rate": 4.92819734798441e-06, + "loss": 0.054, + "step": 2647 + }, + { + "epoch": 6.108419838523645, + "grad_norm": 0.0, + "learning_rate": 4.921036269247225e-06, + "loss": 0.0926, + "step": 2648 + }, + { + "epoch": 6.110726643598616, + "grad_norm": 0.0, + "learning_rate": 4.9138786987656865e-06, + "loss": 0.0991, + "step": 2649 + }, + { + "epoch": 6.113033448673587, + "grad_norm": 0.0, + "learning_rate": 4.906724641483822e-06, + "loss": 0.065, + "step": 2650 + }, + { + "epoch": 6.115340253748558, + "grad_norm": 0.0, + "learning_rate": 4.899574102343247e-06, + "loss": 0.0881, + "step": 2651 + }, + { + "epoch": 6.117647058823529, + "grad_norm": 0.0, + "learning_rate": 4.892427086283147e-06, + "loss": 0.0662, + "step": 2652 + }, + { + "epoch": 6.119953863898501, + "grad_norm": 0.0, + "learning_rate": 4.885283598240259e-06, + "loss": 0.0465, + "step": 2653 + }, + { + "epoch": 6.122260668973472, + "grad_norm": 0.0, + "learning_rate": 4.878143643148899e-06, + "loss": 0.0947, + "step": 2654 + }, + { + "epoch": 6.124567474048443, + "grad_norm": 0.0, + "learning_rate": 4.87100722594094e-06, + "loss": 0.0735, + "step": 2655 + }, + { + "epoch": 6.126874279123414, + "grad_norm": 0.0, + "learning_rate": 4.863874351545803e-06, + "loss": 0.1198, + "step": 2656 + }, + { + "epoch": 6.129181084198385, + "grad_norm": 0.0, + "learning_rate": 4.856745024890466e-06, + "loss": 0.0644, + "step": 2657 + }, + { + "epoch": 6.131487889273356, + "grad_norm": 0.0, + "learning_rate": 4.849619250899458e-06, + "loss": 0.0615, + "step": 2658 + }, + { + "epoch": 6.133794694348327, + "grad_norm": 0.0, + "learning_rate": 4.8424970344948585e-06, + "loss": 0.1107, + "step": 2659 + }, + { + "epoch": 6.136101499423299, + "grad_norm": 0.0, + "learning_rate": 4.8353783805962776e-06, + "loss": 0.0549, + "step": 2660 + }, + { + "epoch": 6.13840830449827, + "grad_norm": 0.0, + "learning_rate": 4.8282632941208725e-06, + "loss": 0.1303, + "step": 2661 + }, + { + "epoch": 6.140715109573241, + "grad_norm": 0.0, + "learning_rate": 4.821151779983343e-06, + "loss": 0.0795, + "step": 2662 + }, + { + "epoch": 6.143021914648212, + "grad_norm": 0.0, + "learning_rate": 4.814043843095903e-06, + "loss": 0.0781, + "step": 2663 + }, + { + "epoch": 6.145328719723183, + "grad_norm": 0.0, + "learning_rate": 4.806939488368308e-06, + "loss": 0.0848, + "step": 2664 + }, + { + "epoch": 6.1476355247981544, + "grad_norm": 0.0, + "learning_rate": 4.799838720707847e-06, + "loss": 0.0651, + "step": 2665 + }, + { + "epoch": 6.1499423298731255, + "grad_norm": 0.0, + "learning_rate": 4.792741545019307e-06, + "loss": 0.0796, + "step": 2666 + }, + { + "epoch": 6.1522491349480966, + "grad_norm": 0.0, + "learning_rate": 4.78564796620502e-06, + "loss": 0.1282, + "step": 2667 + }, + { + "epoch": 6.154555940023068, + "grad_norm": 0.0, + "learning_rate": 4.7785579891648185e-06, + "loss": 0.1346, + "step": 2668 + }, + { + "epoch": 6.1568627450980395, + "grad_norm": 0.0, + "learning_rate": 4.771471618796043e-06, + "loss": 0.1288, + "step": 2669 + }, + { + "epoch": 6.159169550173011, + "grad_norm": 0.0, + "learning_rate": 4.764388859993556e-06, + "loss": 0.1429, + "step": 2670 + }, + { + "epoch": 6.161476355247982, + "grad_norm": 0.0, + "learning_rate": 4.757309717649723e-06, + "loss": 0.0846, + "step": 2671 + }, + { + "epoch": 6.163783160322953, + "grad_norm": 0.0, + "learning_rate": 4.7502341966544e-06, + "loss": 0.0925, + "step": 2672 + }, + { + "epoch": 6.166089965397924, + "grad_norm": 0.0, + "learning_rate": 4.743162301894952e-06, + "loss": 0.082, + "step": 2673 + }, + { + "epoch": 6.168396770472895, + "grad_norm": 0.0, + "learning_rate": 4.736094038256244e-06, + "loss": 0.1284, + "step": 2674 + }, + { + "epoch": 6.170703575547866, + "grad_norm": 0.0, + "learning_rate": 4.729029410620615e-06, + "loss": 0.0698, + "step": 2675 + }, + { + "epoch": 6.173010380622838, + "grad_norm": 0.0, + "learning_rate": 4.7219684238679066e-06, + "loss": 0.1155, + "step": 2676 + }, + { + "epoch": 6.175317185697809, + "grad_norm": 0.0, + "learning_rate": 4.714911082875446e-06, + "loss": 0.0439, + "step": 2677 + }, + { + "epoch": 6.17762399077278, + "grad_norm": 0.0, + "learning_rate": 4.707857392518032e-06, + "loss": 0.076, + "step": 2678 + }, + { + "epoch": 6.179930795847751, + "grad_norm": 0.0, + "learning_rate": 4.700807357667953e-06, + "loss": 0.0895, + "step": 2679 + }, + { + "epoch": 6.182237600922722, + "grad_norm": 0.0, + "learning_rate": 4.693760983194959e-06, + "loss": 0.038, + "step": 2680 + }, + { + "epoch": 6.184544405997693, + "grad_norm": 0.0, + "learning_rate": 4.686718273966291e-06, + "loss": 0.0556, + "step": 2681 + }, + { + "epoch": 6.186851211072664, + "grad_norm": 0.0, + "learning_rate": 4.679679234846636e-06, + "loss": 0.0768, + "step": 2682 + }, + { + "epoch": 6.189158016147635, + "grad_norm": 0.0, + "learning_rate": 4.6726438706981644e-06, + "loss": 0.0554, + "step": 2683 + }, + { + "epoch": 6.191464821222607, + "grad_norm": 0.0, + "learning_rate": 4.665612186380495e-06, + "loss": 0.0765, + "step": 2684 + }, + { + "epoch": 6.193771626297578, + "grad_norm": 0.0, + "learning_rate": 4.658584186750713e-06, + "loss": 0.0688, + "step": 2685 + }, + { + "epoch": 6.196078431372549, + "grad_norm": 0.0, + "learning_rate": 4.65155987666336e-06, + "loss": 0.0694, + "step": 2686 + }, + { + "epoch": 6.19838523644752, + "grad_norm": 0.0, + "learning_rate": 4.644539260970417e-06, + "loss": 0.0388, + "step": 2687 + }, + { + "epoch": 6.200692041522491, + "grad_norm": 0.0, + "learning_rate": 4.637522344521323e-06, + "loss": 0.0528, + "step": 2688 + }, + { + "epoch": 6.202998846597462, + "grad_norm": 0.0, + "learning_rate": 4.630509132162967e-06, + "loss": 0.0554, + "step": 2689 + }, + { + "epoch": 6.205305651672433, + "grad_norm": 0.0, + "learning_rate": 4.623499628739663e-06, + "loss": 0.0824, + "step": 2690 + }, + { + "epoch": 6.2076124567474045, + "grad_norm": 0.0, + "learning_rate": 4.616493839093179e-06, + "loss": 0.1451, + "step": 2691 + }, + { + "epoch": 6.209919261822376, + "grad_norm": 0.0, + "learning_rate": 4.609491768062705e-06, + "loss": 0.1045, + "step": 2692 + }, + { + "epoch": 6.2122260668973475, + "grad_norm": 0.0, + "learning_rate": 4.6024934204848745e-06, + "loss": 0.1097, + "step": 2693 + }, + { + "epoch": 6.2145328719723185, + "grad_norm": 0.0, + "learning_rate": 4.595498801193736e-06, + "loss": 0.1139, + "step": 2694 + }, + { + "epoch": 6.21683967704729, + "grad_norm": 0.0, + "learning_rate": 4.588507915020778e-06, + "loss": 0.0265, + "step": 2695 + }, + { + "epoch": 6.219146482122261, + "grad_norm": 0.0, + "learning_rate": 4.581520766794893e-06, + "loss": 0.0559, + "step": 2696 + }, + { + "epoch": 6.221453287197232, + "grad_norm": 0.0, + "learning_rate": 4.5745373613424075e-06, + "loss": 0.0957, + "step": 2697 + }, + { + "epoch": 6.223760092272203, + "grad_norm": 0.0, + "learning_rate": 4.567557703487051e-06, + "loss": 0.0644, + "step": 2698 + }, + { + "epoch": 6.226066897347174, + "grad_norm": 0.0, + "learning_rate": 4.560581798049977e-06, + "loss": 0.0998, + "step": 2699 + }, + { + "epoch": 6.228373702422146, + "grad_norm": 0.0, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.0674, + "step": 2700 + }, + { + "epoch": 6.230680507497117, + "grad_norm": 0.0, + "learning_rate": 4.546641263702271e-06, + "loss": 0.0434, + "step": 2701 + }, + { + "epoch": 6.232987312572088, + "grad_norm": 0.0, + "learning_rate": 4.539676644420966e-06, + "loss": 0.0628, + "step": 2702 + }, + { + "epoch": 6.235294117647059, + "grad_norm": 0.0, + "learning_rate": 4.532715796816565e-06, + "loss": 0.0968, + "step": 2703 + }, + { + "epoch": 6.23760092272203, + "grad_norm": 0.0, + "learning_rate": 4.525758725697226e-06, + "loss": 0.0926, + "step": 2704 + }, + { + "epoch": 6.239907727797001, + "grad_norm": 0.0, + "learning_rate": 4.518805435868492e-06, + "loss": 0.0519, + "step": 2705 + }, + { + "epoch": 6.242214532871972, + "grad_norm": 0.0, + "learning_rate": 4.511855932133289e-06, + "loss": 0.0633, + "step": 2706 + }, + { + "epoch": 6.244521337946943, + "grad_norm": 0.0, + "learning_rate": 4.504910219291941e-06, + "loss": 0.1143, + "step": 2707 + }, + { + "epoch": 6.246828143021915, + "grad_norm": 0.0, + "learning_rate": 4.497968302142146e-06, + "loss": 0.0863, + "step": 2708 + }, + { + "epoch": 6.249134948096886, + "grad_norm": 0.0, + "learning_rate": 4.491030185478976e-06, + "loss": 0.093, + "step": 2709 + }, + { + "epoch": 6.251441753171857, + "grad_norm": 0.0, + "learning_rate": 4.484095874094885e-06, + "loss": 0.0933, + "step": 2710 + }, + { + "epoch": 6.253748558246828, + "grad_norm": 0.0, + "learning_rate": 4.4771653727797e-06, + "loss": 0.0907, + "step": 2711 + }, + { + "epoch": 6.256055363321799, + "grad_norm": 0.0, + "learning_rate": 4.470238686320606e-06, + "loss": 0.0515, + "step": 2712 + }, + { + "epoch": 6.25836216839677, + "grad_norm": 0.0, + "learning_rate": 4.46331581950216e-06, + "loss": 0.0852, + "step": 2713 + }, + { + "epoch": 6.260668973471741, + "grad_norm": 0.0, + "learning_rate": 4.4563967771062856e-06, + "loss": 0.0479, + "step": 2714 + }, + { + "epoch": 6.262975778546712, + "grad_norm": 0.0, + "learning_rate": 4.449481563912252e-06, + "loss": 0.0899, + "step": 2715 + }, + { + "epoch": 6.265282583621684, + "grad_norm": 0.0, + "learning_rate": 4.442570184696694e-06, + "loss": 0.0724, + "step": 2716 + }, + { + "epoch": 6.267589388696655, + "grad_norm": 0.0, + "learning_rate": 4.435662644233594e-06, + "loss": 0.0957, + "step": 2717 + }, + { + "epoch": 6.269896193771626, + "grad_norm": 0.0, + "learning_rate": 4.428758947294278e-06, + "loss": 0.0864, + "step": 2718 + }, + { + "epoch": 6.2722029988465975, + "grad_norm": 0.0, + "learning_rate": 4.4218590986474276e-06, + "loss": 0.0556, + "step": 2719 + }, + { + "epoch": 6.2745098039215685, + "grad_norm": 0.0, + "learning_rate": 4.4149631030590625e-06, + "loss": 0.0482, + "step": 2720 + }, + { + "epoch": 6.27681660899654, + "grad_norm": 0.0, + "learning_rate": 4.408070965292534e-06, + "loss": 0.0958, + "step": 2721 + }, + { + "epoch": 6.279123414071511, + "grad_norm": 0.0, + "learning_rate": 4.4011826901085346e-06, + "loss": 0.0468, + "step": 2722 + }, + { + "epoch": 6.281430219146483, + "grad_norm": 0.0, + "learning_rate": 4.394298282265095e-06, + "loss": 0.0719, + "step": 2723 + }, + { + "epoch": 6.283737024221454, + "grad_norm": 0.0, + "learning_rate": 4.387417746517557e-06, + "loss": 0.1054, + "step": 2724 + }, + { + "epoch": 6.286043829296425, + "grad_norm": 0.0, + "learning_rate": 4.380541087618606e-06, + "loss": 0.1578, + "step": 2725 + }, + { + "epoch": 6.288350634371396, + "grad_norm": 0.0, + "learning_rate": 4.373668310318243e-06, + "loss": 0.0914, + "step": 2726 + }, + { + "epoch": 6.290657439446367, + "grad_norm": 0.0, + "learning_rate": 4.3667994193637794e-06, + "loss": 0.0572, + "step": 2727 + }, + { + "epoch": 6.292964244521338, + "grad_norm": 0.0, + "learning_rate": 4.359934419499859e-06, + "loss": 0.1185, + "step": 2728 + }, + { + "epoch": 6.295271049596309, + "grad_norm": 0.0, + "learning_rate": 4.353073315468417e-06, + "loss": 0.0588, + "step": 2729 + }, + { + "epoch": 6.29757785467128, + "grad_norm": 0.0, + "learning_rate": 4.34621611200872e-06, + "loss": 0.1343, + "step": 2730 + }, + { + "epoch": 6.299884659746251, + "grad_norm": 0.0, + "learning_rate": 4.339362813857321e-06, + "loss": 0.0782, + "step": 2731 + }, + { + "epoch": 6.302191464821223, + "grad_norm": 0.0, + "learning_rate": 4.3325134257480905e-06, + "loss": 0.0931, + "step": 2732 + }, + { + "epoch": 6.304498269896194, + "grad_norm": 0.0, + "learning_rate": 4.325667952412184e-06, + "loss": 0.0548, + "step": 2733 + }, + { + "epoch": 6.306805074971165, + "grad_norm": 0.0, + "learning_rate": 4.318826398578063e-06, + "loss": 0.0935, + "step": 2734 + }, + { + "epoch": 6.309111880046136, + "grad_norm": 0.0, + "learning_rate": 4.311988768971484e-06, + "loss": 0.0824, + "step": 2735 + }, + { + "epoch": 6.311418685121107, + "grad_norm": 0.0, + "learning_rate": 4.305155068315481e-06, + "loss": 0.0388, + "step": 2736 + }, + { + "epoch": 6.313725490196078, + "grad_norm": 0.0, + "learning_rate": 4.298325301330383e-06, + "loss": 0.101, + "step": 2737 + }, + { + "epoch": 6.316032295271049, + "grad_norm": 0.0, + "learning_rate": 4.2914994727338e-06, + "loss": 0.0764, + "step": 2738 + }, + { + "epoch": 6.318339100346021, + "grad_norm": 0.0, + "learning_rate": 4.284677587240625e-06, + "loss": 0.0703, + "step": 2739 + }, + { + "epoch": 6.320645905420992, + "grad_norm": 0.0, + "learning_rate": 4.277859649563021e-06, + "loss": 0.0704, + "step": 2740 + }, + { + "epoch": 6.322952710495963, + "grad_norm": 0.0, + "learning_rate": 4.27104566441042e-06, + "loss": 0.0462, + "step": 2741 + }, + { + "epoch": 6.325259515570934, + "grad_norm": 0.0, + "learning_rate": 4.264235636489542e-06, + "loss": 0.0791, + "step": 2742 + }, + { + "epoch": 6.327566320645905, + "grad_norm": 0.0, + "learning_rate": 4.257429570504353e-06, + "loss": 0.1206, + "step": 2743 + }, + { + "epoch": 6.3298731257208765, + "grad_norm": 0.0, + "learning_rate": 4.250627471156094e-06, + "loss": 0.0771, + "step": 2744 + }, + { + "epoch": 6.3321799307958475, + "grad_norm": 0.0, + "learning_rate": 4.2438293431432665e-06, + "loss": 0.1001, + "step": 2745 + }, + { + "epoch": 6.334486735870819, + "grad_norm": 0.0, + "learning_rate": 4.237035191161621e-06, + "loss": 0.0646, + "step": 2746 + }, + { + "epoch": 6.33679354094579, + "grad_norm": 0.0, + "learning_rate": 4.23024501990417e-06, + "loss": 0.0403, + "step": 2747 + }, + { + "epoch": 6.339100346020762, + "grad_norm": 0.0, + "learning_rate": 4.223458834061175e-06, + "loss": 0.1073, + "step": 2748 + }, + { + "epoch": 6.341407151095733, + "grad_norm": 0.0, + "learning_rate": 4.216676638320135e-06, + "loss": 0.084, + "step": 2749 + }, + { + "epoch": 6.343713956170704, + "grad_norm": 0.0, + "learning_rate": 4.209898437365805e-06, + "loss": 0.0622, + "step": 2750 + }, + { + "epoch": 6.346020761245675, + "grad_norm": 0.0, + "learning_rate": 4.203124235880179e-06, + "loss": 0.0858, + "step": 2751 + }, + { + "epoch": 6.348327566320646, + "grad_norm": 0.0, + "learning_rate": 4.196354038542476e-06, + "loss": 0.1237, + "step": 2752 + }, + { + "epoch": 6.350634371395617, + "grad_norm": 0.0, + "learning_rate": 4.189587850029169e-06, + "loss": 0.082, + "step": 2753 + }, + { + "epoch": 6.352941176470588, + "grad_norm": 0.0, + "learning_rate": 4.182825675013945e-06, + "loss": 0.0733, + "step": 2754 + }, + { + "epoch": 6.35524798154556, + "grad_norm": 0.0, + "learning_rate": 4.176067518167723e-06, + "loss": 0.0226, + "step": 2755 + }, + { + "epoch": 6.357554786620531, + "grad_norm": 0.0, + "learning_rate": 4.169313384158653e-06, + "loss": 0.0708, + "step": 2756 + }, + { + "epoch": 6.359861591695502, + "grad_norm": 0.0, + "learning_rate": 4.162563277652104e-06, + "loss": 0.0794, + "step": 2757 + }, + { + "epoch": 6.362168396770473, + "grad_norm": 0.0, + "learning_rate": 4.1558172033106535e-06, + "loss": 0.1407, + "step": 2758 + }, + { + "epoch": 6.364475201845444, + "grad_norm": 0.0, + "learning_rate": 4.1490751657941055e-06, + "loss": 0.1244, + "step": 2759 + }, + { + "epoch": 6.366782006920415, + "grad_norm": 0.0, + "learning_rate": 4.142337169759472e-06, + "loss": 0.0783, + "step": 2760 + }, + { + "epoch": 6.369088811995386, + "grad_norm": 0.0, + "learning_rate": 4.135603219860971e-06, + "loss": 0.0665, + "step": 2761 + }, + { + "epoch": 6.371395617070357, + "grad_norm": 0.0, + "learning_rate": 4.128873320750027e-06, + "loss": 0.0866, + "step": 2762 + }, + { + "epoch": 6.373702422145329, + "grad_norm": 0.0, + "learning_rate": 4.12214747707527e-06, + "loss": 0.0985, + "step": 2763 + }, + { + "epoch": 6.3760092272203, + "grad_norm": 0.0, + "learning_rate": 4.1154256934825195e-06, + "loss": 0.0969, + "step": 2764 + }, + { + "epoch": 6.378316032295271, + "grad_norm": 0.0, + "learning_rate": 4.108707974614804e-06, + "loss": 0.0619, + "step": 2765 + }, + { + "epoch": 6.380622837370242, + "grad_norm": 0.0, + "learning_rate": 4.101994325112332e-06, + "loss": 0.0491, + "step": 2766 + }, + { + "epoch": 6.382929642445213, + "grad_norm": 0.0, + "learning_rate": 4.095284749612504e-06, + "loss": 0.0789, + "step": 2767 + }, + { + "epoch": 6.385236447520184, + "grad_norm": 0.0, + "learning_rate": 4.0885792527499094e-06, + "loss": 0.0918, + "step": 2768 + }, + { + "epoch": 6.387543252595155, + "grad_norm": 0.0, + "learning_rate": 4.0818778391563255e-06, + "loss": 0.0719, + "step": 2769 + }, + { + "epoch": 6.3898500576701265, + "grad_norm": 0.0, + "learning_rate": 4.075180513460695e-06, + "loss": 0.0826, + "step": 2770 + }, + { + "epoch": 6.392156862745098, + "grad_norm": 0.0, + "learning_rate": 4.068487280289146e-06, + "loss": 0.1064, + "step": 2771 + }, + { + "epoch": 6.3944636678200695, + "grad_norm": 0.0, + "learning_rate": 4.061798144264986e-06, + "loss": 0.0847, + "step": 2772 + }, + { + "epoch": 6.3967704728950405, + "grad_norm": 0.0, + "learning_rate": 4.055113110008675e-06, + "loss": 0.1167, + "step": 2773 + }, + { + "epoch": 6.399077277970012, + "grad_norm": 0.0, + "learning_rate": 4.048432182137855e-06, + "loss": 0.1066, + "step": 2774 + }, + { + "epoch": 6.401384083044983, + "grad_norm": 0.0, + "learning_rate": 4.041755365267323e-06, + "loss": 0.1408, + "step": 2775 + }, + { + "epoch": 6.403690888119954, + "grad_norm": 0.0, + "learning_rate": 4.0350826640090475e-06, + "loss": 0.0874, + "step": 2776 + }, + { + "epoch": 6.405997693194925, + "grad_norm": 0.0, + "learning_rate": 4.028414082972141e-06, + "loss": 0.0697, + "step": 2777 + }, + { + "epoch": 6.408304498269896, + "grad_norm": 0.0, + "learning_rate": 4.021749626762869e-06, + "loss": 0.1271, + "step": 2778 + }, + { + "epoch": 6.410611303344868, + "grad_norm": 0.0, + "learning_rate": 4.015089299984666e-06, + "loss": 0.1097, + "step": 2779 + }, + { + "epoch": 6.412918108419839, + "grad_norm": 0.0, + "learning_rate": 4.00843310723809e-06, + "loss": 0.0493, + "step": 2780 + }, + { + "epoch": 6.41522491349481, + "grad_norm": 0.0, + "learning_rate": 4.001781053120863e-06, + "loss": 0.0921, + "step": 2781 + }, + { + "epoch": 6.417531718569781, + "grad_norm": 0.0, + "learning_rate": 3.995133142227843e-06, + "loss": 0.0508, + "step": 2782 + }, + { + "epoch": 6.419838523644752, + "grad_norm": 0.0, + "learning_rate": 3.988489379151016e-06, + "loss": 0.0959, + "step": 2783 + }, + { + "epoch": 6.422145328719723, + "grad_norm": 0.0, + "learning_rate": 3.981849768479516e-06, + "loss": 0.0906, + "step": 2784 + }, + { + "epoch": 6.424452133794694, + "grad_norm": 0.0, + "learning_rate": 3.975214314799607e-06, + "loss": 0.0485, + "step": 2785 + }, + { + "epoch": 6.426758938869666, + "grad_norm": 0.0, + "learning_rate": 3.9685830226946695e-06, + "loss": 0.0884, + "step": 2786 + }, + { + "epoch": 6.429065743944637, + "grad_norm": 0.0, + "learning_rate": 3.961955896745224e-06, + "loss": 0.0757, + "step": 2787 + }, + { + "epoch": 6.431372549019608, + "grad_norm": 0.0, + "learning_rate": 3.95533294152891e-06, + "loss": 0.119, + "step": 2788 + }, + { + "epoch": 6.433679354094579, + "grad_norm": 0.0, + "learning_rate": 3.9487141616204804e-06, + "loss": 0.112, + "step": 2789 + }, + { + "epoch": 6.43598615916955, + "grad_norm": 0.0, + "learning_rate": 3.942099561591802e-06, + "loss": 0.1185, + "step": 2790 + }, + { + "epoch": 6.438292964244521, + "grad_norm": 0.0, + "learning_rate": 3.9354891460118695e-06, + "loss": 0.0595, + "step": 2791 + }, + { + "epoch": 6.440599769319492, + "grad_norm": 0.0, + "learning_rate": 3.928882919446767e-06, + "loss": 0.0925, + "step": 2792 + }, + { + "epoch": 6.442906574394463, + "grad_norm": 0.0, + "learning_rate": 3.922280886459701e-06, + "loss": 0.0893, + "step": 2793 + }, + { + "epoch": 6.445213379469434, + "grad_norm": 0.0, + "learning_rate": 3.915683051610979e-06, + "loss": 0.0476, + "step": 2794 + }, + { + "epoch": 6.447520184544406, + "grad_norm": 0.0, + "learning_rate": 3.909089419457997e-06, + "loss": 0.0597, + "step": 2795 + }, + { + "epoch": 6.449826989619377, + "grad_norm": 0.0, + "learning_rate": 3.902499994555261e-06, + "loss": 0.0745, + "step": 2796 + }, + { + "epoch": 6.4521337946943484, + "grad_norm": 0.0, + "learning_rate": 3.89591478145437e-06, + "loss": 0.0743, + "step": 2797 + }, + { + "epoch": 6.4544405997693195, + "grad_norm": 0.0, + "learning_rate": 3.889333784704003e-06, + "loss": 0.1137, + "step": 2798 + }, + { + "epoch": 6.4567474048442905, + "grad_norm": 0.0, + "learning_rate": 3.882757008849936e-06, + "loss": 0.0642, + "step": 2799 + }, + { + "epoch": 6.459054209919262, + "grad_norm": 0.0, + "learning_rate": 3.876184458435031e-06, + "loss": 0.0735, + "step": 2800 + }, + { + "epoch": 6.461361014994233, + "grad_norm": 0.0, + "learning_rate": 3.8696161379992225e-06, + "loss": 0.0654, + "step": 2801 + }, + { + "epoch": 6.463667820069205, + "grad_norm": 0.0, + "learning_rate": 3.8630520520795275e-06, + "loss": 0.0759, + "step": 2802 + }, + { + "epoch": 6.465974625144176, + "grad_norm": 0.0, + "learning_rate": 3.856492205210043e-06, + "loss": 0.0688, + "step": 2803 + }, + { + "epoch": 6.468281430219147, + "grad_norm": 0.0, + "learning_rate": 3.849936601921928e-06, + "loss": 0.0633, + "step": 2804 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 0.0, + "learning_rate": 3.8433852467434175e-06, + "loss": 0.103, + "step": 2805 + }, + { + "epoch": 6.472895040369089, + "grad_norm": 0.0, + "learning_rate": 3.836838144199816e-06, + "loss": 0.1367, + "step": 2806 + }, + { + "epoch": 6.47520184544406, + "grad_norm": 0.0, + "learning_rate": 3.830295298813475e-06, + "loss": 0.0702, + "step": 2807 + }, + { + "epoch": 6.477508650519031, + "grad_norm": 0.0, + "learning_rate": 3.823756715103822e-06, + "loss": 0.1265, + "step": 2808 + }, + { + "epoch": 6.479815455594002, + "grad_norm": 0.0, + "learning_rate": 3.8172223975873355e-06, + "loss": 0.118, + "step": 2809 + }, + { + "epoch": 6.482122260668973, + "grad_norm": 0.0, + "learning_rate": 3.8106923507775396e-06, + "loss": 0.0743, + "step": 2810 + }, + { + "epoch": 6.484429065743945, + "grad_norm": 0.0, + "learning_rate": 3.804166579185018e-06, + "loss": 0.0862, + "step": 2811 + }, + { + "epoch": 6.486735870818916, + "grad_norm": 0.0, + "learning_rate": 3.797645087317401e-06, + "loss": 0.0808, + "step": 2812 + }, + { + "epoch": 6.489042675893887, + "grad_norm": 0.0, + "learning_rate": 3.7911278796793518e-06, + "loss": 0.0719, + "step": 2813 + }, + { + "epoch": 6.491349480968858, + "grad_norm": 0.0, + "learning_rate": 3.78461496077259e-06, + "loss": 0.0828, + "step": 2814 + }, + { + "epoch": 6.493656286043829, + "grad_norm": 0.0, + "learning_rate": 3.7781063350958592e-06, + "loss": 0.0877, + "step": 2815 + }, + { + "epoch": 6.4959630911188, + "grad_norm": 0.0, + "learning_rate": 3.771602007144948e-06, + "loss": 0.0477, + "step": 2816 + }, + { + "epoch": 6.498269896193771, + "grad_norm": 0.0, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.0818, + "step": 2817 + }, + { + "epoch": 6.500576701268743, + "grad_norm": 0.0, + "learning_rate": 3.758606262388859e-06, + "loss": 0.0762, + "step": 2818 + }, + { + "epoch": 6.502883506343714, + "grad_norm": 0.0, + "learning_rate": 3.7521148545604003e-06, + "loss": 0.101, + "step": 2819 + }, + { + "epoch": 6.505190311418685, + "grad_norm": 0.0, + "learning_rate": 3.7456277624111725e-06, + "loss": 0.0609, + "step": 2820 + }, + { + "epoch": 6.507497116493656, + "grad_norm": 0.0, + "learning_rate": 3.739144990422089e-06, + "loss": 0.0868, + "step": 2821 + }, + { + "epoch": 6.509803921568627, + "grad_norm": 0.0, + "learning_rate": 3.7326665430710798e-06, + "loss": 0.043, + "step": 2822 + }, + { + "epoch": 6.5121107266435985, + "grad_norm": 0.0, + "learning_rate": 3.726192424833075e-06, + "loss": 0.081, + "step": 2823 + }, + { + "epoch": 6.5144175317185695, + "grad_norm": 0.0, + "learning_rate": 3.719722640180029e-06, + "loss": 0.05, + "step": 2824 + }, + { + "epoch": 6.516724336793541, + "grad_norm": 0.0, + "learning_rate": 3.7132571935808924e-06, + "loss": 0.075, + "step": 2825 + }, + { + "epoch": 6.519031141868512, + "grad_norm": 0.0, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.046, + "step": 2826 + }, + { + "epoch": 6.521337946943484, + "grad_norm": 0.0, + "learning_rate": 3.7003393324051874e-06, + "loss": 0.0445, + "step": 2827 + }, + { + "epoch": 6.523644752018455, + "grad_norm": 0.0, + "learning_rate": 3.6938869267515343e-06, + "loss": 0.0892, + "step": 2828 + }, + { + "epoch": 6.525951557093426, + "grad_norm": 0.0, + "learning_rate": 3.687438876997612e-06, + "loss": 0.0905, + "step": 2829 + }, + { + "epoch": 6.528258362168397, + "grad_norm": 0.0, + "learning_rate": 3.680995187597365e-06, + "loss": 0.0718, + "step": 2830 + }, + { + "epoch": 6.530565167243368, + "grad_norm": 0.0, + "learning_rate": 3.6745558630017254e-06, + "loss": 0.0897, + "step": 2831 + }, + { + "epoch": 6.532871972318339, + "grad_norm": 0.0, + "learning_rate": 3.6681209076586035e-06, + "loss": 0.1317, + "step": 2832 + }, + { + "epoch": 6.53517877739331, + "grad_norm": 0.0, + "learning_rate": 3.661690326012897e-06, + "loss": 0.1293, + "step": 2833 + }, + { + "epoch": 6.537485582468282, + "grad_norm": 0.0, + "learning_rate": 3.6552641225064843e-06, + "loss": 0.1249, + "step": 2834 + }, + { + "epoch": 6.539792387543253, + "grad_norm": 0.0, + "learning_rate": 3.6488423015782128e-06, + "loss": 0.0769, + "step": 2835 + }, + { + "epoch": 6.542099192618224, + "grad_norm": 0.0, + "learning_rate": 3.6424248676639075e-06, + "loss": 0.0704, + "step": 2836 + }, + { + "epoch": 6.544405997693195, + "grad_norm": 0.0, + "learning_rate": 3.636011825196365e-06, + "loss": 0.0532, + "step": 2837 + }, + { + "epoch": 6.546712802768166, + "grad_norm": 0.0, + "learning_rate": 3.6296031786053455e-06, + "loss": 0.1, + "step": 2838 + }, + { + "epoch": 6.549019607843137, + "grad_norm": 0.0, + "learning_rate": 3.6231989323175665e-06, + "loss": 0.0657, + "step": 2839 + }, + { + "epoch": 6.551326412918108, + "grad_norm": 0.0, + "learning_rate": 3.6167990907567207e-06, + "loss": 0.0864, + "step": 2840 + }, + { + "epoch": 6.553633217993079, + "grad_norm": 0.0, + "learning_rate": 3.610403658343443e-06, + "loss": 0.0742, + "step": 2841 + }, + { + "epoch": 6.555940023068051, + "grad_norm": 0.0, + "learning_rate": 3.6040126394953334e-06, + "loss": 0.079, + "step": 2842 + }, + { + "epoch": 6.558246828143022, + "grad_norm": 0.0, + "learning_rate": 3.5976260386269423e-06, + "loss": 0.0699, + "step": 2843 + }, + { + "epoch": 6.560553633217993, + "grad_norm": 0.0, + "learning_rate": 3.591243860149759e-06, + "loss": 0.0793, + "step": 2844 + }, + { + "epoch": 6.562860438292964, + "grad_norm": 0.0, + "learning_rate": 3.5848661084722302e-06, + "loss": 0.0901, + "step": 2845 + }, + { + "epoch": 6.565167243367935, + "grad_norm": 0.0, + "learning_rate": 3.57849278799974e-06, + "loss": 0.1013, + "step": 2846 + }, + { + "epoch": 6.567474048442906, + "grad_norm": 0.0, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.0519, + "step": 2847 + }, + { + "epoch": 6.569780853517877, + "grad_norm": 0.0, + "learning_rate": 3.565759458276091e-06, + "loss": 0.0601, + "step": 2848 + }, + { + "epoch": 6.572087658592849, + "grad_norm": 0.0, + "learning_rate": 3.5593994578203893e-06, + "loss": 0.0346, + "step": 2849 + }, + { + "epoch": 6.57439446366782, + "grad_norm": 0.0, + "learning_rate": 3.5530439061606202e-06, + "loss": 0.0856, + "step": 2850 + }, + { + "epoch": 6.5767012687427915, + "grad_norm": 0.0, + "learning_rate": 3.546692807686829e-06, + "loss": 0.0592, + "step": 2851 + }, + { + "epoch": 6.5790080738177625, + "grad_norm": 0.0, + "learning_rate": 3.540346166785994e-06, + "loss": 0.0783, + "step": 2852 + }, + { + "epoch": 6.581314878892734, + "grad_norm": 0.0, + "learning_rate": 3.534003987842005e-06, + "loss": 0.0626, + "step": 2853 + }, + { + "epoch": 6.583621683967705, + "grad_norm": 0.0, + "learning_rate": 3.527666275235677e-06, + "loss": 0.0913, + "step": 2854 + }, + { + "epoch": 6.585928489042676, + "grad_norm": 0.0, + "learning_rate": 3.5213330333447347e-06, + "loss": 0.114, + "step": 2855 + }, + { + "epoch": 6.588235294117647, + "grad_norm": 0.0, + "learning_rate": 3.5150042665438233e-06, + "loss": 0.0742, + "step": 2856 + }, + { + "epoch": 6.590542099192618, + "grad_norm": 0.0, + "learning_rate": 3.5086799792044812e-06, + "loss": 0.1008, + "step": 2857 + }, + { + "epoch": 6.59284890426759, + "grad_norm": 0.0, + "learning_rate": 3.5023601756951665e-06, + "loss": 0.0827, + "step": 2858 + }, + { + "epoch": 6.595155709342561, + "grad_norm": 0.0, + "learning_rate": 3.496044860381238e-06, + "loss": 0.0735, + "step": 2859 + }, + { + "epoch": 6.597462514417532, + "grad_norm": 0.0, + "learning_rate": 3.4897340376249455e-06, + "loss": 0.0927, + "step": 2860 + }, + { + "epoch": 6.599769319492503, + "grad_norm": 0.0, + "learning_rate": 3.483427711785449e-06, + "loss": 0.1152, + "step": 2861 + }, + { + "epoch": 6.602076124567474, + "grad_norm": 0.0, + "learning_rate": 3.4771258872187917e-06, + "loss": 0.1049, + "step": 2862 + }, + { + "epoch": 6.604382929642445, + "grad_norm": 0.0, + "learning_rate": 3.4708285682779074e-06, + "loss": 0.17, + "step": 2863 + }, + { + "epoch": 6.606689734717416, + "grad_norm": 0.0, + "learning_rate": 3.464535759312625e-06, + "loss": 0.0628, + "step": 2864 + }, + { + "epoch": 6.608996539792388, + "grad_norm": 0.0, + "learning_rate": 3.4582474646696575e-06, + "loss": 0.099, + "step": 2865 + }, + { + "epoch": 6.611303344867359, + "grad_norm": 0.0, + "learning_rate": 3.451963688692591e-06, + "loss": 0.1256, + "step": 2866 + }, + { + "epoch": 6.61361014994233, + "grad_norm": 0.0, + "learning_rate": 3.4456844357218977e-06, + "loss": 0.085, + "step": 2867 + }, + { + "epoch": 6.615916955017301, + "grad_norm": 0.0, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.1082, + "step": 2868 + }, + { + "epoch": 6.618223760092272, + "grad_norm": 0.0, + "learning_rate": 3.433139516145896e-06, + "loss": 0.094, + "step": 2869 + }, + { + "epoch": 6.620530565167243, + "grad_norm": 0.0, + "learning_rate": 3.4268738582058913e-06, + "loss": 0.0579, + "step": 2870 + }, + { + "epoch": 6.622837370242214, + "grad_norm": 0.0, + "learning_rate": 3.4206127406028744e-06, + "loss": 0.069, + "step": 2871 + }, + { + "epoch": 6.625144175317185, + "grad_norm": 0.0, + "learning_rate": 3.414356167661658e-06, + "loss": 0.0987, + "step": 2872 + }, + { + "epoch": 6.627450980392156, + "grad_norm": 0.0, + "learning_rate": 3.4081041437039288e-06, + "loss": 0.1038, + "step": 2873 + }, + { + "epoch": 6.629757785467128, + "grad_norm": 0.0, + "learning_rate": 3.401856673048217e-06, + "loss": 0.0504, + "step": 2874 + }, + { + "epoch": 6.632064590542099, + "grad_norm": 0.0, + "learning_rate": 3.3956137600099248e-06, + "loss": 0.0484, + "step": 2875 + }, + { + "epoch": 6.6343713956170705, + "grad_norm": 0.0, + "learning_rate": 3.3893754089012886e-06, + "loss": 0.0852, + "step": 2876 + }, + { + "epoch": 6.6366782006920415, + "grad_norm": 0.0, + "learning_rate": 3.3831416240314085e-06, + "loss": 0.0883, + "step": 2877 + }, + { + "epoch": 6.638985005767013, + "grad_norm": 0.0, + "learning_rate": 3.3769124097062178e-06, + "loss": 0.1297, + "step": 2878 + }, + { + "epoch": 6.641291810841984, + "grad_norm": 0.0, + "learning_rate": 3.3706877702285033e-06, + "loss": 0.0628, + "step": 2879 + }, + { + "epoch": 6.643598615916955, + "grad_norm": 0.0, + "learning_rate": 3.3644677098978894e-06, + "loss": 0.0988, + "step": 2880 + }, + { + "epoch": 6.645905420991927, + "grad_norm": 0.0, + "learning_rate": 3.35825223301083e-06, + "loss": 0.0751, + "step": 2881 + }, + { + "epoch": 6.648212226066898, + "grad_norm": 0.0, + "learning_rate": 3.3520413438606215e-06, + "loss": 0.0957, + "step": 2882 + }, + { + "epoch": 6.650519031141869, + "grad_norm": 0.0, + "learning_rate": 3.3458350467373914e-06, + "loss": 0.0607, + "step": 2883 + }, + { + "epoch": 6.65282583621684, + "grad_norm": 0.0, + "learning_rate": 3.339633345928085e-06, + "loss": 0.1428, + "step": 2884 + }, + { + "epoch": 6.655132641291811, + "grad_norm": 0.0, + "learning_rate": 3.333436245716488e-06, + "loss": 0.0921, + "step": 2885 + }, + { + "epoch": 6.657439446366782, + "grad_norm": 0.0, + "learning_rate": 3.3272437503831945e-06, + "loss": 0.0353, + "step": 2886 + }, + { + "epoch": 6.659746251441753, + "grad_norm": 0.0, + "learning_rate": 3.3210558642056277e-06, + "loss": 0.0859, + "step": 2887 + }, + { + "epoch": 6.662053056516724, + "grad_norm": 0.0, + "learning_rate": 3.3148725914580183e-06, + "loss": 0.0779, + "step": 2888 + }, + { + "epoch": 6.664359861591695, + "grad_norm": 0.0, + "learning_rate": 3.308693936411421e-06, + "loss": 0.0946, + "step": 2889 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.0, + "learning_rate": 3.3025199033336887e-06, + "loss": 0.089, + "step": 2890 + }, + { + "epoch": 6.668973471741638, + "grad_norm": 0.0, + "learning_rate": 3.29635049648949e-06, + "loss": 0.0835, + "step": 2891 + }, + { + "epoch": 6.671280276816609, + "grad_norm": 0.0, + "learning_rate": 3.290185720140301e-06, + "loss": 0.1067, + "step": 2892 + }, + { + "epoch": 6.67358708189158, + "grad_norm": 0.0, + "learning_rate": 3.284025578544385e-06, + "loss": 0.11, + "step": 2893 + }, + { + "epoch": 6.675893886966551, + "grad_norm": 0.0, + "learning_rate": 3.2778700759568194e-06, + "loss": 0.0726, + "step": 2894 + }, + { + "epoch": 6.678200692041522, + "grad_norm": 0.0, + "learning_rate": 3.2717192166294685e-06, + "loss": 0.0831, + "step": 2895 + }, + { + "epoch": 6.680507497116493, + "grad_norm": 0.0, + "learning_rate": 3.265573004810997e-06, + "loss": 0.089, + "step": 2896 + }, + { + "epoch": 6.682814302191465, + "grad_norm": 0.0, + "learning_rate": 3.2594314447468457e-06, + "loss": 0.0518, + "step": 2897 + }, + { + "epoch": 6.685121107266436, + "grad_norm": 0.0, + "learning_rate": 3.2532945406792573e-06, + "loss": 0.0802, + "step": 2898 + }, + { + "epoch": 6.687427912341407, + "grad_norm": 0.0, + "learning_rate": 3.2471622968472494e-06, + "loss": 0.1096, + "step": 2899 + }, + { + "epoch": 6.689734717416378, + "grad_norm": 0.0, + "learning_rate": 3.2410347174866188e-06, + "loss": 0.0426, + "step": 2900 + }, + { + "epoch": 6.692041522491349, + "grad_norm": 0.0, + "learning_rate": 3.234911806829948e-06, + "loss": 0.0535, + "step": 2901 + }, + { + "epoch": 6.6943483275663205, + "grad_norm": 0.0, + "learning_rate": 3.228793569106594e-06, + "loss": 0.0568, + "step": 2902 + }, + { + "epoch": 6.6966551326412915, + "grad_norm": 0.0, + "learning_rate": 3.222680008542678e-06, + "loss": 0.0826, + "step": 2903 + }, + { + "epoch": 6.698961937716263, + "grad_norm": 0.0, + "learning_rate": 3.216571129361097e-06, + "loss": 0.0656, + "step": 2904 + }, + { + "epoch": 6.7012687427912345, + "grad_norm": 0.0, + "learning_rate": 3.2104669357815167e-06, + "loss": 0.054, + "step": 2905 + }, + { + "epoch": 6.703575547866206, + "grad_norm": 0.0, + "learning_rate": 3.2043674320203565e-06, + "loss": 0.0608, + "step": 2906 + }, + { + "epoch": 6.705882352941177, + "grad_norm": 0.0, + "learning_rate": 3.1982726222908046e-06, + "loss": 0.0925, + "step": 2907 + }, + { + "epoch": 6.708189158016148, + "grad_norm": 0.0, + "learning_rate": 3.1921825108028093e-06, + "loss": 0.0519, + "step": 2908 + }, + { + "epoch": 6.710495963091119, + "grad_norm": 0.0, + "learning_rate": 3.1860971017630605e-06, + "loss": 0.1546, + "step": 2909 + }, + { + "epoch": 6.71280276816609, + "grad_norm": 0.0, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.1249, + "step": 2910 + }, + { + "epoch": 6.715109573241061, + "grad_norm": 0.0, + "learning_rate": 3.1739404078388713e-06, + "loss": 0.0674, + "step": 2911 + }, + { + "epoch": 6.717416378316033, + "grad_norm": 0.0, + "learning_rate": 3.1678691313515688e-06, + "loss": 0.0841, + "step": 2912 + }, + { + "epoch": 6.719723183391004, + "grad_norm": 0.0, + "learning_rate": 3.161802574106799e-06, + "loss": 0.0628, + "step": 2913 + }, + { + "epoch": 6.722029988465975, + "grad_norm": 0.0, + "learning_rate": 3.1557407402949937e-06, + "loss": 0.0346, + "step": 2914 + }, + { + "epoch": 6.724336793540946, + "grad_norm": 0.0, + "learning_rate": 3.149683634103312e-06, + "loss": 0.0637, + "step": 2915 + }, + { + "epoch": 6.726643598615917, + "grad_norm": 0.0, + "learning_rate": 3.143631259715658e-06, + "loss": 0.0544, + "step": 2916 + }, + { + "epoch": 6.728950403690888, + "grad_norm": 0.0, + "learning_rate": 3.1375836213126653e-06, + "loss": 0.0388, + "step": 2917 + }, + { + "epoch": 6.731257208765859, + "grad_norm": 0.0, + "learning_rate": 3.13154072307169e-06, + "loss": 0.0701, + "step": 2918 + }, + { + "epoch": 6.73356401384083, + "grad_norm": 0.0, + "learning_rate": 3.1255025691668184e-06, + "loss": 0.0979, + "step": 2919 + }, + { + "epoch": 6.735870818915801, + "grad_norm": 0.0, + "learning_rate": 3.1194691637688645e-06, + "loss": 0.0714, + "step": 2920 + }, + { + "epoch": 6.738177623990773, + "grad_norm": 0.0, + "learning_rate": 3.1134405110453512e-06, + "loss": 0.0434, + "step": 2921 + }, + { + "epoch": 6.740484429065744, + "grad_norm": 0.0, + "learning_rate": 3.10741661516053e-06, + "loss": 0.1162, + "step": 2922 + }, + { + "epoch": 6.742791234140715, + "grad_norm": 0.0, + "learning_rate": 3.101397480275359e-06, + "loss": 0.0849, + "step": 2923 + }, + { + "epoch": 6.745098039215686, + "grad_norm": 0.0, + "learning_rate": 3.0953831105475064e-06, + "loss": 0.0431, + "step": 2924 + }, + { + "epoch": 6.747404844290657, + "grad_norm": 0.0, + "learning_rate": 3.089373510131354e-06, + "loss": 0.0894, + "step": 2925 + }, + { + "epoch": 6.749711649365628, + "grad_norm": 0.0, + "learning_rate": 3.083368683177993e-06, + "loss": 0.1019, + "step": 2926 + }, + { + "epoch": 6.7520184544405994, + "grad_norm": 0.0, + "learning_rate": 3.077368633835205e-06, + "loss": 0.0574, + "step": 2927 + }, + { + "epoch": 6.754325259515571, + "grad_norm": 0.0, + "learning_rate": 3.071373366247482e-06, + "loss": 0.1271, + "step": 2928 + }, + { + "epoch": 6.756632064590542, + "grad_norm": 0.0, + "learning_rate": 3.065382884556012e-06, + "loss": 0.0919, + "step": 2929 + }, + { + "epoch": 6.7589388696655135, + "grad_norm": 0.0, + "learning_rate": 3.0593971928986688e-06, + "loss": 0.0945, + "step": 2930 + }, + { + "epoch": 6.7612456747404845, + "grad_norm": 0.0, + "learning_rate": 3.0534162954100264e-06, + "loss": 0.1158, + "step": 2931 + }, + { + "epoch": 6.763552479815456, + "grad_norm": 0.0, + "learning_rate": 3.0474401962213483e-06, + "loss": 0.11, + "step": 2932 + }, + { + "epoch": 6.765859284890427, + "grad_norm": 0.0, + "learning_rate": 3.0414688994605724e-06, + "loss": 0.0829, + "step": 2933 + }, + { + "epoch": 6.768166089965398, + "grad_norm": 0.0, + "learning_rate": 3.0355024092523334e-06, + "loss": 0.0507, + "step": 2934 + }, + { + "epoch": 6.770472895040369, + "grad_norm": 0.0, + "learning_rate": 3.0295407297179326e-06, + "loss": 0.1086, + "step": 2935 + }, + { + "epoch": 6.77277970011534, + "grad_norm": 0.0, + "learning_rate": 3.0235838649753615e-06, + "loss": 0.057, + "step": 2936 + }, + { + "epoch": 6.775086505190312, + "grad_norm": 0.0, + "learning_rate": 3.017631819139273e-06, + "loss": 0.0772, + "step": 2937 + }, + { + "epoch": 6.777393310265283, + "grad_norm": 0.0, + "learning_rate": 3.0116845963209996e-06, + "loss": 0.1133, + "step": 2938 + }, + { + "epoch": 6.779700115340254, + "grad_norm": 0.0, + "learning_rate": 3.005742200628545e-06, + "loss": 0.055, + "step": 2939 + }, + { + "epoch": 6.782006920415225, + "grad_norm": 0.0, + "learning_rate": 2.999804636166567e-06, + "loss": 0.0833, + "step": 2940 + }, + { + "epoch": 6.784313725490196, + "grad_norm": 0.0, + "learning_rate": 2.9938719070363954e-06, + "loss": 0.0847, + "step": 2941 + }, + { + "epoch": 6.786620530565167, + "grad_norm": 0.0, + "learning_rate": 2.987944017336023e-06, + "loss": 0.075, + "step": 2942 + }, + { + "epoch": 6.788927335640138, + "grad_norm": 0.0, + "learning_rate": 2.9820209711600858e-06, + "loss": 0.0884, + "step": 2943 + }, + { + "epoch": 6.79123414071511, + "grad_norm": 0.0, + "learning_rate": 2.9761027725998883e-06, + "loss": 0.0939, + "step": 2944 + }, + { + "epoch": 6.793540945790081, + "grad_norm": 0.0, + "learning_rate": 2.970189425743383e-06, + "loss": 0.0841, + "step": 2945 + }, + { + "epoch": 6.795847750865052, + "grad_norm": 0.0, + "learning_rate": 2.9642809346751677e-06, + "loss": 0.095, + "step": 2946 + }, + { + "epoch": 6.798154555940023, + "grad_norm": 0.0, + "learning_rate": 2.958377303476483e-06, + "loss": 0.0691, + "step": 2947 + }, + { + "epoch": 6.800461361014994, + "grad_norm": 0.0, + "learning_rate": 2.952478536225224e-06, + "loss": 0.0933, + "step": 2948 + }, + { + "epoch": 6.802768166089965, + "grad_norm": 0.0, + "learning_rate": 2.9465846369959126e-06, + "loss": 0.1027, + "step": 2949 + }, + { + "epoch": 6.805074971164936, + "grad_norm": 0.0, + "learning_rate": 2.9406956098597208e-06, + "loss": 0.1112, + "step": 2950 + }, + { + "epoch": 6.807381776239907, + "grad_norm": 0.0, + "learning_rate": 2.934811458884449e-06, + "loss": 0.083, + "step": 2951 + }, + { + "epoch": 6.809688581314878, + "grad_norm": 0.0, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.106, + "step": 2952 + }, + { + "epoch": 6.81199538638985, + "grad_norm": 0.0, + "learning_rate": 2.9230578016710154e-06, + "loss": 0.0947, + "step": 2953 + }, + { + "epoch": 6.814302191464821, + "grad_norm": 0.0, + "learning_rate": 2.917188303551608e-06, + "loss": 0.077, + "step": 2954 + }, + { + "epoch": 6.8166089965397925, + "grad_norm": 0.0, + "learning_rate": 2.91132369783061e-06, + "loss": 0.0524, + "step": 2955 + }, + { + "epoch": 6.8189158016147635, + "grad_norm": 0.0, + "learning_rate": 2.905463988558955e-06, + "loss": 0.1328, + "step": 2956 + }, + { + "epoch": 6.821222606689735, + "grad_norm": 0.0, + "learning_rate": 2.8996091797841976e-06, + "loss": 0.0388, + "step": 2957 + }, + { + "epoch": 6.823529411764706, + "grad_norm": 0.0, + "learning_rate": 2.893759275550494e-06, + "loss": 0.0761, + "step": 2958 + }, + { + "epoch": 6.825836216839677, + "grad_norm": 0.0, + "learning_rate": 2.8879142798986293e-06, + "loss": 0.0588, + "step": 2959 + }, + { + "epoch": 6.828143021914649, + "grad_norm": 0.0, + "learning_rate": 2.882074196865986e-06, + "loss": 0.0752, + "step": 2960 + }, + { + "epoch": 6.83044982698962, + "grad_norm": 0.0, + "learning_rate": 2.876239030486554e-06, + "loss": 0.098, + "step": 2961 + }, + { + "epoch": 6.832756632064591, + "grad_norm": 0.0, + "learning_rate": 2.8704087847909333e-06, + "loss": 0.0587, + "step": 2962 + }, + { + "epoch": 6.835063437139562, + "grad_norm": 0.0, + "learning_rate": 2.8645834638063253e-06, + "loss": 0.0935, + "step": 2963 + }, + { + "epoch": 6.837370242214533, + "grad_norm": 0.0, + "learning_rate": 2.8587630715565185e-06, + "loss": 0.0797, + "step": 2964 + }, + { + "epoch": 6.839677047289504, + "grad_norm": 0.0, + "learning_rate": 2.8529476120619102e-06, + "loss": 0.1073, + "step": 2965 + }, + { + "epoch": 6.841983852364475, + "grad_norm": 0.0, + "learning_rate": 2.8471370893394866e-06, + "loss": 0.074, + "step": 2966 + }, + { + "epoch": 6.844290657439446, + "grad_norm": 0.0, + "learning_rate": 2.8413315074028157e-06, + "loss": 0.0967, + "step": 2967 + }, + { + "epoch": 6.846597462514418, + "grad_norm": 0.0, + "learning_rate": 2.8355308702620633e-06, + "loss": 0.04, + "step": 2968 + }, + { + "epoch": 6.848904267589389, + "grad_norm": 0.0, + "learning_rate": 2.829735181923978e-06, + "loss": 0.0665, + "step": 2969 + }, + { + "epoch": 6.85121107266436, + "grad_norm": 0.0, + "learning_rate": 2.823944446391881e-06, + "loss": 0.0812, + "step": 2970 + }, + { + "epoch": 6.853517877739331, + "grad_norm": 0.0, + "learning_rate": 2.818158667665686e-06, + "loss": 0.0836, + "step": 2971 + }, + { + "epoch": 6.855824682814302, + "grad_norm": 0.0, + "learning_rate": 2.8123778497418687e-06, + "loss": 0.073, + "step": 2972 + }, + { + "epoch": 6.858131487889273, + "grad_norm": 0.0, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.0928, + "step": 2973 + }, + { + "epoch": 6.860438292964244, + "grad_norm": 0.0, + "learning_rate": 2.800831112270175e-06, + "loss": 0.0918, + "step": 2974 + }, + { + "epoch": 6.862745098039216, + "grad_norm": 0.0, + "learning_rate": 2.795065200698116e-06, + "loss": 0.1027, + "step": 2975 + }, + { + "epoch": 6.865051903114187, + "grad_norm": 0.0, + "learning_rate": 2.7893042658800793e-06, + "loss": 0.0605, + "step": 2976 + }, + { + "epoch": 6.867358708189158, + "grad_norm": 0.0, + "learning_rate": 2.783548311795379e-06, + "loss": 0.0793, + "step": 2977 + }, + { + "epoch": 6.869665513264129, + "grad_norm": 0.0, + "learning_rate": 2.777797342419901e-06, + "loss": 0.0954, + "step": 2978 + }, + { + "epoch": 6.8719723183391, + "grad_norm": 0.0, + "learning_rate": 2.7720513617260857e-06, + "loss": 0.082, + "step": 2979 + }, + { + "epoch": 6.874279123414071, + "grad_norm": 0.0, + "learning_rate": 2.76631037368292e-06, + "loss": 0.0648, + "step": 2980 + }, + { + "epoch": 6.8765859284890425, + "grad_norm": 0.0, + "learning_rate": 2.7605743822559504e-06, + "loss": 0.1019, + "step": 2981 + }, + { + "epoch": 6.8788927335640135, + "grad_norm": 0.0, + "learning_rate": 2.7548433914072736e-06, + "loss": 0.0661, + "step": 2982 + }, + { + "epoch": 6.881199538638985, + "grad_norm": 0.0, + "learning_rate": 2.7491174050955237e-06, + "loss": 0.1199, + "step": 2983 + }, + { + "epoch": 6.8835063437139565, + "grad_norm": 0.0, + "learning_rate": 2.7433964272758805e-06, + "loss": 0.0877, + "step": 2984 + }, + { + "epoch": 6.885813148788928, + "grad_norm": 0.0, + "learning_rate": 2.7376804619000706e-06, + "loss": 0.1212, + "step": 2985 + }, + { + "epoch": 6.888119953863899, + "grad_norm": 0.0, + "learning_rate": 2.7319695129163493e-06, + "loss": 0.0545, + "step": 2986 + }, + { + "epoch": 6.89042675893887, + "grad_norm": 0.0, + "learning_rate": 2.726263584269513e-06, + "loss": 0.0761, + "step": 2987 + }, + { + "epoch": 6.892733564013841, + "grad_norm": 0.0, + "learning_rate": 2.720562679900892e-06, + "loss": 0.063, + "step": 2988 + }, + { + "epoch": 6.895040369088812, + "grad_norm": 0.0, + "learning_rate": 2.714866803748337e-06, + "loss": 0.0811, + "step": 2989 + }, + { + "epoch": 6.897347174163783, + "grad_norm": 0.0, + "learning_rate": 2.709175959746233e-06, + "loss": 0.0705, + "step": 2990 + }, + { + "epoch": 6.899653979238755, + "grad_norm": 0.0, + "learning_rate": 2.703490151825492e-06, + "loss": 0.045, + "step": 2991 + }, + { + "epoch": 6.901960784313726, + "grad_norm": 0.0, + "learning_rate": 2.6978093839135365e-06, + "loss": 0.0717, + "step": 2992 + }, + { + "epoch": 6.904267589388697, + "grad_norm": 0.0, + "learning_rate": 2.6921336599343153e-06, + "loss": 0.0887, + "step": 2993 + }, + { + "epoch": 6.906574394463668, + "grad_norm": 0.0, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.1213, + "step": 2994 + }, + { + "epoch": 6.908881199538639, + "grad_norm": 0.0, + "learning_rate": 2.6807973594524508e-06, + "loss": 0.1169, + "step": 2995 + }, + { + "epoch": 6.91118800461361, + "grad_norm": 0.0, + "learning_rate": 2.675136790780265e-06, + "loss": 0.1588, + "step": 2996 + }, + { + "epoch": 6.913494809688581, + "grad_norm": 0.0, + "learning_rate": 2.669481281701739e-06, + "loss": 0.1299, + "step": 2997 + }, + { + "epoch": 6.915801614763552, + "grad_norm": 0.0, + "learning_rate": 2.6638308361233677e-06, + "loss": 0.1275, + "step": 2998 + }, + { + "epoch": 6.918108419838523, + "grad_norm": 0.0, + "learning_rate": 2.6581854579481546e-06, + "loss": 0.1012, + "step": 2999 + }, + { + "epoch": 6.920415224913495, + "grad_norm": 0.0, + "learning_rate": 2.652545151075606e-06, + "loss": 0.0788, + "step": 3000 + }, + { + "epoch": 6.922722029988466, + "grad_norm": 0.0, + "learning_rate": 2.6469099194017144e-06, + "loss": 0.0714, + "step": 3001 + }, + { + "epoch": 6.925028835063437, + "grad_norm": 0.0, + "learning_rate": 2.641279766818977e-06, + "loss": 0.0648, + "step": 3002 + }, + { + "epoch": 6.927335640138408, + "grad_norm": 0.0, + "learning_rate": 2.635654697216382e-06, + "loss": 0.1825, + "step": 3003 + }, + { + "epoch": 6.929642445213379, + "grad_norm": 0.0, + "learning_rate": 2.630034714479397e-06, + "loss": 0.1082, + "step": 3004 + }, + { + "epoch": 6.93194925028835, + "grad_norm": 0.0, + "learning_rate": 2.624419822489985e-06, + "loss": 0.1228, + "step": 3005 + }, + { + "epoch": 6.9342560553633215, + "grad_norm": 0.0, + "learning_rate": 2.6188100251265947e-06, + "loss": 0.061, + "step": 3006 + }, + { + "epoch": 6.936562860438293, + "grad_norm": 0.0, + "learning_rate": 2.6132053262641467e-06, + "loss": 0.0675, + "step": 3007 + }, + { + "epoch": 6.9388696655132645, + "grad_norm": 0.0, + "learning_rate": 2.607605729774041e-06, + "loss": 0.1114, + "step": 3008 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 0.0, + "learning_rate": 2.6020112395241627e-06, + "loss": 0.0916, + "step": 3009 + }, + { + "epoch": 6.9434832756632066, + "grad_norm": 0.0, + "learning_rate": 2.596421859378858e-06, + "loss": 0.0606, + "step": 3010 + }, + { + "epoch": 6.945790080738178, + "grad_norm": 0.0, + "learning_rate": 2.5908375931989517e-06, + "loss": 0.0426, + "step": 3011 + }, + { + "epoch": 6.948096885813149, + "grad_norm": 0.0, + "learning_rate": 2.5852584448417327e-06, + "loss": 0.0635, + "step": 3012 + }, + { + "epoch": 6.95040369088812, + "grad_norm": 0.0, + "learning_rate": 2.5796844181609583e-06, + "loss": 0.0724, + "step": 3013 + }, + { + "epoch": 6.952710495963091, + "grad_norm": 0.0, + "learning_rate": 2.57411551700684e-06, + "loss": 0.1065, + "step": 3014 + }, + { + "epoch": 6.955017301038062, + "grad_norm": 0.0, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.0856, + "step": 3015 + }, + { + "epoch": 6.957324106113034, + "grad_norm": 0.0, + "learning_rate": 2.562993106661744e-06, + "loss": 0.0739, + "step": 3016 + }, + { + "epoch": 6.959630911188005, + "grad_norm": 0.0, + "learning_rate": 2.5574396051534835e-06, + "loss": 0.096, + "step": 3017 + }, + { + "epoch": 6.961937716262976, + "grad_norm": 0.0, + "learning_rate": 2.55189124453732e-06, + "loss": 0.0694, + "step": 3018 + }, + { + "epoch": 6.964244521337947, + "grad_norm": 0.0, + "learning_rate": 2.5463480286457367e-06, + "loss": 0.0831, + "step": 3019 + }, + { + "epoch": 6.966551326412918, + "grad_norm": 0.0, + "learning_rate": 2.540809961307672e-06, + "loss": 0.1211, + "step": 3020 + }, + { + "epoch": 6.968858131487889, + "grad_norm": 0.0, + "learning_rate": 2.5352770463484986e-06, + "loss": 0.0619, + "step": 3021 + }, + { + "epoch": 6.97116493656286, + "grad_norm": 0.0, + "learning_rate": 2.529749287590042e-06, + "loss": 0.0844, + "step": 3022 + }, + { + "epoch": 6.973471741637832, + "grad_norm": 0.0, + "learning_rate": 2.524226688850554e-06, + "loss": 0.0669, + "step": 3023 + }, + { + "epoch": 6.975778546712803, + "grad_norm": 0.0, + "learning_rate": 2.51870925394473e-06, + "loss": 0.0934, + "step": 3024 + }, + { + "epoch": 6.978085351787774, + "grad_norm": 0.0, + "learning_rate": 2.513196986683699e-06, + "loss": 0.1004, + "step": 3025 + }, + { + "epoch": 6.980392156862745, + "grad_norm": 0.0, + "learning_rate": 2.5076898908750127e-06, + "loss": 0.0442, + "step": 3026 + }, + { + "epoch": 6.982698961937716, + "grad_norm": 0.0, + "learning_rate": 2.502187970322657e-06, + "loss": 0.0459, + "step": 3027 + }, + { + "epoch": 6.985005767012687, + "grad_norm": 0.0, + "learning_rate": 2.4966912288270473e-06, + "loss": 0.0687, + "step": 3028 + }, + { + "epoch": 6.987312572087658, + "grad_norm": 0.0, + "learning_rate": 2.4911996701850083e-06, + "loss": 0.0595, + "step": 3029 + }, + { + "epoch": 6.989619377162629, + "grad_norm": 0.0, + "learning_rate": 2.485713298189798e-06, + "loss": 0.0909, + "step": 3030 + }, + { + "epoch": 6.9919261822376, + "grad_norm": 0.0, + "learning_rate": 2.4802321166310815e-06, + "loss": 0.0885, + "step": 3031 + }, + { + "epoch": 6.994232987312572, + "grad_norm": 0.0, + "learning_rate": 2.4747561292949496e-06, + "loss": 0.0457, + "step": 3032 + }, + { + "epoch": 6.996539792387543, + "grad_norm": 0.0, + "learning_rate": 2.469285339963892e-06, + "loss": 0.0921, + "step": 3033 + }, + { + "epoch": 6.9988465974625145, + "grad_norm": 0.0, + "learning_rate": 2.4638197524168208e-06, + "loss": 0.0941, + "step": 3034 + }, + { + "epoch": 7.0011534025374855, + "grad_norm": 0.0, + "learning_rate": 2.458359370429043e-06, + "loss": 0.0697, + "step": 3035 + }, + { + "epoch": 7.003460207612457, + "grad_norm": 0.0, + "learning_rate": 2.45290419777228e-06, + "loss": 0.0689, + "step": 3036 + }, + { + "epoch": 7.005767012687428, + "grad_norm": 0.0, + "learning_rate": 2.447454238214654e-06, + "loss": 0.0591, + "step": 3037 + }, + { + "epoch": 7.008073817762399, + "grad_norm": 0.0, + "learning_rate": 2.4420094955206753e-06, + "loss": 0.0372, + "step": 3038 + }, + { + "epoch": 7.010380622837371, + "grad_norm": 0.0, + "learning_rate": 2.436569973451264e-06, + "loss": 0.0707, + "step": 3039 + }, + { + "epoch": 7.012687427912342, + "grad_norm": 0.0, + "learning_rate": 2.4311356757637305e-06, + "loss": 0.0448, + "step": 3040 + }, + { + "epoch": 7.014994232987313, + "grad_norm": 0.0, + "learning_rate": 2.4257066062117675e-06, + "loss": 0.0428, + "step": 3041 + }, + { + "epoch": 7.017301038062284, + "grad_norm": 0.0, + "learning_rate": 2.420282768545469e-06, + "loss": 0.0336, + "step": 3042 + }, + { + "epoch": 7.019607843137255, + "grad_norm": 0.0, + "learning_rate": 2.4148641665113116e-06, + "loss": 0.0656, + "step": 3043 + }, + { + "epoch": 7.021914648212226, + "grad_norm": 0.0, + "learning_rate": 2.409450803852149e-06, + "loss": 0.0699, + "step": 3044 + }, + { + "epoch": 7.024221453287197, + "grad_norm": 0.0, + "learning_rate": 2.4040426843072206e-06, + "loss": 0.047, + "step": 3045 + }, + { + "epoch": 7.026528258362168, + "grad_norm": 0.0, + "learning_rate": 2.3986398116121468e-06, + "loss": 0.0534, + "step": 3046 + }, + { + "epoch": 7.02883506343714, + "grad_norm": 0.0, + "learning_rate": 2.3932421894989167e-06, + "loss": 0.0303, + "step": 3047 + }, + { + "epoch": 7.031141868512111, + "grad_norm": 0.0, + "learning_rate": 2.387849821695899e-06, + "loss": 0.0537, + "step": 3048 + }, + { + "epoch": 7.033448673587082, + "grad_norm": 0.0, + "learning_rate": 2.3824627119278344e-06, + "loss": 0.0612, + "step": 3049 + }, + { + "epoch": 7.035755478662053, + "grad_norm": 0.0, + "learning_rate": 2.3770808639158216e-06, + "loss": 0.0809, + "step": 3050 + }, + { + "epoch": 7.038062283737024, + "grad_norm": 0.0, + "learning_rate": 2.371704281377335e-06, + "loss": 0.0446, + "step": 3051 + }, + { + "epoch": 7.040369088811995, + "grad_norm": 0.0, + "learning_rate": 2.366332968026207e-06, + "loss": 0.0575, + "step": 3052 + }, + { + "epoch": 7.042675893886966, + "grad_norm": 0.0, + "learning_rate": 2.3609669275726353e-06, + "loss": 0.0539, + "step": 3053 + }, + { + "epoch": 7.044982698961937, + "grad_norm": 0.0, + "learning_rate": 2.3556061637231653e-06, + "loss": 0.0575, + "step": 3054 + }, + { + "epoch": 7.047289504036909, + "grad_norm": 0.0, + "learning_rate": 2.3502506801807102e-06, + "loss": 0.0848, + "step": 3055 + }, + { + "epoch": 7.04959630911188, + "grad_norm": 0.0, + "learning_rate": 2.3449004806445263e-06, + "loss": 0.0552, + "step": 3056 + }, + { + "epoch": 7.051903114186851, + "grad_norm": 0.0, + "learning_rate": 2.339555568810221e-06, + "loss": 0.0552, + "step": 3057 + }, + { + "epoch": 7.054209919261822, + "grad_norm": 0.0, + "learning_rate": 2.3342159483697535e-06, + "loss": 0.0601, + "step": 3058 + }, + { + "epoch": 7.0565167243367934, + "grad_norm": 0.0, + "learning_rate": 2.328881623011431e-06, + "loss": 0.0285, + "step": 3059 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 0.0, + "learning_rate": 2.323552596419889e-06, + "loss": 0.0759, + "step": 3060 + }, + { + "epoch": 7.0611303344867355, + "grad_norm": 0.0, + "learning_rate": 2.318228872276118e-06, + "loss": 0.0305, + "step": 3061 + }, + { + "epoch": 7.063437139561707, + "grad_norm": 0.0, + "learning_rate": 2.3129104542574433e-06, + "loss": 0.028, + "step": 3062 + }, + { + "epoch": 7.0657439446366785, + "grad_norm": 0.0, + "learning_rate": 2.3075973460375134e-06, + "loss": 0.0483, + "step": 3063 + }, + { + "epoch": 7.06805074971165, + "grad_norm": 0.0, + "learning_rate": 2.3022895512863207e-06, + "loss": 0.0405, + "step": 3064 + }, + { + "epoch": 7.070357554786621, + "grad_norm": 0.0, + "learning_rate": 2.296987073670189e-06, + "loss": 0.0514, + "step": 3065 + }, + { + "epoch": 7.072664359861592, + "grad_norm": 0.0, + "learning_rate": 2.291689916851758e-06, + "loss": 0.085, + "step": 3066 + }, + { + "epoch": 7.074971164936563, + "grad_norm": 0.0, + "learning_rate": 2.2863980844900036e-06, + "loss": 0.0593, + "step": 3067 + }, + { + "epoch": 7.077277970011534, + "grad_norm": 0.0, + "learning_rate": 2.2811115802402174e-06, + "loss": 0.0387, + "step": 3068 + }, + { + "epoch": 7.079584775086505, + "grad_norm": 0.0, + "learning_rate": 2.275830407754006e-06, + "loss": 0.0554, + "step": 3069 + }, + { + "epoch": 7.081891580161477, + "grad_norm": 0.0, + "learning_rate": 2.2705545706793065e-06, + "loss": 0.0357, + "step": 3070 + }, + { + "epoch": 7.084198385236448, + "grad_norm": 0.0, + "learning_rate": 2.265284072660362e-06, + "loss": 0.0633, + "step": 3071 + }, + { + "epoch": 7.086505190311419, + "grad_norm": 0.0, + "learning_rate": 2.2600189173377263e-06, + "loss": 0.0452, + "step": 3072 + }, + { + "epoch": 7.08881199538639, + "grad_norm": 0.0, + "learning_rate": 2.254759108348267e-06, + "loss": 0.0699, + "step": 3073 + }, + { + "epoch": 7.091118800461361, + "grad_norm": 0.0, + "learning_rate": 2.2495046493251603e-06, + "loss": 0.0648, + "step": 3074 + }, + { + "epoch": 7.093425605536332, + "grad_norm": 0.0, + "learning_rate": 2.2442555438978774e-06, + "loss": 0.0547, + "step": 3075 + }, + { + "epoch": 7.095732410611303, + "grad_norm": 0.0, + "learning_rate": 2.239011795692203e-06, + "loss": 0.0244, + "step": 3076 + }, + { + "epoch": 7.098039215686274, + "grad_norm": 0.0, + "learning_rate": 2.2337734083302164e-06, + "loss": 0.0641, + "step": 3077 + }, + { + "epoch": 7.100346020761246, + "grad_norm": 0.0, + "learning_rate": 2.2285403854302912e-06, + "loss": 0.0633, + "step": 3078 + }, + { + "epoch": 7.102652825836217, + "grad_norm": 0.0, + "learning_rate": 2.2233127306071013e-06, + "loss": 0.0708, + "step": 3079 + }, + { + "epoch": 7.104959630911188, + "grad_norm": 0.0, + "learning_rate": 2.2180904474716057e-06, + "loss": 0.0416, + "step": 3080 + }, + { + "epoch": 7.107266435986159, + "grad_norm": 0.0, + "learning_rate": 2.2128735396310606e-06, + "loss": 0.0312, + "step": 3081 + }, + { + "epoch": 7.10957324106113, + "grad_norm": 0.0, + "learning_rate": 2.207662010689002e-06, + "loss": 0.0809, + "step": 3082 + }, + { + "epoch": 7.111880046136101, + "grad_norm": 0.0, + "learning_rate": 2.202455864245259e-06, + "loss": 0.1034, + "step": 3083 + }, + { + "epoch": 7.114186851211072, + "grad_norm": 0.0, + "learning_rate": 2.1972551038959313e-06, + "loss": 0.0825, + "step": 3084 + }, + { + "epoch": 7.1164936562860435, + "grad_norm": 0.0, + "learning_rate": 2.192059733233408e-06, + "loss": 0.062, + "step": 3085 + }, + { + "epoch": 7.118800461361015, + "grad_norm": 0.0, + "learning_rate": 2.1868697558463547e-06, + "loss": 0.0636, + "step": 3086 + }, + { + "epoch": 7.1211072664359865, + "grad_norm": 0.0, + "learning_rate": 2.1816851753197023e-06, + "loss": 0.06, + "step": 3087 + }, + { + "epoch": 7.1234140715109575, + "grad_norm": 0.0, + "learning_rate": 2.1765059952346655e-06, + "loss": 0.1039, + "step": 3088 + }, + { + "epoch": 7.125720876585929, + "grad_norm": 0.0, + "learning_rate": 2.1713322191687234e-06, + "loss": 0.1144, + "step": 3089 + }, + { + "epoch": 7.1280276816609, + "grad_norm": 0.0, + "learning_rate": 2.1661638506956208e-06, + "loss": 0.0737, + "step": 3090 + }, + { + "epoch": 7.130334486735871, + "grad_norm": 0.0, + "learning_rate": 2.161000893385371e-06, + "loss": 0.0432, + "step": 3091 + }, + { + "epoch": 7.132641291810842, + "grad_norm": 0.0, + "learning_rate": 2.155843350804243e-06, + "loss": 0.05, + "step": 3092 + }, + { + "epoch": 7.134948096885813, + "grad_norm": 0.0, + "learning_rate": 2.1506912265147772e-06, + "loss": 0.0861, + "step": 3093 + }, + { + "epoch": 7.137254901960785, + "grad_norm": 0.0, + "learning_rate": 2.1455445240757575e-06, + "loss": 0.065, + "step": 3094 + }, + { + "epoch": 7.139561707035756, + "grad_norm": 0.0, + "learning_rate": 2.140403247042232e-06, + "loss": 0.0602, + "step": 3095 + }, + { + "epoch": 7.141868512110727, + "grad_norm": 0.0, + "learning_rate": 2.1352673989655026e-06, + "loss": 0.0737, + "step": 3096 + }, + { + "epoch": 7.144175317185698, + "grad_norm": 0.0, + "learning_rate": 2.130136983393112e-06, + "loss": 0.0363, + "step": 3097 + }, + { + "epoch": 7.146482122260669, + "grad_norm": 0.0, + "learning_rate": 2.125012003868856e-06, + "loss": 0.0576, + "step": 3098 + }, + { + "epoch": 7.14878892733564, + "grad_norm": 0.0, + "learning_rate": 2.119892463932781e-06, + "loss": 0.0297, + "step": 3099 + }, + { + "epoch": 7.151095732410611, + "grad_norm": 0.0, + "learning_rate": 2.1147783671211643e-06, + "loss": 0.078, + "step": 3100 + }, + { + "epoch": 7.153402537485582, + "grad_norm": 0.0, + "learning_rate": 2.1096697169665312e-06, + "loss": 0.092, + "step": 3101 + }, + { + "epoch": 7.155709342560554, + "grad_norm": 0.0, + "learning_rate": 2.104566516997647e-06, + "loss": 0.0604, + "step": 3102 + }, + { + "epoch": 7.158016147635525, + "grad_norm": 0.0, + "learning_rate": 2.0994687707395012e-06, + "loss": 0.0211, + "step": 3103 + }, + { + "epoch": 7.160322952710496, + "grad_norm": 0.0, + "learning_rate": 2.0943764817133296e-06, + "loss": 0.0622, + "step": 3104 + }, + { + "epoch": 7.162629757785467, + "grad_norm": 0.0, + "learning_rate": 2.08928965343659e-06, + "loss": 0.0595, + "step": 3105 + }, + { + "epoch": 7.164936562860438, + "grad_norm": 0.0, + "learning_rate": 2.084208289422968e-06, + "loss": 0.0621, + "step": 3106 + }, + { + "epoch": 7.167243367935409, + "grad_norm": 0.0, + "learning_rate": 2.0791323931823783e-06, + "loss": 0.0265, + "step": 3107 + }, + { + "epoch": 7.16955017301038, + "grad_norm": 0.0, + "learning_rate": 2.0740619682209607e-06, + "loss": 0.0178, + "step": 3108 + }, + { + "epoch": 7.171856978085351, + "grad_norm": 0.0, + "learning_rate": 2.068997018041069e-06, + "loss": 0.0421, + "step": 3109 + }, + { + "epoch": 7.174163783160323, + "grad_norm": 0.0, + "learning_rate": 2.0639375461412803e-06, + "loss": 0.0269, + "step": 3110 + }, + { + "epoch": 7.176470588235294, + "grad_norm": 0.0, + "learning_rate": 2.05888355601639e-06, + "loss": 0.038, + "step": 3111 + }, + { + "epoch": 7.178777393310265, + "grad_norm": 0.0, + "learning_rate": 2.053835051157397e-06, + "loss": 0.0608, + "step": 3112 + }, + { + "epoch": 7.1810841983852365, + "grad_norm": 0.0, + "learning_rate": 2.048792035051521e-06, + "loss": 0.0335, + "step": 3113 + }, + { + "epoch": 7.1833910034602075, + "grad_norm": 0.0, + "learning_rate": 2.043754511182191e-06, + "loss": 0.057, + "step": 3114 + }, + { + "epoch": 7.185697808535179, + "grad_norm": 0.0, + "learning_rate": 2.0387224830290308e-06, + "loss": 0.0477, + "step": 3115 + }, + { + "epoch": 7.18800461361015, + "grad_norm": 0.0, + "learning_rate": 2.0336959540678813e-06, + "loss": 0.0507, + "step": 3116 + }, + { + "epoch": 7.190311418685121, + "grad_norm": 0.0, + "learning_rate": 2.0286749277707783e-06, + "loss": 0.0411, + "step": 3117 + }, + { + "epoch": 7.192618223760093, + "grad_norm": 0.0, + "learning_rate": 2.0236594076059534e-06, + "loss": 0.0759, + "step": 3118 + }, + { + "epoch": 7.194925028835064, + "grad_norm": 0.0, + "learning_rate": 2.0186493970378416e-06, + "loss": 0.0616, + "step": 3119 + }, + { + "epoch": 7.197231833910035, + "grad_norm": 0.0, + "learning_rate": 2.013644899527074e-06, + "loss": 0.0784, + "step": 3120 + }, + { + "epoch": 7.199538638985006, + "grad_norm": 0.0, + "learning_rate": 2.008645918530462e-06, + "loss": 0.0583, + "step": 3121 + }, + { + "epoch": 7.201845444059977, + "grad_norm": 0.0, + "learning_rate": 2.0036524575010176e-06, + "loss": 0.0802, + "step": 3122 + }, + { + "epoch": 7.204152249134948, + "grad_norm": 0.0, + "learning_rate": 1.9986645198879385e-06, + "loss": 0.0623, + "step": 3123 + }, + { + "epoch": 7.206459054209919, + "grad_norm": 0.0, + "learning_rate": 1.9936821091366e-06, + "loss": 0.0559, + "step": 3124 + }, + { + "epoch": 7.20876585928489, + "grad_norm": 0.0, + "learning_rate": 1.9887052286885654e-06, + "loss": 0.0602, + "step": 3125 + }, + { + "epoch": 7.211072664359862, + "grad_norm": 0.0, + "learning_rate": 1.983733881981581e-06, + "loss": 0.0458, + "step": 3126 + }, + { + "epoch": 7.213379469434833, + "grad_norm": 0.0, + "learning_rate": 1.9787680724495617e-06, + "loss": 0.0493, + "step": 3127 + }, + { + "epoch": 7.215686274509804, + "grad_norm": 0.0, + "learning_rate": 1.9738078035226084e-06, + "loss": 0.0159, + "step": 3128 + }, + { + "epoch": 7.217993079584775, + "grad_norm": 0.0, + "learning_rate": 1.9688530786269854e-06, + "loss": 0.0317, + "step": 3129 + }, + { + "epoch": 7.220299884659746, + "grad_norm": 0.0, + "learning_rate": 1.9639039011851292e-06, + "loss": 0.043, + "step": 3130 + }, + { + "epoch": 7.222606689734717, + "grad_norm": 0.0, + "learning_rate": 1.9589602746156476e-06, + "loss": 0.0607, + "step": 3131 + }, + { + "epoch": 7.224913494809688, + "grad_norm": 0.0, + "learning_rate": 1.9540222023333165e-06, + "loss": 0.0626, + "step": 3132 + }, + { + "epoch": 7.22722029988466, + "grad_norm": 0.0, + "learning_rate": 1.9490896877490715e-06, + "loss": 0.0508, + "step": 3133 + }, + { + "epoch": 7.229527104959631, + "grad_norm": 0.0, + "learning_rate": 1.9441627342700067e-06, + "loss": 0.0485, + "step": 3134 + }, + { + "epoch": 7.231833910034602, + "grad_norm": 0.0, + "learning_rate": 1.9392413452993787e-06, + "loss": 0.0553, + "step": 3135 + }, + { + "epoch": 7.234140715109573, + "grad_norm": 0.0, + "learning_rate": 1.9343255242366022e-06, + "loss": 0.0708, + "step": 3136 + }, + { + "epoch": 7.236447520184544, + "grad_norm": 0.0, + "learning_rate": 1.929415274477239e-06, + "loss": 0.025, + "step": 3137 + }, + { + "epoch": 7.2387543252595155, + "grad_norm": 0.0, + "learning_rate": 1.9245105994130086e-06, + "loss": 0.0508, + "step": 3138 + }, + { + "epoch": 7.2410611303344865, + "grad_norm": 0.0, + "learning_rate": 1.919611502431782e-06, + "loss": 0.0773, + "step": 3139 + }, + { + "epoch": 7.243367935409458, + "grad_norm": 0.0, + "learning_rate": 1.914717986917569e-06, + "loss": 0.0649, + "step": 3140 + }, + { + "epoch": 7.245674740484429, + "grad_norm": 0.0, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0678, + "step": 3141 + }, + { + "epoch": 7.2479815455594006, + "grad_norm": 0.0, + "learning_rate": 1.9049477138069606e-06, + "loss": 0.0514, + "step": 3142 + }, + { + "epoch": 7.250288350634372, + "grad_norm": 0.0, + "learning_rate": 1.9000709629593073e-06, + "loss": 0.0478, + "step": 3143 + }, + { + "epoch": 7.252595155709343, + "grad_norm": 0.0, + "learning_rate": 1.895199807076148e-06, + "loss": 0.0389, + "step": 3144 + }, + { + "epoch": 7.254901960784314, + "grad_norm": 0.0, + "learning_rate": 1.8903342495221977e-06, + "loss": 0.0613, + "step": 3145 + }, + { + "epoch": 7.257208765859285, + "grad_norm": 0.0, + "learning_rate": 1.8854742936583005e-06, + "loss": 0.0589, + "step": 3146 + }, + { + "epoch": 7.259515570934256, + "grad_norm": 0.0, + "learning_rate": 1.880619942841435e-06, + "loss": 0.0916, + "step": 3147 + }, + { + "epoch": 7.261822376009227, + "grad_norm": 0.0, + "learning_rate": 1.8757712004247098e-06, + "loss": 0.0377, + "step": 3148 + }, + { + "epoch": 7.264129181084199, + "grad_norm": 0.0, + "learning_rate": 1.870928069757353e-06, + "loss": 0.0778, + "step": 3149 + }, + { + "epoch": 7.26643598615917, + "grad_norm": 0.0, + "learning_rate": 1.8660905541847208e-06, + "loss": 0.0944, + "step": 3150 + }, + { + "epoch": 7.268742791234141, + "grad_norm": 0.0, + "learning_rate": 1.861258657048295e-06, + "loss": 0.0726, + "step": 3151 + }, + { + "epoch": 7.271049596309112, + "grad_norm": 0.0, + "learning_rate": 1.856432381685669e-06, + "loss": 0.0562, + "step": 3152 + }, + { + "epoch": 7.273356401384083, + "grad_norm": 0.0, + "learning_rate": 1.8516117314305526e-06, + "loss": 0.0486, + "step": 3153 + }, + { + "epoch": 7.275663206459054, + "grad_norm": 0.0, + "learning_rate": 1.8467967096127782e-06, + "loss": 0.0616, + "step": 3154 + }, + { + "epoch": 7.277970011534025, + "grad_norm": 0.0, + "learning_rate": 1.8419873195582815e-06, + "loss": 0.0329, + "step": 3155 + }, + { + "epoch": 7.280276816608996, + "grad_norm": 0.0, + "learning_rate": 1.8371835645891134e-06, + "loss": 0.0711, + "step": 3156 + }, + { + "epoch": 7.282583621683968, + "grad_norm": 0.0, + "learning_rate": 1.8323854480234348e-06, + "loss": 0.0472, + "step": 3157 + }, + { + "epoch": 7.284890426758939, + "grad_norm": 0.0, + "learning_rate": 1.8275929731755039e-06, + "loss": 0.1153, + "step": 3158 + }, + { + "epoch": 7.28719723183391, + "grad_norm": 0.0, + "learning_rate": 1.8228061433556866e-06, + "loss": 0.0639, + "step": 3159 + }, + { + "epoch": 7.289504036908881, + "grad_norm": 0.0, + "learning_rate": 1.8180249618704536e-06, + "loss": 0.0973, + "step": 3160 + }, + { + "epoch": 7.291810841983852, + "grad_norm": 0.0, + "learning_rate": 1.8132494320223636e-06, + "loss": 0.0738, + "step": 3161 + }, + { + "epoch": 7.294117647058823, + "grad_norm": 0.0, + "learning_rate": 1.808479557110081e-06, + "loss": 0.0338, + "step": 3162 + }, + { + "epoch": 7.296424452133794, + "grad_norm": 0.0, + "learning_rate": 1.8037153404283636e-06, + "loss": 0.0735, + "step": 3163 + }, + { + "epoch": 7.2987312572087655, + "grad_norm": 0.0, + "learning_rate": 1.798956785268051e-06, + "loss": 0.0903, + "step": 3164 + }, + { + "epoch": 7.301038062283737, + "grad_norm": 0.0, + "learning_rate": 1.7942038949160857e-06, + "loss": 0.0522, + "step": 3165 + }, + { + "epoch": 7.3033448673587085, + "grad_norm": 0.0, + "learning_rate": 1.7894566726554874e-06, + "loss": 0.0649, + "step": 3166 + }, + { + "epoch": 7.3056516724336795, + "grad_norm": 0.0, + "learning_rate": 1.7847151217653624e-06, + "loss": 0.0606, + "step": 3167 + }, + { + "epoch": 7.307958477508651, + "grad_norm": 0.0, + "learning_rate": 1.7799792455209019e-06, + "loss": 0.0346, + "step": 3168 + }, + { + "epoch": 7.310265282583622, + "grad_norm": 0.0, + "learning_rate": 1.7752490471933769e-06, + "loss": 0.0786, + "step": 3169 + }, + { + "epoch": 7.312572087658593, + "grad_norm": 0.0, + "learning_rate": 1.7705245300501396e-06, + "loss": 0.0558, + "step": 3170 + }, + { + "epoch": 7.314878892733564, + "grad_norm": 0.0, + "learning_rate": 1.765805697354608e-06, + "loss": 0.0525, + "step": 3171 + }, + { + "epoch": 7.317185697808535, + "grad_norm": 0.0, + "learning_rate": 1.7610925523662836e-06, + "loss": 0.1067, + "step": 3172 + }, + { + "epoch": 7.319492502883507, + "grad_norm": 0.0, + "learning_rate": 1.756385098340736e-06, + "loss": 0.0629, + "step": 3173 + }, + { + "epoch": 7.321799307958478, + "grad_norm": 0.0, + "learning_rate": 1.7516833385296016e-06, + "loss": 0.1039, + "step": 3174 + }, + { + "epoch": 7.324106113033449, + "grad_norm": 0.0, + "learning_rate": 1.7469872761805872e-06, + "loss": 0.0501, + "step": 3175 + }, + { + "epoch": 7.32641291810842, + "grad_norm": 0.0, + "learning_rate": 1.742296914537459e-06, + "loss": 0.0603, + "step": 3176 + }, + { + "epoch": 7.328719723183391, + "grad_norm": 0.0, + "learning_rate": 1.7376122568400533e-06, + "loss": 0.0604, + "step": 3177 + }, + { + "epoch": 7.331026528258362, + "grad_norm": 0.0, + "learning_rate": 1.732933306324256e-06, + "loss": 0.0519, + "step": 3178 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.0, + "learning_rate": 1.7282600662220228e-06, + "loss": 0.0477, + "step": 3179 + }, + { + "epoch": 7.335640138408304, + "grad_norm": 0.0, + "learning_rate": 1.7235925397613529e-06, + "loss": 0.0714, + "step": 3180 + }, + { + "epoch": 7.337946943483276, + "grad_norm": 0.0, + "learning_rate": 1.7189307301663082e-06, + "loss": 0.0665, + "step": 3181 + }, + { + "epoch": 7.340253748558247, + "grad_norm": 0.0, + "learning_rate": 1.714274640657001e-06, + "loss": 0.0541, + "step": 3182 + }, + { + "epoch": 7.342560553633218, + "grad_norm": 0.0, + "learning_rate": 1.709624274449584e-06, + "loss": 0.0394, + "step": 3183 + }, + { + "epoch": 7.344867358708189, + "grad_norm": 0.0, + "learning_rate": 1.704979634756264e-06, + "loss": 0.0596, + "step": 3184 + }, + { + "epoch": 7.34717416378316, + "grad_norm": 0.0, + "learning_rate": 1.7003407247852944e-06, + "loss": 0.0889, + "step": 3185 + }, + { + "epoch": 7.349480968858131, + "grad_norm": 0.0, + "learning_rate": 1.6957075477409623e-06, + "loss": 0.0438, + "step": 3186 + }, + { + "epoch": 7.351787773933102, + "grad_norm": 0.0, + "learning_rate": 1.6910801068236015e-06, + "loss": 0.0289, + "step": 3187 + }, + { + "epoch": 7.354094579008073, + "grad_norm": 0.0, + "learning_rate": 1.6864584052295841e-06, + "loss": 0.0605, + "step": 3188 + }, + { + "epoch": 7.356401384083045, + "grad_norm": 0.0, + "learning_rate": 1.6818424461513129e-06, + "loss": 0.0453, + "step": 3189 + }, + { + "epoch": 7.358708189158016, + "grad_norm": 0.0, + "learning_rate": 1.677232232777224e-06, + "loss": 0.0547, + "step": 3190 + }, + { + "epoch": 7.361014994232987, + "grad_norm": 0.0, + "learning_rate": 1.6726277682917925e-06, + "loss": 0.0683, + "step": 3191 + }, + { + "epoch": 7.3633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.6680290558755119e-06, + "loss": 0.0697, + "step": 3192 + }, + { + "epoch": 7.3656286043829295, + "grad_norm": 0.0, + "learning_rate": 1.6634360987049113e-06, + "loss": 0.0563, + "step": 3193 + }, + { + "epoch": 7.367935409457901, + "grad_norm": 0.0, + "learning_rate": 1.6588488999525431e-06, + "loss": 0.0469, + "step": 3194 + }, + { + "epoch": 7.370242214532872, + "grad_norm": 0.0, + "learning_rate": 1.6542674627869738e-06, + "loss": 0.067, + "step": 3195 + }, + { + "epoch": 7.372549019607844, + "grad_norm": 0.0, + "learning_rate": 1.6496917903728016e-06, + "loss": 0.0439, + "step": 3196 + }, + { + "epoch": 7.374855824682815, + "grad_norm": 0.0, + "learning_rate": 1.6451218858706374e-06, + "loss": 0.0626, + "step": 3197 + }, + { + "epoch": 7.377162629757786, + "grad_norm": 0.0, + "learning_rate": 1.640557752437103e-06, + "loss": 0.0657, + "step": 3198 + }, + { + "epoch": 7.379469434832757, + "grad_norm": 0.0, + "learning_rate": 1.6359993932248442e-06, + "loss": 0.058, + "step": 3199 + }, + { + "epoch": 7.381776239907728, + "grad_norm": 0.0, + "learning_rate": 1.631446811382512e-06, + "loss": 0.0659, + "step": 3200 + }, + { + "epoch": 7.384083044982699, + "grad_norm": 0.0, + "learning_rate": 1.6269000100547682e-06, + "loss": 0.0648, + "step": 3201 + }, + { + "epoch": 7.38638985005767, + "grad_norm": 0.0, + "learning_rate": 1.6223589923822768e-06, + "loss": 0.0468, + "step": 3202 + }, + { + "epoch": 7.388696655132641, + "grad_norm": 0.0, + "learning_rate": 1.6178237615017178e-06, + "loss": 0.0381, + "step": 3203 + }, + { + "epoch": 7.391003460207612, + "grad_norm": 0.0, + "learning_rate": 1.6132943205457607e-06, + "loss": 0.0495, + "step": 3204 + }, + { + "epoch": 7.393310265282584, + "grad_norm": 0.0, + "learning_rate": 1.6087706726430874e-06, + "loss": 0.0557, + "step": 3205 + }, + { + "epoch": 7.395617070357555, + "grad_norm": 0.0, + "learning_rate": 1.6042528209183728e-06, + "loss": 0.0451, + "step": 3206 + }, + { + "epoch": 7.397923875432526, + "grad_norm": 0.0, + "learning_rate": 1.599740768492286e-06, + "loss": 0.0337, + "step": 3207 + }, + { + "epoch": 7.400230680507497, + "grad_norm": 0.0, + "learning_rate": 1.5952345184814955e-06, + "loss": 0.06, + "step": 3208 + }, + { + "epoch": 7.402537485582468, + "grad_norm": 0.0, + "learning_rate": 1.5907340739986577e-06, + "loss": 0.0425, + "step": 3209 + }, + { + "epoch": 7.404844290657439, + "grad_norm": 0.0, + "learning_rate": 1.5862394381524239e-06, + "loss": 0.0343, + "step": 3210 + }, + { + "epoch": 7.40715109573241, + "grad_norm": 0.0, + "learning_rate": 1.5817506140474248e-06, + "loss": 0.0714, + "step": 3211 + }, + { + "epoch": 7.409457900807382, + "grad_norm": 0.0, + "learning_rate": 1.5772676047842862e-06, + "loss": 0.0728, + "step": 3212 + }, + { + "epoch": 7.411764705882353, + "grad_norm": 0.0, + "learning_rate": 1.5727904134596084e-06, + "loss": 0.0851, + "step": 3213 + }, + { + "epoch": 7.414071510957324, + "grad_norm": 0.0, + "learning_rate": 1.5683190431659812e-06, + "loss": 0.0352, + "step": 3214 + }, + { + "epoch": 7.416378316032295, + "grad_norm": 0.0, + "learning_rate": 1.563853496991966e-06, + "loss": 0.0529, + "step": 3215 + }, + { + "epoch": 7.418685121107266, + "grad_norm": 0.0, + "learning_rate": 1.5593937780221092e-06, + "loss": 0.0551, + "step": 3216 + }, + { + "epoch": 7.4209919261822375, + "grad_norm": 0.0, + "learning_rate": 1.5549398893369216e-06, + "loss": 0.0309, + "step": 3217 + }, + { + "epoch": 7.4232987312572085, + "grad_norm": 0.0, + "learning_rate": 1.5504918340128982e-06, + "loss": 0.0535, + "step": 3218 + }, + { + "epoch": 7.42560553633218, + "grad_norm": 0.0, + "learning_rate": 1.5460496151225002e-06, + "loss": 0.0607, + "step": 3219 + }, + { + "epoch": 7.4279123414071515, + "grad_norm": 0.0, + "learning_rate": 1.5416132357341519e-06, + "loss": 0.0805, + "step": 3220 + }, + { + "epoch": 7.430219146482123, + "grad_norm": 0.0, + "learning_rate": 1.5371826989122507e-06, + "loss": 0.06, + "step": 3221 + }, + { + "epoch": 7.432525951557094, + "grad_norm": 0.0, + "learning_rate": 1.5327580077171589e-06, + "loss": 0.0717, + "step": 3222 + }, + { + "epoch": 7.434832756632065, + "grad_norm": 0.0, + "learning_rate": 1.528339165205195e-06, + "loss": 0.0406, + "step": 3223 + }, + { + "epoch": 7.437139561707036, + "grad_norm": 0.0, + "learning_rate": 1.5239261744286427e-06, + "loss": 0.0716, + "step": 3224 + }, + { + "epoch": 7.439446366782007, + "grad_norm": 0.0, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.0502, + "step": 3225 + }, + { + "epoch": 7.441753171856978, + "grad_norm": 0.0, + "learning_rate": 1.5151177602706867e-06, + "loss": 0.0856, + "step": 3226 + }, + { + "epoch": 7.444059976931949, + "grad_norm": 0.0, + "learning_rate": 1.5107223429736273e-06, + "loss": 0.061, + "step": 3227 + }, + { + "epoch": 7.446366782006921, + "grad_norm": 0.0, + "learning_rate": 1.5063327895806668e-06, + "loss": 0.0639, + "step": 3228 + }, + { + "epoch": 7.448673587081892, + "grad_norm": 0.0, + "learning_rate": 1.501949103123852e-06, + "loss": 0.0854, + "step": 3229 + }, + { + "epoch": 7.450980392156863, + "grad_norm": 0.0, + "learning_rate": 1.4975712866311832e-06, + "loss": 0.0461, + "step": 3230 + }, + { + "epoch": 7.453287197231834, + "grad_norm": 0.0, + "learning_rate": 1.4931993431266056e-06, + "loss": 0.0487, + "step": 3231 + }, + { + "epoch": 7.455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.4888332756300027e-06, + "loss": 0.0459, + "step": 3232 + }, + { + "epoch": 7.457900807381776, + "grad_norm": 0.0, + "learning_rate": 1.4844730871572045e-06, + "loss": 0.0656, + "step": 3233 + }, + { + "epoch": 7.460207612456747, + "grad_norm": 0.0, + "learning_rate": 1.48011878071998e-06, + "loss": 0.0522, + "step": 3234 + }, + { + "epoch": 7.462514417531718, + "grad_norm": 0.0, + "learning_rate": 1.4757703593260286e-06, + "loss": 0.1048, + "step": 3235 + }, + { + "epoch": 7.46482122260669, + "grad_norm": 0.0, + "learning_rate": 1.4714278259789916e-06, + "loss": 0.0626, + "step": 3236 + }, + { + "epoch": 7.467128027681661, + "grad_norm": 0.0, + "learning_rate": 1.467091183678444e-06, + "loss": 0.0416, + "step": 3237 + }, + { + "epoch": 7.469434832756632, + "grad_norm": 0.0, + "learning_rate": 1.4627604354198854e-06, + "loss": 0.0274, + "step": 3238 + }, + { + "epoch": 7.471741637831603, + "grad_norm": 0.0, + "learning_rate": 1.4584355841947452e-06, + "loss": 0.0511, + "step": 3239 + }, + { + "epoch": 7.474048442906574, + "grad_norm": 0.0, + "learning_rate": 1.4541166329903856e-06, + "loss": 0.0541, + "step": 3240 + }, + { + "epoch": 7.476355247981545, + "grad_norm": 0.0, + "learning_rate": 1.449803584790086e-06, + "loss": 0.0551, + "step": 3241 + }, + { + "epoch": 7.478662053056516, + "grad_norm": 0.0, + "learning_rate": 1.4454964425730533e-06, + "loss": 0.0493, + "step": 3242 + }, + { + "epoch": 7.4809688581314875, + "grad_norm": 0.0, + "learning_rate": 1.4411952093144167e-06, + "loss": 0.0551, + "step": 3243 + }, + { + "epoch": 7.483275663206459, + "grad_norm": 0.0, + "learning_rate": 1.4368998879852135e-06, + "loss": 0.0272, + "step": 3244 + }, + { + "epoch": 7.4855824682814305, + "grad_norm": 0.0, + "learning_rate": 1.432610481552409e-06, + "loss": 0.0434, + "step": 3245 + }, + { + "epoch": 7.4878892733564015, + "grad_norm": 0.0, + "learning_rate": 1.4283269929788779e-06, + "loss": 0.0705, + "step": 3246 + }, + { + "epoch": 7.490196078431373, + "grad_norm": 0.0, + "learning_rate": 1.424049425223405e-06, + "loss": 0.0922, + "step": 3247 + }, + { + "epoch": 7.492502883506344, + "grad_norm": 0.0, + "learning_rate": 1.4197777812406898e-06, + "loss": 0.0512, + "step": 3248 + }, + { + "epoch": 7.494809688581315, + "grad_norm": 0.0, + "learning_rate": 1.4155120639813392e-06, + "loss": 0.0299, + "step": 3249 + }, + { + "epoch": 7.497116493656286, + "grad_norm": 0.0, + "learning_rate": 1.4112522763918635e-06, + "loss": 0.0392, + "step": 3250 + }, + { + "epoch": 7.499423298731257, + "grad_norm": 0.0, + "learning_rate": 1.406998421414676e-06, + "loss": 0.0848, + "step": 3251 + }, + { + "epoch": 7.501730103806229, + "grad_norm": 0.0, + "learning_rate": 1.4027505019880972e-06, + "loss": 0.0697, + "step": 3252 + }, + { + "epoch": 7.5040369088812, + "grad_norm": 0.0, + "learning_rate": 1.3985085210463479e-06, + "loss": 0.0529, + "step": 3253 + }, + { + "epoch": 7.506343713956171, + "grad_norm": 0.0, + "learning_rate": 1.3942724815195386e-06, + "loss": 0.0468, + "step": 3254 + }, + { + "epoch": 7.508650519031142, + "grad_norm": 0.0, + "learning_rate": 1.3900423863336842e-06, + "loss": 0.0433, + "step": 3255 + }, + { + "epoch": 7.510957324106113, + "grad_norm": 0.0, + "learning_rate": 1.3858182384106943e-06, + "loss": 0.0609, + "step": 3256 + }, + { + "epoch": 7.513264129181084, + "grad_norm": 0.0, + "learning_rate": 1.3816000406683604e-06, + "loss": 0.0691, + "step": 3257 + }, + { + "epoch": 7.515570934256055, + "grad_norm": 0.0, + "learning_rate": 1.377387796020374e-06, + "loss": 0.0436, + "step": 3258 + }, + { + "epoch": 7.517877739331027, + "grad_norm": 0.0, + "learning_rate": 1.3731815073763132e-06, + "loss": 0.1054, + "step": 3259 + }, + { + "epoch": 7.520184544405998, + "grad_norm": 0.0, + "learning_rate": 1.368981177641636e-06, + "loss": 0.0695, + "step": 3260 + }, + { + "epoch": 7.522491349480969, + "grad_norm": 0.0, + "learning_rate": 1.364786809717692e-06, + "loss": 0.0531, + "step": 3261 + }, + { + "epoch": 7.52479815455594, + "grad_norm": 0.0, + "learning_rate": 1.3605984065017074e-06, + "loss": 0.0568, + "step": 3262 + }, + { + "epoch": 7.527104959630911, + "grad_norm": 0.0, + "learning_rate": 1.3564159708867863e-06, + "loss": 0.0382, + "step": 3263 + }, + { + "epoch": 7.529411764705882, + "grad_norm": 0.0, + "learning_rate": 1.3522395057619186e-06, + "loss": 0.106, + "step": 3264 + }, + { + "epoch": 7.531718569780853, + "grad_norm": 0.0, + "learning_rate": 1.3480690140119657e-06, + "loss": 0.0491, + "step": 3265 + }, + { + "epoch": 7.534025374855824, + "grad_norm": 0.0, + "learning_rate": 1.3439044985176608e-06, + "loss": 0.0429, + "step": 3266 + }, + { + "epoch": 7.536332179930795, + "grad_norm": 0.0, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0597, + "step": 3267 + }, + { + "epoch": 7.538638985005767, + "grad_norm": 0.0, + "learning_rate": 1.3355934077983024e-06, + "loss": 0.038, + "step": 3268 + }, + { + "epoch": 7.540945790080738, + "grad_norm": 0.0, + "learning_rate": 1.3314468383140687e-06, + "loss": 0.0254, + "step": 3269 + }, + { + "epoch": 7.5432525951557095, + "grad_norm": 0.0, + "learning_rate": 1.3273062565671258e-06, + "loss": 0.075, + "step": 3270 + }, + { + "epoch": 7.5455594002306805, + "grad_norm": 0.0, + "learning_rate": 1.323171665417552e-06, + "loss": 0.0498, + "step": 3271 + }, + { + "epoch": 7.5478662053056516, + "grad_norm": 0.0, + "learning_rate": 1.3190430677212795e-06, + "loss": 0.0499, + "step": 3272 + }, + { + "epoch": 7.550173010380623, + "grad_norm": 0.0, + "learning_rate": 1.3149204663301118e-06, + "loss": 0.057, + "step": 3273 + }, + { + "epoch": 7.552479815455594, + "grad_norm": 0.0, + "learning_rate": 1.3108038640916988e-06, + "loss": 0.1198, + "step": 3274 + }, + { + "epoch": 7.554786620530566, + "grad_norm": 0.0, + "learning_rate": 1.3066932638495566e-06, + "loss": 0.0821, + "step": 3275 + }, + { + "epoch": 7.557093425605537, + "grad_norm": 0.0, + "learning_rate": 1.3025886684430467e-06, + "loss": 0.1062, + "step": 3276 + }, + { + "epoch": 7.559400230680508, + "grad_norm": 0.0, + "learning_rate": 1.2984900807073919e-06, + "loss": 0.1036, + "step": 3277 + }, + { + "epoch": 7.561707035755479, + "grad_norm": 0.0, + "learning_rate": 1.2943975034736566e-06, + "loss": 0.0424, + "step": 3278 + }, + { + "epoch": 7.56401384083045, + "grad_norm": 0.0, + "learning_rate": 1.2903109395687597e-06, + "loss": 0.0774, + "step": 3279 + }, + { + "epoch": 7.566320645905421, + "grad_norm": 0.0, + "learning_rate": 1.286230391815465e-06, + "loss": 0.0709, + "step": 3280 + }, + { + "epoch": 7.568627450980392, + "grad_norm": 0.0, + "learning_rate": 1.282155863032377e-06, + "loss": 0.0703, + "step": 3281 + }, + { + "epoch": 7.570934256055363, + "grad_norm": 0.0, + "learning_rate": 1.278087356033947e-06, + "loss": 0.0727, + "step": 3282 + }, + { + "epoch": 7.573241061130334, + "grad_norm": 0.0, + "learning_rate": 1.2740248736304673e-06, + "loss": 0.0583, + "step": 3283 + }, + { + "epoch": 7.575547866205306, + "grad_norm": 0.0, + "learning_rate": 1.2699684186280636e-06, + "loss": 0.0757, + "step": 3284 + }, + { + "epoch": 7.577854671280277, + "grad_norm": 0.0, + "learning_rate": 1.2659179938287035e-06, + "loss": 0.0598, + "step": 3285 + }, + { + "epoch": 7.580161476355248, + "grad_norm": 0.0, + "learning_rate": 1.2618736020301858e-06, + "loss": 0.0559, + "step": 3286 + }, + { + "epoch": 7.582468281430219, + "grad_norm": 0.0, + "learning_rate": 1.2578352460261456e-06, + "loss": 0.0466, + "step": 3287 + }, + { + "epoch": 7.58477508650519, + "grad_norm": 0.0, + "learning_rate": 1.2538029286060428e-06, + "loss": 0.0302, + "step": 3288 + }, + { + "epoch": 7.587081891580161, + "grad_norm": 0.0, + "learning_rate": 1.2497766525551724e-06, + "loss": 0.0643, + "step": 3289 + }, + { + "epoch": 7.589388696655132, + "grad_norm": 0.0, + "learning_rate": 1.2457564206546568e-06, + "loss": 0.0832, + "step": 3290 + }, + { + "epoch": 7.591695501730104, + "grad_norm": 0.0, + "learning_rate": 1.2417422356814345e-06, + "loss": 0.0414, + "step": 3291 + }, + { + "epoch": 7.594002306805075, + "grad_norm": 0.0, + "learning_rate": 1.2377341004082778e-06, + "loss": 0.0648, + "step": 3292 + }, + { + "epoch": 7.596309111880046, + "grad_norm": 0.0, + "learning_rate": 1.233732017603776e-06, + "loss": 0.0631, + "step": 3293 + }, + { + "epoch": 7.598615916955017, + "grad_norm": 0.0, + "learning_rate": 1.2297359900323346e-06, + "loss": 0.046, + "step": 3294 + }, + { + "epoch": 7.600922722029988, + "grad_norm": 0.0, + "learning_rate": 1.2257460204541793e-06, + "loss": 0.0659, + "step": 3295 + }, + { + "epoch": 7.6032295271049595, + "grad_norm": 0.0, + "learning_rate": 1.2217621116253564e-06, + "loss": 0.0789, + "step": 3296 + }, + { + "epoch": 7.6055363321799305, + "grad_norm": 0.0, + "learning_rate": 1.2177842662977136e-06, + "loss": 0.108, + "step": 3297 + }, + { + "epoch": 7.607843137254902, + "grad_norm": 0.0, + "learning_rate": 1.213812487218924e-06, + "loss": 0.0773, + "step": 3298 + }, + { + "epoch": 7.610149942329873, + "grad_norm": 0.0, + "learning_rate": 1.2098467771324597e-06, + "loss": 0.0648, + "step": 3299 + }, + { + "epoch": 7.612456747404845, + "grad_norm": 0.0, + "learning_rate": 1.2058871387776039e-06, + "loss": 0.0613, + "step": 3300 + }, + { + "epoch": 7.614763552479816, + "grad_norm": 0.0, + "learning_rate": 1.2019335748894489e-06, + "loss": 0.0753, + "step": 3301 + }, + { + "epoch": 7.617070357554787, + "grad_norm": 0.0, + "learning_rate": 1.1979860881988903e-06, + "loss": 0.1092, + "step": 3302 + }, + { + "epoch": 7.619377162629758, + "grad_norm": 0.0, + "learning_rate": 1.19404468143262e-06, + "loss": 0.056, + "step": 3303 + }, + { + "epoch": 7.621683967704729, + "grad_norm": 0.0, + "learning_rate": 1.1901093573131394e-06, + "loss": 0.061, + "step": 3304 + }, + { + "epoch": 7.6239907727797, + "grad_norm": 0.0, + "learning_rate": 1.186180118558743e-06, + "loss": 0.1186, + "step": 3305 + }, + { + "epoch": 7.626297577854672, + "grad_norm": 0.0, + "learning_rate": 1.1822569678835195e-06, + "loss": 0.0442, + "step": 3306 + }, + { + "epoch": 7.628604382929643, + "grad_norm": 0.0, + "learning_rate": 1.1783399079973578e-06, + "loss": 0.0402, + "step": 3307 + }, + { + "epoch": 7.630911188004614, + "grad_norm": 0.0, + "learning_rate": 1.1744289416059396e-06, + "loss": 0.0333, + "step": 3308 + }, + { + "epoch": 7.633217993079585, + "grad_norm": 0.0, + "learning_rate": 1.1705240714107301e-06, + "loss": 0.0816, + "step": 3309 + }, + { + "epoch": 7.635524798154556, + "grad_norm": 0.0, + "learning_rate": 1.1666253001089933e-06, + "loss": 0.0477, + "step": 3310 + }, + { + "epoch": 7.637831603229527, + "grad_norm": 0.0, + "learning_rate": 1.1627326303937747e-06, + "loss": 0.0679, + "step": 3311 + }, + { + "epoch": 7.640138408304498, + "grad_norm": 0.0, + "learning_rate": 1.1588460649539036e-06, + "loss": 0.0394, + "step": 3312 + }, + { + "epoch": 7.642445213379469, + "grad_norm": 0.0, + "learning_rate": 1.1549656064739966e-06, + "loss": 0.0435, + "step": 3313 + }, + { + "epoch": 7.64475201845444, + "grad_norm": 0.0, + "learning_rate": 1.1510912576344546e-06, + "loss": 0.0425, + "step": 3314 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 0.0, + "learning_rate": 1.1472230211114498e-06, + "loss": 0.0747, + "step": 3315 + }, + { + "epoch": 7.649365628604383, + "grad_norm": 0.0, + "learning_rate": 1.1433608995769396e-06, + "loss": 0.0654, + "step": 3316 + }, + { + "epoch": 7.651672433679354, + "grad_norm": 0.0, + "learning_rate": 1.1395048956986577e-06, + "loss": 0.0555, + "step": 3317 + }, + { + "epoch": 7.653979238754325, + "grad_norm": 0.0, + "learning_rate": 1.1356550121401033e-06, + "loss": 0.0873, + "step": 3318 + }, + { + "epoch": 7.656286043829296, + "grad_norm": 0.0, + "learning_rate": 1.1318112515605583e-06, + "loss": 0.0944, + "step": 3319 + }, + { + "epoch": 7.658592848904267, + "grad_norm": 0.0, + "learning_rate": 1.1279736166150724e-06, + "loss": 0.0675, + "step": 3320 + }, + { + "epoch": 7.660899653979238, + "grad_norm": 0.0, + "learning_rate": 1.124142109954459e-06, + "loss": 0.026, + "step": 3321 + }, + { + "epoch": 7.66320645905421, + "grad_norm": 0.0, + "learning_rate": 1.1203167342253063e-06, + "loss": 0.0339, + "step": 3322 + }, + { + "epoch": 7.665513264129181, + "grad_norm": 0.0, + "learning_rate": 1.1164974920699611e-06, + "loss": 0.0733, + "step": 3323 + }, + { + "epoch": 7.6678200692041525, + "grad_norm": 0.0, + "learning_rate": 1.1126843861265347e-06, + "loss": 0.0878, + "step": 3324 + }, + { + "epoch": 7.6701268742791235, + "grad_norm": 0.0, + "learning_rate": 1.108877419028902e-06, + "loss": 0.0504, + "step": 3325 + }, + { + "epoch": 7.672433679354095, + "grad_norm": 0.0, + "learning_rate": 1.1050765934066998e-06, + "loss": 0.0613, + "step": 3326 + }, + { + "epoch": 7.674740484429066, + "grad_norm": 0.0, + "learning_rate": 1.1012819118853147e-06, + "loss": 0.0606, + "step": 3327 + }, + { + "epoch": 7.677047289504037, + "grad_norm": 0.0, + "learning_rate": 1.0974933770858964e-06, + "loss": 0.0631, + "step": 3328 + }, + { + "epoch": 7.679354094579008, + "grad_norm": 0.0, + "learning_rate": 1.0937109916253474e-06, + "loss": 0.0283, + "step": 3329 + }, + { + "epoch": 7.681660899653979, + "grad_norm": 0.0, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.0332, + "step": 3330 + }, + { + "epoch": 7.683967704728951, + "grad_norm": 0.0, + "learning_rate": 1.086164679167222e-06, + "loss": 0.0379, + "step": 3331 + }, + { + "epoch": 7.686274509803922, + "grad_norm": 0.0, + "learning_rate": 1.0824007573822025e-06, + "loss": 0.0849, + "step": 3332 + }, + { + "epoch": 7.688581314878893, + "grad_norm": 0.0, + "learning_rate": 1.0786429953611665e-06, + "loss": 0.0531, + "step": 3333 + }, + { + "epoch": 7.690888119953864, + "grad_norm": 0.0, + "learning_rate": 1.0748913956997565e-06, + "loss": 0.0651, + "step": 3334 + }, + { + "epoch": 7.693194925028835, + "grad_norm": 0.0, + "learning_rate": 1.0711459609893604e-06, + "loss": 0.0748, + "step": 3335 + }, + { + "epoch": 7.695501730103806, + "grad_norm": 0.0, + "learning_rate": 1.0674066938171123e-06, + "loss": 0.0979, + "step": 3336 + }, + { + "epoch": 7.697808535178777, + "grad_norm": 0.0, + "learning_rate": 1.0636735967658785e-06, + "loss": 0.0557, + "step": 3337 + }, + { + "epoch": 7.700115340253749, + "grad_norm": 0.0, + "learning_rate": 1.0599466724142693e-06, + "loss": 0.1222, + "step": 3338 + }, + { + "epoch": 7.70242214532872, + "grad_norm": 0.0, + "learning_rate": 1.0562259233366334e-06, + "loss": 0.0241, + "step": 3339 + }, + { + "epoch": 7.704728950403691, + "grad_norm": 0.0, + "learning_rate": 1.0525113521030428e-06, + "loss": 0.056, + "step": 3340 + }, + { + "epoch": 7.707035755478662, + "grad_norm": 0.0, + "learning_rate": 1.0488029612793138e-06, + "loss": 0.0525, + "step": 3341 + }, + { + "epoch": 7.709342560553633, + "grad_norm": 0.0, + "learning_rate": 1.0451007534269908e-06, + "loss": 0.0392, + "step": 3342 + }, + { + "epoch": 7.711649365628604, + "grad_norm": 0.0, + "learning_rate": 1.0414047311033404e-06, + "loss": 0.0764, + "step": 3343 + }, + { + "epoch": 7.713956170703575, + "grad_norm": 0.0, + "learning_rate": 1.0377148968613659e-06, + "loss": 0.0488, + "step": 3344 + }, + { + "epoch": 7.716262975778546, + "grad_norm": 0.0, + "learning_rate": 1.034031253249792e-06, + "loss": 0.072, + "step": 3345 + }, + { + "epoch": 7.718569780853517, + "grad_norm": 0.0, + "learning_rate": 1.0303538028130678e-06, + "loss": 0.0641, + "step": 3346 + }, + { + "epoch": 7.720876585928489, + "grad_norm": 0.0, + "learning_rate": 1.026682548091361e-06, + "loss": 0.0507, + "step": 3347 + }, + { + "epoch": 7.72318339100346, + "grad_norm": 0.0, + "learning_rate": 1.0230174916205681e-06, + "loss": 0.088, + "step": 3348 + }, + { + "epoch": 7.7254901960784315, + "grad_norm": 0.0, + "learning_rate": 1.0193586359322927e-06, + "loss": 0.047, + "step": 3349 + }, + { + "epoch": 7.7277970011534025, + "grad_norm": 0.0, + "learning_rate": 1.0157059835538662e-06, + "loss": 0.0644, + "step": 3350 + }, + { + "epoch": 7.730103806228374, + "grad_norm": 0.0, + "learning_rate": 1.012059537008332e-06, + "loss": 0.0801, + "step": 3351 + }, + { + "epoch": 7.732410611303345, + "grad_norm": 0.0, + "learning_rate": 1.0084192988144392e-06, + "loss": 0.0698, + "step": 3352 + }, + { + "epoch": 7.734717416378316, + "grad_norm": 0.0, + "learning_rate": 1.0047852714866591e-06, + "loss": 0.0747, + "step": 3353 + }, + { + "epoch": 7.737024221453288, + "grad_norm": 0.0, + "learning_rate": 1.0011574575351702e-06, + "loss": 0.0997, + "step": 3354 + }, + { + "epoch": 7.739331026528259, + "grad_norm": 0.0, + "learning_rate": 9.975358594658524e-07, + "loss": 0.0408, + "step": 3355 + }, + { + "epoch": 7.74163783160323, + "grad_norm": 0.0, + "learning_rate": 9.939204797802992e-07, + "loss": 0.0809, + "step": 3356 + }, + { + "epoch": 7.743944636678201, + "grad_norm": 0.0, + "learning_rate": 9.903113209758098e-07, + "loss": 0.0504, + "step": 3357 + }, + { + "epoch": 7.746251441753172, + "grad_norm": 0.0, + "learning_rate": 9.867083855453775e-07, + "loss": 0.1038, + "step": 3358 + }, + { + "epoch": 7.748558246828143, + "grad_norm": 0.0, + "learning_rate": 9.831116759777082e-07, + "loss": 0.0819, + "step": 3359 + }, + { + "epoch": 7.750865051903114, + "grad_norm": 0.0, + "learning_rate": 9.795211947571993e-07, + "loss": 0.0493, + "step": 3360 + }, + { + "epoch": 7.753171856978085, + "grad_norm": 0.0, + "learning_rate": 9.759369443639455e-07, + "loss": 0.0709, + "step": 3361 + }, + { + "epoch": 7.755478662053056, + "grad_norm": 0.0, + "learning_rate": 9.723589272737443e-07, + "loss": 0.0702, + "step": 3362 + }, + { + "epoch": 7.757785467128028, + "grad_norm": 0.0, + "learning_rate": 9.687871459580845e-07, + "loss": 0.044, + "step": 3363 + }, + { + "epoch": 7.760092272202999, + "grad_norm": 0.0, + "learning_rate": 9.652216028841433e-07, + "loss": 0.0334, + "step": 3364 + }, + { + "epoch": 7.76239907727797, + "grad_norm": 0.0, + "learning_rate": 9.616623005147952e-07, + "loss": 0.034, + "step": 3365 + }, + { + "epoch": 7.764705882352941, + "grad_norm": 0.0, + "learning_rate": 9.58109241308599e-07, + "loss": 0.0588, + "step": 3366 + }, + { + "epoch": 7.767012687427912, + "grad_norm": 0.0, + "learning_rate": 9.545624277198085e-07, + "loss": 0.0564, + "step": 3367 + }, + { + "epoch": 7.769319492502883, + "grad_norm": 0.0, + "learning_rate": 9.510218621983524e-07, + "loss": 0.0416, + "step": 3368 + }, + { + "epoch": 7.771626297577855, + "grad_norm": 0.0, + "learning_rate": 9.474875471898526e-07, + "loss": 0.0475, + "step": 3369 + }, + { + "epoch": 7.773933102652826, + "grad_norm": 0.0, + "learning_rate": 9.439594851356093e-07, + "loss": 0.053, + "step": 3370 + }, + { + "epoch": 7.776239907727797, + "grad_norm": 0.0, + "learning_rate": 9.404376784726054e-07, + "loss": 0.0347, + "step": 3371 + }, + { + "epoch": 7.778546712802768, + "grad_norm": 0.0, + "learning_rate": 9.369221296335007e-07, + "loss": 0.0756, + "step": 3372 + }, + { + "epoch": 7.780853517877739, + "grad_norm": 0.0, + "learning_rate": 9.334128410466359e-07, + "loss": 0.0686, + "step": 3373 + }, + { + "epoch": 7.78316032295271, + "grad_norm": 0.0, + "learning_rate": 9.299098151360231e-07, + "loss": 0.0525, + "step": 3374 + }, + { + "epoch": 7.7854671280276815, + "grad_norm": 0.0, + "learning_rate": 9.264130543213512e-07, + "loss": 0.0576, + "step": 3375 + }, + { + "epoch": 7.7877739331026525, + "grad_norm": 0.0, + "learning_rate": 9.229225610179848e-07, + "loss": 0.0756, + "step": 3376 + }, + { + "epoch": 7.790080738177624, + "grad_norm": 0.0, + "learning_rate": 9.194383376369509e-07, + "loss": 0.1163, + "step": 3377 + }, + { + "epoch": 7.7923875432525955, + "grad_norm": 0.0, + "learning_rate": 9.15960386584952e-07, + "loss": 0.0485, + "step": 3378 + }, + { + "epoch": 7.794694348327567, + "grad_norm": 0.0, + "learning_rate": 9.124887102643576e-07, + "loss": 0.0641, + "step": 3379 + }, + { + "epoch": 7.797001153402538, + "grad_norm": 0.0, + "learning_rate": 9.090233110732006e-07, + "loss": 0.0898, + "step": 3380 + }, + { + "epoch": 7.799307958477509, + "grad_norm": 0.0, + "learning_rate": 9.055641914051783e-07, + "loss": 0.1169, + "step": 3381 + }, + { + "epoch": 7.80161476355248, + "grad_norm": 0.0, + "learning_rate": 9.021113536496551e-07, + "loss": 0.0743, + "step": 3382 + }, + { + "epoch": 7.803921568627451, + "grad_norm": 0.0, + "learning_rate": 8.986648001916499e-07, + "loss": 0.0438, + "step": 3383 + }, + { + "epoch": 7.806228373702422, + "grad_norm": 0.0, + "learning_rate": 8.952245334118415e-07, + "loss": 0.0706, + "step": 3384 + }, + { + "epoch": 7.808535178777394, + "grad_norm": 0.0, + "learning_rate": 8.917905556865714e-07, + "loss": 0.0812, + "step": 3385 + }, + { + "epoch": 7.810841983852365, + "grad_norm": 0.0, + "learning_rate": 8.883628693878299e-07, + "loss": 0.0741, + "step": 3386 + }, + { + "epoch": 7.813148788927336, + "grad_norm": 0.0, + "learning_rate": 8.849414768832687e-07, + "loss": 0.0599, + "step": 3387 + }, + { + "epoch": 7.815455594002307, + "grad_norm": 0.0, + "learning_rate": 8.815263805361884e-07, + "loss": 0.0417, + "step": 3388 + }, + { + "epoch": 7.817762399077278, + "grad_norm": 0.0, + "learning_rate": 8.781175827055388e-07, + "loss": 0.0605, + "step": 3389 + }, + { + "epoch": 7.820069204152249, + "grad_norm": 0.0, + "learning_rate": 8.747150857459219e-07, + "loss": 0.0758, + "step": 3390 + }, + { + "epoch": 7.82237600922722, + "grad_norm": 0.0, + "learning_rate": 8.713188920075888e-07, + "loss": 0.0649, + "step": 3391 + }, + { + "epoch": 7.824682814302191, + "grad_norm": 0.0, + "learning_rate": 8.679290038364319e-07, + "loss": 0.0247, + "step": 3392 + }, + { + "epoch": 7.826989619377162, + "grad_norm": 0.0, + "learning_rate": 8.645454235739903e-07, + "loss": 0.0609, + "step": 3393 + }, + { + "epoch": 7.829296424452134, + "grad_norm": 0.0, + "learning_rate": 8.611681535574501e-07, + "loss": 0.0889, + "step": 3394 + }, + { + "epoch": 7.831603229527105, + "grad_norm": 0.0, + "learning_rate": 8.57797196119633e-07, + "loss": 0.0408, + "step": 3395 + }, + { + "epoch": 7.833910034602076, + "grad_norm": 0.0, + "learning_rate": 8.544325535889997e-07, + "loss": 0.0516, + "step": 3396 + }, + { + "epoch": 7.836216839677047, + "grad_norm": 0.0, + "learning_rate": 8.510742282896545e-07, + "loss": 0.0728, + "step": 3397 + }, + { + "epoch": 7.838523644752018, + "grad_norm": 0.0, + "learning_rate": 8.47722222541334e-07, + "loss": 0.0524, + "step": 3398 + }, + { + "epoch": 7.840830449826989, + "grad_norm": 0.0, + "learning_rate": 8.443765386594094e-07, + "loss": 0.0723, + "step": 3399 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 0.0, + "learning_rate": 8.41037178954891e-07, + "loss": 0.0693, + "step": 3400 + }, + { + "epoch": 7.845444059976932, + "grad_norm": 0.0, + "learning_rate": 8.377041457344104e-07, + "loss": 0.0796, + "step": 3401 + }, + { + "epoch": 7.8477508650519034, + "grad_norm": 0.0, + "learning_rate": 8.343774413002382e-07, + "loss": 0.0598, + "step": 3402 + }, + { + "epoch": 7.8500576701268745, + "grad_norm": 0.0, + "learning_rate": 8.310570679502716e-07, + "loss": 0.0557, + "step": 3403 + }, + { + "epoch": 7.8523644752018456, + "grad_norm": 0.0, + "learning_rate": 8.277430279780296e-07, + "loss": 0.0414, + "step": 3404 + }, + { + "epoch": 7.854671280276817, + "grad_norm": 0.0, + "learning_rate": 8.24435323672661e-07, + "loss": 0.0704, + "step": 3405 + }, + { + "epoch": 7.856978085351788, + "grad_norm": 0.0, + "learning_rate": 8.211339573189414e-07, + "loss": 0.0624, + "step": 3406 + }, + { + "epoch": 7.859284890426759, + "grad_norm": 0.0, + "learning_rate": 8.178389311972612e-07, + "loss": 0.0697, + "step": 3407 + }, + { + "epoch": 7.86159169550173, + "grad_norm": 0.0, + "learning_rate": 8.145502475836331e-07, + "loss": 0.0555, + "step": 3408 + }, + { + "epoch": 7.863898500576701, + "grad_norm": 0.0, + "learning_rate": 8.112679087496933e-07, + "loss": 0.0638, + "step": 3409 + }, + { + "epoch": 7.866205305651673, + "grad_norm": 0.0, + "learning_rate": 8.079919169626926e-07, + "loss": 0.0451, + "step": 3410 + }, + { + "epoch": 7.868512110726644, + "grad_norm": 0.0, + "learning_rate": 8.047222744854943e-07, + "loss": 0.0907, + "step": 3411 + }, + { + "epoch": 7.870818915801615, + "grad_norm": 0.0, + "learning_rate": 8.014589835765807e-07, + "loss": 0.0456, + "step": 3412 + }, + { + "epoch": 7.873125720876586, + "grad_norm": 0.0, + "learning_rate": 7.982020464900486e-07, + "loss": 0.088, + "step": 3413 + }, + { + "epoch": 7.875432525951557, + "grad_norm": 0.0, + "learning_rate": 7.949514654755963e-07, + "loss": 0.0493, + "step": 3414 + }, + { + "epoch": 7.877739331026528, + "grad_norm": 0.0, + "learning_rate": 7.917072427785422e-07, + "loss": 0.0596, + "step": 3415 + }, + { + "epoch": 7.880046136101499, + "grad_norm": 0.0, + "learning_rate": 7.884693806398091e-07, + "loss": 0.0937, + "step": 3416 + }, + { + "epoch": 7.882352941176471, + "grad_norm": 0.0, + "learning_rate": 7.852378812959227e-07, + "loss": 0.0563, + "step": 3417 + }, + { + "epoch": 7.884659746251442, + "grad_norm": 0.0, + "learning_rate": 7.820127469790206e-07, + "loss": 0.0735, + "step": 3418 + }, + { + "epoch": 7.886966551326413, + "grad_norm": 0.0, + "learning_rate": 7.787939799168342e-07, + "loss": 0.0663, + "step": 3419 + }, + { + "epoch": 7.889273356401384, + "grad_norm": 0.0, + "learning_rate": 7.755815823327084e-07, + "loss": 0.0578, + "step": 3420 + }, + { + "epoch": 7.891580161476355, + "grad_norm": 0.0, + "learning_rate": 7.723755564455771e-07, + "loss": 0.0408, + "step": 3421 + }, + { + "epoch": 7.893886966551326, + "grad_norm": 0.0, + "learning_rate": 7.69175904469982e-07, + "loss": 0.0551, + "step": 3422 + }, + { + "epoch": 7.896193771626297, + "grad_norm": 0.0, + "learning_rate": 7.659826286160565e-07, + "loss": 0.0756, + "step": 3423 + }, + { + "epoch": 7.898500576701268, + "grad_norm": 0.0, + "learning_rate": 7.627957310895329e-07, + "loss": 0.0715, + "step": 3424 + }, + { + "epoch": 7.900807381776239, + "grad_norm": 0.0, + "learning_rate": 7.596152140917368e-07, + "loss": 0.0782, + "step": 3425 + }, + { + "epoch": 7.903114186851211, + "grad_norm": 0.0, + "learning_rate": 7.564410798195832e-07, + "loss": 0.0404, + "step": 3426 + }, + { + "epoch": 7.905420991926182, + "grad_norm": 0.0, + "learning_rate": 7.532733304655848e-07, + "loss": 0.0682, + "step": 3427 + }, + { + "epoch": 7.9077277970011535, + "grad_norm": 0.0, + "learning_rate": 7.501119682178392e-07, + "loss": 0.0495, + "step": 3428 + }, + { + "epoch": 7.9100346020761245, + "grad_norm": 0.0, + "learning_rate": 7.46956995260033e-07, + "loss": 0.0429, + "step": 3429 + }, + { + "epoch": 7.912341407151096, + "grad_norm": 0.0, + "learning_rate": 7.438084137714408e-07, + "loss": 0.0505, + "step": 3430 + }, + { + "epoch": 7.914648212226067, + "grad_norm": 0.0, + "learning_rate": 7.406662259269193e-07, + "loss": 0.0645, + "step": 3431 + }, + { + "epoch": 7.916955017301038, + "grad_norm": 0.0, + "learning_rate": 7.375304338969135e-07, + "loss": 0.0962, + "step": 3432 + }, + { + "epoch": 7.91926182237601, + "grad_norm": 0.0, + "learning_rate": 7.344010398474455e-07, + "loss": 0.0969, + "step": 3433 + }, + { + "epoch": 7.921568627450981, + "grad_norm": 0.0, + "learning_rate": 7.312780459401226e-07, + "loss": 0.0487, + "step": 3434 + }, + { + "epoch": 7.923875432525952, + "grad_norm": 0.0, + "learning_rate": 7.281614543321269e-07, + "loss": 0.033, + "step": 3435 + }, + { + "epoch": 7.926182237600923, + "grad_norm": 0.0, + "learning_rate": 7.250512671762211e-07, + "loss": 0.0625, + "step": 3436 + }, + { + "epoch": 7.928489042675894, + "grad_norm": 0.0, + "learning_rate": 7.219474866207465e-07, + "loss": 0.0999, + "step": 3437 + }, + { + "epoch": 7.930795847750865, + "grad_norm": 0.0, + "learning_rate": 7.188501148096117e-07, + "loss": 0.0576, + "step": 3438 + }, + { + "epoch": 7.933102652825836, + "grad_norm": 0.0, + "learning_rate": 7.157591538823039e-07, + "loss": 0.0615, + "step": 3439 + }, + { + "epoch": 7.935409457900807, + "grad_norm": 0.0, + "learning_rate": 7.126746059738832e-07, + "loss": 0.0286, + "step": 3440 + }, + { + "epoch": 7.937716262975779, + "grad_norm": 0.0, + "learning_rate": 7.095964732149741e-07, + "loss": 0.0593, + "step": 3441 + }, + { + "epoch": 7.94002306805075, + "grad_norm": 0.0, + "learning_rate": 7.065247577317747e-07, + "loss": 0.0355, + "step": 3442 + }, + { + "epoch": 7.942329873125721, + "grad_norm": 0.0, + "learning_rate": 7.034594616460522e-07, + "loss": 0.0586, + "step": 3443 + }, + { + "epoch": 7.944636678200692, + "grad_norm": 0.0, + "learning_rate": 7.004005870751341e-07, + "loss": 0.0519, + "step": 3444 + }, + { + "epoch": 7.946943483275663, + "grad_norm": 0.0, + "learning_rate": 6.973481361319124e-07, + "loss": 0.0582, + "step": 3445 + }, + { + "epoch": 7.949250288350634, + "grad_norm": 0.0, + "learning_rate": 6.943021109248471e-07, + "loss": 0.0533, + "step": 3446 + }, + { + "epoch": 7.951557093425605, + "grad_norm": 0.0, + "learning_rate": 6.912625135579587e-07, + "loss": 0.0247, + "step": 3447 + }, + { + "epoch": 7.953863898500577, + "grad_norm": 0.0, + "learning_rate": 6.88229346130822e-07, + "loss": 0.0635, + "step": 3448 + }, + { + "epoch": 7.956170703575548, + "grad_norm": 0.0, + "learning_rate": 6.852026107385756e-07, + "loss": 0.0514, + "step": 3449 + }, + { + "epoch": 7.958477508650519, + "grad_norm": 0.0, + "learning_rate": 6.821823094719171e-07, + "loss": 0.0751, + "step": 3450 + }, + { + "epoch": 7.96078431372549, + "grad_norm": 0.0, + "learning_rate": 6.791684444170932e-07, + "loss": 0.0462, + "step": 3451 + }, + { + "epoch": 7.963091118800461, + "grad_norm": 0.0, + "learning_rate": 6.761610176559086e-07, + "loss": 0.0433, + "step": 3452 + }, + { + "epoch": 7.965397923875432, + "grad_norm": 0.0, + "learning_rate": 6.731600312657238e-07, + "loss": 0.049, + "step": 3453 + }, + { + "epoch": 7.9677047289504035, + "grad_norm": 0.0, + "learning_rate": 6.701654873194429e-07, + "loss": 0.0602, + "step": 3454 + }, + { + "epoch": 7.9700115340253745, + "grad_norm": 0.0, + "learning_rate": 6.671773878855281e-07, + "loss": 0.0412, + "step": 3455 + }, + { + "epoch": 7.972318339100346, + "grad_norm": 0.0, + "learning_rate": 6.641957350279838e-07, + "loss": 0.0414, + "step": 3456 + }, + { + "epoch": 7.9746251441753175, + "grad_norm": 0.0, + "learning_rate": 6.612205308063646e-07, + "loss": 0.0854, + "step": 3457 + }, + { + "epoch": 7.976931949250289, + "grad_norm": 0.0, + "learning_rate": 6.582517772757702e-07, + "loss": 0.0382, + "step": 3458 + }, + { + "epoch": 7.97923875432526, + "grad_norm": 0.0, + "learning_rate": 6.552894764868456e-07, + "loss": 0.0701, + "step": 3459 + }, + { + "epoch": 7.981545559400231, + "grad_norm": 0.0, + "learning_rate": 6.523336304857764e-07, + "loss": 0.0775, + "step": 3460 + }, + { + "epoch": 7.983852364475202, + "grad_norm": 0.0, + "learning_rate": 6.493842413142915e-07, + "loss": 0.041, + "step": 3461 + }, + { + "epoch": 7.986159169550173, + "grad_norm": 0.0, + "learning_rate": 6.464413110096601e-07, + "loss": 0.0456, + "step": 3462 + }, + { + "epoch": 7.988465974625144, + "grad_norm": 0.0, + "learning_rate": 6.435048416046863e-07, + "loss": 0.0706, + "step": 3463 + }, + { + "epoch": 7.990772779700116, + "grad_norm": 0.0, + "learning_rate": 6.405748351277152e-07, + "loss": 0.063, + "step": 3464 + }, + { + "epoch": 7.993079584775087, + "grad_norm": 0.0, + "learning_rate": 6.37651293602628e-07, + "loss": 0.0883, + "step": 3465 + }, + { + "epoch": 7.995386389850058, + "grad_norm": 0.0, + "learning_rate": 6.34734219048837e-07, + "loss": 0.0665, + "step": 3466 + }, + { + "epoch": 7.997693194925029, + "grad_norm": 0.0, + "learning_rate": 6.318236134812917e-07, + "loss": 0.0869, + "step": 3467 + }, + { + "epoch": 8.0, + "grad_norm": 0.0, + "learning_rate": 6.28919478910468e-07, + "loss": 0.062, + "step": 3468 + }, + { + "epoch": 8.002306805074971, + "grad_norm": 0.0, + "learning_rate": 6.260218173423749e-07, + "loss": 0.0482, + "step": 3469 + }, + { + "epoch": 8.004613610149942, + "grad_norm": 0.0, + "learning_rate": 6.231306307785523e-07, + "loss": 0.0636, + "step": 3470 + }, + { + "epoch": 8.006920415224913, + "grad_norm": 0.0, + "learning_rate": 6.202459212160638e-07, + "loss": 0.0494, + "step": 3471 + }, + { + "epoch": 8.009227220299884, + "grad_norm": 0.0, + "learning_rate": 6.173676906475012e-07, + "loss": 0.0207, + "step": 3472 + }, + { + "epoch": 8.011534025374855, + "grad_norm": 0.0, + "learning_rate": 6.144959410609785e-07, + "loss": 0.0468, + "step": 3473 + }, + { + "epoch": 8.013840830449826, + "grad_norm": 0.0, + "learning_rate": 6.116306744401391e-07, + "loss": 0.0413, + "step": 3474 + }, + { + "epoch": 8.016147635524797, + "grad_norm": 0.0, + "learning_rate": 6.087718927641406e-07, + "loss": 0.0702, + "step": 3475 + }, + { + "epoch": 8.01845444059977, + "grad_norm": 0.0, + "learning_rate": 6.05919598007666e-07, + "loss": 0.0205, + "step": 3476 + }, + { + "epoch": 8.020761245674741, + "grad_norm": 0.0, + "learning_rate": 6.030737921409169e-07, + "loss": 0.0493, + "step": 3477 + }, + { + "epoch": 8.023068050749712, + "grad_norm": 0.0, + "learning_rate": 6.002344771296098e-07, + "loss": 0.0458, + "step": 3478 + }, + { + "epoch": 8.025374855824683, + "grad_norm": 0.0, + "learning_rate": 5.974016549349837e-07, + "loss": 0.0322, + "step": 3479 + }, + { + "epoch": 8.027681660899654, + "grad_norm": 0.0, + "learning_rate": 5.945753275137844e-07, + "loss": 0.0534, + "step": 3480 + }, + { + "epoch": 8.029988465974625, + "grad_norm": 0.0, + "learning_rate": 5.917554968182803e-07, + "loss": 0.0474, + "step": 3481 + }, + { + "epoch": 8.032295271049597, + "grad_norm": 0.0, + "learning_rate": 5.889421647962456e-07, + "loss": 0.0325, + "step": 3482 + }, + { + "epoch": 8.034602076124568, + "grad_norm": 0.0, + "learning_rate": 5.861353333909692e-07, + "loss": 0.0544, + "step": 3483 + }, + { + "epoch": 8.036908881199539, + "grad_norm": 0.0, + "learning_rate": 5.833350045412478e-07, + "loss": 0.0373, + "step": 3484 + }, + { + "epoch": 8.03921568627451, + "grad_norm": 0.0, + "learning_rate": 5.805411801813865e-07, + "loss": 0.0639, + "step": 3485 + }, + { + "epoch": 8.04152249134948, + "grad_norm": 0.0, + "learning_rate": 5.777538622412005e-07, + "loss": 0.0287, + "step": 3486 + }, + { + "epoch": 8.043829296424452, + "grad_norm": 0.0, + "learning_rate": 5.749730526460073e-07, + "loss": 0.0393, + "step": 3487 + }, + { + "epoch": 8.046136101499423, + "grad_norm": 0.0, + "learning_rate": 5.721987533166307e-07, + "loss": 0.0167, + "step": 3488 + }, + { + "epoch": 8.048442906574394, + "grad_norm": 0.0, + "learning_rate": 5.694309661693942e-07, + "loss": 0.028, + "step": 3489 + }, + { + "epoch": 8.050749711649365, + "grad_norm": 0.0, + "learning_rate": 5.666696931161308e-07, + "loss": 0.0427, + "step": 3490 + }, + { + "epoch": 8.053056516724336, + "grad_norm": 0.0, + "learning_rate": 5.63914936064165e-07, + "loss": 0.0417, + "step": 3491 + }, + { + "epoch": 8.055363321799309, + "grad_norm": 0.0, + "learning_rate": 5.611666969163243e-07, + "loss": 0.0374, + "step": 3492 + }, + { + "epoch": 8.05767012687428, + "grad_norm": 0.0, + "learning_rate": 5.584249775709372e-07, + "loss": 0.0284, + "step": 3493 + }, + { + "epoch": 8.059976931949251, + "grad_norm": 0.0, + "learning_rate": 5.556897799218208e-07, + "loss": 0.0347, + "step": 3494 + }, + { + "epoch": 8.062283737024222, + "grad_norm": 0.0, + "learning_rate": 5.529611058582951e-07, + "loss": 0.0523, + "step": 3495 + }, + { + "epoch": 8.064590542099193, + "grad_norm": 0.0, + "learning_rate": 5.502389572651723e-07, + "loss": 0.0527, + "step": 3496 + }, + { + "epoch": 8.066897347174164, + "grad_norm": 0.0, + "learning_rate": 5.475233360227516e-07, + "loss": 0.0447, + "step": 3497 + }, + { + "epoch": 8.069204152249135, + "grad_norm": 0.0, + "learning_rate": 5.448142440068316e-07, + "loss": 0.0536, + "step": 3498 + }, + { + "epoch": 8.071510957324106, + "grad_norm": 0.0, + "learning_rate": 5.421116830886963e-07, + "loss": 0.0412, + "step": 3499 + }, + { + "epoch": 8.073817762399077, + "grad_norm": 0.0, + "learning_rate": 5.394156551351182e-07, + "loss": 0.0378, + "step": 3500 + }, + { + "epoch": 8.076124567474048, + "grad_norm": 0.0, + "learning_rate": 5.367261620083575e-07, + "loss": 0.0751, + "step": 3501 + }, + { + "epoch": 8.07843137254902, + "grad_norm": 0.0, + "learning_rate": 5.340432055661637e-07, + "loss": 0.0456, + "step": 3502 + }, + { + "epoch": 8.08073817762399, + "grad_norm": 0.0, + "learning_rate": 5.313667876617657e-07, + "loss": 0.0622, + "step": 3503 + }, + { + "epoch": 8.083044982698961, + "grad_norm": 0.0, + "learning_rate": 5.286969101438821e-07, + "loss": 0.0336, + "step": 3504 + }, + { + "epoch": 8.085351787773932, + "grad_norm": 0.0, + "learning_rate": 5.26033574856708e-07, + "loss": 0.0245, + "step": 3505 + }, + { + "epoch": 8.087658592848904, + "grad_norm": 0.0, + "learning_rate": 5.233767836399217e-07, + "loss": 0.0365, + "step": 3506 + }, + { + "epoch": 8.089965397923875, + "grad_norm": 0.0, + "learning_rate": 5.207265383286831e-07, + "loss": 0.0685, + "step": 3507 + }, + { + "epoch": 8.092272202998847, + "grad_norm": 0.0, + "learning_rate": 5.180828407536287e-07, + "loss": 0.1192, + "step": 3508 + }, + { + "epoch": 8.094579008073818, + "grad_norm": 0.0, + "learning_rate": 5.154456927408713e-07, + "loss": 0.0367, + "step": 3509 + }, + { + "epoch": 8.09688581314879, + "grad_norm": 0.0, + "learning_rate": 5.128150961120026e-07, + "loss": 0.0362, + "step": 3510 + }, + { + "epoch": 8.09919261822376, + "grad_norm": 0.0, + "learning_rate": 5.101910526840869e-07, + "loss": 0.0588, + "step": 3511 + }, + { + "epoch": 8.101499423298732, + "grad_norm": 0.0, + "learning_rate": 5.075735642696611e-07, + "loss": 0.021, + "step": 3512 + }, + { + "epoch": 8.103806228373703, + "grad_norm": 0.0, + "learning_rate": 5.049626326767366e-07, + "loss": 0.0482, + "step": 3513 + }, + { + "epoch": 8.106113033448674, + "grad_norm": 0.0, + "learning_rate": 5.02358259708795e-07, + "loss": 0.0466, + "step": 3514 + }, + { + "epoch": 8.108419838523645, + "grad_norm": 0.0, + "learning_rate": 4.997604471647844e-07, + "loss": 0.0267, + "step": 3515 + }, + { + "epoch": 8.110726643598616, + "grad_norm": 0.0, + "learning_rate": 4.97169196839129e-07, + "loss": 0.0535, + "step": 3516 + }, + { + "epoch": 8.113033448673587, + "grad_norm": 0.0, + "learning_rate": 4.945845105217118e-07, + "loss": 0.07, + "step": 3517 + }, + { + "epoch": 8.115340253748558, + "grad_norm": 0.0, + "learning_rate": 4.920063899978833e-07, + "loss": 0.0469, + "step": 3518 + }, + { + "epoch": 8.117647058823529, + "grad_norm": 0.0, + "learning_rate": 4.894348370484648e-07, + "loss": 0.0617, + "step": 3519 + }, + { + "epoch": 8.1199538638985, + "grad_norm": 0.0, + "learning_rate": 4.868698534497362e-07, + "loss": 0.0584, + "step": 3520 + }, + { + "epoch": 8.122260668973471, + "grad_norm": 0.0, + "learning_rate": 4.843114409734384e-07, + "loss": 0.0244, + "step": 3521 + }, + { + "epoch": 8.124567474048442, + "grad_norm": 0.0, + "learning_rate": 4.817596013867765e-07, + "loss": 0.032, + "step": 3522 + }, + { + "epoch": 8.126874279123413, + "grad_norm": 0.0, + "learning_rate": 4.792143364524138e-07, + "loss": 0.0345, + "step": 3523 + }, + { + "epoch": 8.129181084198386, + "grad_norm": 0.0, + "learning_rate": 4.766756479284751e-07, + "loss": 0.0448, + "step": 3524 + }, + { + "epoch": 8.131487889273357, + "grad_norm": 0.0, + "learning_rate": 4.7414353756853773e-07, + "loss": 0.0547, + "step": 3525 + }, + { + "epoch": 8.133794694348328, + "grad_norm": 0.0, + "learning_rate": 4.7161800712163807e-07, + "loss": 0.0548, + "step": 3526 + }, + { + "epoch": 8.1361014994233, + "grad_norm": 0.0, + "learning_rate": 4.6909905833226965e-07, + "loss": 0.0439, + "step": 3527 + }, + { + "epoch": 8.13840830449827, + "grad_norm": 0.0, + "learning_rate": 4.6658669294037393e-07, + "loss": 0.0415, + "step": 3528 + }, + { + "epoch": 8.140715109573241, + "grad_norm": 0.0, + "learning_rate": 4.6408091268134836e-07, + "loss": 0.0361, + "step": 3529 + }, + { + "epoch": 8.143021914648212, + "grad_norm": 0.0, + "learning_rate": 4.61581719286045e-07, + "loss": 0.0352, + "step": 3530 + }, + { + "epoch": 8.145328719723183, + "grad_norm": 0.0, + "learning_rate": 4.5908911448075746e-07, + "loss": 0.0349, + "step": 3531 + }, + { + "epoch": 8.147635524798154, + "grad_norm": 0.0, + "learning_rate": 4.566030999872384e-07, + "loss": 0.0402, + "step": 3532 + }, + { + "epoch": 8.149942329873126, + "grad_norm": 0.0, + "learning_rate": 4.5412367752268094e-07, + "loss": 0.0494, + "step": 3533 + }, + { + "epoch": 8.152249134948097, + "grad_norm": 0.0, + "learning_rate": 4.5165084879972844e-07, + "loss": 0.0373, + "step": 3534 + }, + { + "epoch": 8.154555940023068, + "grad_norm": 0.0, + "learning_rate": 4.491846155264667e-07, + "loss": 0.0398, + "step": 3535 + }, + { + "epoch": 8.156862745098039, + "grad_norm": 0.0, + "learning_rate": 4.46724979406431e-07, + "loss": 0.0559, + "step": 3536 + }, + { + "epoch": 8.15916955017301, + "grad_norm": 0.0, + "learning_rate": 4.4427194213859216e-07, + "loss": 0.053, + "step": 3537 + }, + { + "epoch": 8.16147635524798, + "grad_norm": 0.0, + "learning_rate": 4.4182550541737033e-07, + "loss": 0.0473, + "step": 3538 + }, + { + "epoch": 8.163783160322954, + "grad_norm": 0.0, + "learning_rate": 4.3938567093262275e-07, + "loss": 0.0495, + "step": 3539 + }, + { + "epoch": 8.166089965397925, + "grad_norm": 0.0, + "learning_rate": 4.3695244036964567e-07, + "loss": 0.0441, + "step": 3540 + }, + { + "epoch": 8.168396770472896, + "grad_norm": 0.0, + "learning_rate": 4.345258154091747e-07, + "loss": 0.0558, + "step": 3541 + }, + { + "epoch": 8.170703575547867, + "grad_norm": 0.0, + "learning_rate": 4.3210579772738237e-07, + "loss": 0.0543, + "step": 3542 + }, + { + "epoch": 8.173010380622838, + "grad_norm": 0.0, + "learning_rate": 4.296923889958771e-07, + "loss": 0.0341, + "step": 3543 + }, + { + "epoch": 8.175317185697809, + "grad_norm": 0.0, + "learning_rate": 4.272855908817042e-07, + "loss": 0.0347, + "step": 3544 + }, + { + "epoch": 8.17762399077278, + "grad_norm": 0.0, + "learning_rate": 4.2488540504734056e-07, + "loss": 0.0309, + "step": 3545 + }, + { + "epoch": 8.179930795847751, + "grad_norm": 0.0, + "learning_rate": 4.224918331506955e-07, + "loss": 0.0706, + "step": 3546 + }, + { + "epoch": 8.182237600922722, + "grad_norm": 0.0, + "learning_rate": 4.2010487684511105e-07, + "loss": 0.0663, + "step": 3547 + }, + { + "epoch": 8.184544405997693, + "grad_norm": 0.0, + "learning_rate": 4.177245377793604e-07, + "loss": 0.0244, + "step": 3548 + }, + { + "epoch": 8.186851211072664, + "grad_norm": 0.0, + "learning_rate": 4.1535081759764286e-07, + "loss": 0.0535, + "step": 3549 + }, + { + "epoch": 8.189158016147635, + "grad_norm": 0.0, + "learning_rate": 4.12983717939589e-07, + "loss": 0.0504, + "step": 3550 + }, + { + "epoch": 8.191464821222606, + "grad_norm": 0.0, + "learning_rate": 4.106232404402544e-07, + "loss": 0.0318, + "step": 3551 + }, + { + "epoch": 8.193771626297577, + "grad_norm": 0.0, + "learning_rate": 4.082693867301224e-07, + "loss": 0.0486, + "step": 3552 + }, + { + "epoch": 8.196078431372548, + "grad_norm": 0.0, + "learning_rate": 4.0592215843509585e-07, + "loss": 0.0426, + "step": 3553 + }, + { + "epoch": 8.19838523644752, + "grad_norm": 0.0, + "learning_rate": 4.035815571765089e-07, + "loss": 0.0453, + "step": 3554 + }, + { + "epoch": 8.200692041522492, + "grad_norm": 0.0, + "learning_rate": 4.012475845711106e-07, + "loss": 0.0419, + "step": 3555 + }, + { + "epoch": 8.202998846597463, + "grad_norm": 0.0, + "learning_rate": 3.9892024223107673e-07, + "loss": 0.0507, + "step": 3556 + }, + { + "epoch": 8.205305651672434, + "grad_norm": 0.0, + "learning_rate": 3.965995317640026e-07, + "loss": 0.0784, + "step": 3557 + }, + { + "epoch": 8.207612456747405, + "grad_norm": 0.0, + "learning_rate": 3.9428545477289913e-07, + "loss": 0.0807, + "step": 3558 + }, + { + "epoch": 8.209919261822376, + "grad_norm": 0.0, + "learning_rate": 3.919780128561979e-07, + "loss": 0.0427, + "step": 3559 + }, + { + "epoch": 8.212226066897347, + "grad_norm": 0.0, + "learning_rate": 3.8967720760774816e-07, + "loss": 0.0362, + "step": 3560 + }, + { + "epoch": 8.214532871972319, + "grad_norm": 0.0, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.0414, + "step": 3561 + }, + { + "epoch": 8.21683967704729, + "grad_norm": 0.0, + "learning_rate": 3.850955134680678e-07, + "loss": 0.0578, + "step": 3562 + }, + { + "epoch": 8.21914648212226, + "grad_norm": 0.0, + "learning_rate": 3.8281462774161004e-07, + "loss": 0.0481, + "step": 3563 + }, + { + "epoch": 8.221453287197232, + "grad_norm": 0.0, + "learning_rate": 3.8054038501294077e-07, + "loss": 0.0385, + "step": 3564 + }, + { + "epoch": 8.223760092272203, + "grad_norm": 0.0, + "learning_rate": 3.7827278685297785e-07, + "loss": 0.0376, + "step": 3565 + }, + { + "epoch": 8.226066897347174, + "grad_norm": 0.0, + "learning_rate": 3.7601183482804504e-07, + "loss": 0.0359, + "step": 3566 + }, + { + "epoch": 8.228373702422145, + "grad_norm": 0.0, + "learning_rate": 3.7375753049987974e-07, + "loss": 0.0836, + "step": 3567 + }, + { + "epoch": 8.230680507497116, + "grad_norm": 0.0, + "learning_rate": 3.715098754256241e-07, + "loss": 0.0459, + "step": 3568 + }, + { + "epoch": 8.232987312572087, + "grad_norm": 0.0, + "learning_rate": 3.692688711578296e-07, + "loss": 0.0712, + "step": 3569 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 0.0, + "learning_rate": 3.6703451924445467e-07, + "loss": 0.0495, + "step": 3570 + }, + { + "epoch": 8.23760092272203, + "grad_norm": 0.0, + "learning_rate": 3.6480682122885804e-07, + "loss": 0.0249, + "step": 3571 + }, + { + "epoch": 8.239907727797002, + "grad_norm": 0.0, + "learning_rate": 3.625857786498055e-07, + "loss": 0.0351, + "step": 3572 + }, + { + "epoch": 8.242214532871973, + "grad_norm": 0.0, + "learning_rate": 3.603713930414676e-07, + "loss": 0.0366, + "step": 3573 + }, + { + "epoch": 8.244521337946944, + "grad_norm": 0.0, + "learning_rate": 3.58163665933412e-07, + "loss": 0.0524, + "step": 3574 + }, + { + "epoch": 8.246828143021915, + "grad_norm": 0.0, + "learning_rate": 3.55962598850611e-07, + "loss": 0.0403, + "step": 3575 + }, + { + "epoch": 8.249134948096886, + "grad_norm": 0.0, + "learning_rate": 3.5376819331343404e-07, + "loss": 0.0553, + "step": 3576 + }, + { + "epoch": 8.251441753171857, + "grad_norm": 0.0, + "learning_rate": 3.515804508376508e-07, + "loss": 0.0668, + "step": 3577 + }, + { + "epoch": 8.253748558246828, + "grad_norm": 0.0, + "learning_rate": 3.4939937293442694e-07, + "loss": 0.064, + "step": 3578 + }, + { + "epoch": 8.2560553633218, + "grad_norm": 0.0, + "learning_rate": 3.472249611103273e-07, + "loss": 0.0678, + "step": 3579 + }, + { + "epoch": 8.25836216839677, + "grad_norm": 0.0, + "learning_rate": 3.450572168673072e-07, + "loss": 0.0779, + "step": 3580 + }, + { + "epoch": 8.260668973471741, + "grad_norm": 0.0, + "learning_rate": 3.428961417027221e-07, + "loss": 0.0559, + "step": 3581 + }, + { + "epoch": 8.262975778546712, + "grad_norm": 0.0, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.0407, + "step": 3582 + }, + { + "epoch": 8.265282583621683, + "grad_norm": 0.0, + "learning_rate": 3.385940045752323e-07, + "loss": 0.0753, + "step": 3583 + }, + { + "epoch": 8.267589388696654, + "grad_norm": 0.0, + "learning_rate": 3.3645294558399487e-07, + "loss": 0.0595, + "step": 3584 + }, + { + "epoch": 8.269896193771626, + "grad_norm": 0.0, + "learning_rate": 3.3431856161452835e-07, + "loss": 0.0238, + "step": 3585 + }, + { + "epoch": 8.272202998846598, + "grad_norm": 0.0, + "learning_rate": 3.3219085414114003e-07, + "loss": 0.0222, + "step": 3586 + }, + { + "epoch": 8.27450980392157, + "grad_norm": 0.0, + "learning_rate": 3.3006982463352764e-07, + "loss": 0.0723, + "step": 3587 + }, + { + "epoch": 8.27681660899654, + "grad_norm": 0.0, + "learning_rate": 3.2795547455677813e-07, + "loss": 0.0424, + "step": 3588 + }, + { + "epoch": 8.279123414071512, + "grad_norm": 0.0, + "learning_rate": 3.2584780537136206e-07, + "loss": 0.075, + "step": 3589 + }, + { + "epoch": 8.281430219146483, + "grad_norm": 0.0, + "learning_rate": 3.237468185331327e-07, + "loss": 0.0207, + "step": 3590 + }, + { + "epoch": 8.283737024221454, + "grad_norm": 0.0, + "learning_rate": 3.2165251549333585e-07, + "loss": 0.0536, + "step": 3591 + }, + { + "epoch": 8.286043829296425, + "grad_norm": 0.0, + "learning_rate": 3.1956489769859213e-07, + "loss": 0.0461, + "step": 3592 + }, + { + "epoch": 8.288350634371396, + "grad_norm": 0.0, + "learning_rate": 3.1748396659090797e-07, + "loss": 0.0496, + "step": 3593 + }, + { + "epoch": 8.290657439446367, + "grad_norm": 0.0, + "learning_rate": 3.1540972360767254e-07, + "loss": 0.0602, + "step": 3594 + }, + { + "epoch": 8.292964244521338, + "grad_norm": 0.0, + "learning_rate": 3.1334217018165194e-07, + "loss": 0.0399, + "step": 3595 + }, + { + "epoch": 8.295271049596309, + "grad_norm": 0.0, + "learning_rate": 3.112813077409926e-07, + "loss": 0.0379, + "step": 3596 + }, + { + "epoch": 8.29757785467128, + "grad_norm": 0.0, + "learning_rate": 3.0922713770922155e-07, + "loss": 0.0432, + "step": 3597 + }, + { + "epoch": 8.299884659746251, + "grad_norm": 0.0, + "learning_rate": 3.07179661505238e-07, + "loss": 0.0528, + "step": 3598 + }, + { + "epoch": 8.302191464821222, + "grad_norm": 0.0, + "learning_rate": 3.051388805433231e-07, + "loss": 0.0486, + "step": 3599 + }, + { + "epoch": 8.304498269896193, + "grad_norm": 0.0, + "learning_rate": 3.0310479623313125e-07, + "loss": 0.0549, + "step": 3600 + }, + { + "epoch": 8.306805074971164, + "grad_norm": 0.0, + "learning_rate": 3.010774099796898e-07, + "loss": 0.0388, + "step": 3601 + }, + { + "epoch": 8.309111880046135, + "grad_norm": 0.0, + "learning_rate": 2.9905672318339963e-07, + "loss": 0.0413, + "step": 3602 + }, + { + "epoch": 8.311418685121108, + "grad_norm": 0.0, + "learning_rate": 2.970427372400353e-07, + "loss": 0.0446, + "step": 3603 + }, + { + "epoch": 8.313725490196079, + "grad_norm": 0.0, + "learning_rate": 2.950354535407429e-07, + "loss": 0.0374, + "step": 3604 + }, + { + "epoch": 8.31603229527105, + "grad_norm": 0.0, + "learning_rate": 2.930348734720379e-07, + "loss": 0.0554, + "step": 3605 + }, + { + "epoch": 8.318339100346021, + "grad_norm": 0.0, + "learning_rate": 2.910409984158058e-07, + "loss": 0.0817, + "step": 3606 + }, + { + "epoch": 8.320645905420992, + "grad_norm": 0.0, + "learning_rate": 2.8905382974930173e-07, + "loss": 0.039, + "step": 3607 + }, + { + "epoch": 8.322952710495963, + "grad_norm": 0.0, + "learning_rate": 2.8707336884514436e-07, + "loss": 0.0512, + "step": 3608 + }, + { + "epoch": 8.325259515570934, + "grad_norm": 0.0, + "learning_rate": 2.8509961707132496e-07, + "loss": 0.0671, + "step": 3609 + }, + { + "epoch": 8.327566320645905, + "grad_norm": 0.0, + "learning_rate": 2.831325757911985e-07, + "loss": 0.0655, + "step": 3610 + }, + { + "epoch": 8.329873125720876, + "grad_norm": 0.0, + "learning_rate": 2.8117224636347917e-07, + "loss": 0.0403, + "step": 3611 + }, + { + "epoch": 8.332179930795848, + "grad_norm": 0.0, + "learning_rate": 2.7921863014225504e-07, + "loss": 0.0571, + "step": 3612 + }, + { + "epoch": 8.334486735870819, + "grad_norm": 0.0, + "learning_rate": 2.772717284769677e-07, + "loss": 0.0557, + "step": 3613 + }, + { + "epoch": 8.33679354094579, + "grad_norm": 0.0, + "learning_rate": 2.753315427124259e-07, + "loss": 0.0535, + "step": 3614 + }, + { + "epoch": 8.33910034602076, + "grad_norm": 0.0, + "learning_rate": 2.733980741887987e-07, + "loss": 0.0681, + "step": 3615 + }, + { + "epoch": 8.341407151095732, + "grad_norm": 0.0, + "learning_rate": 2.714713242416156e-07, + "loss": 0.0559, + "step": 3616 + }, + { + "epoch": 8.343713956170703, + "grad_norm": 0.0, + "learning_rate": 2.6955129420176193e-07, + "loss": 0.0557, + "step": 3617 + }, + { + "epoch": 8.346020761245676, + "grad_norm": 0.0, + "learning_rate": 2.676379853954858e-07, + "loss": 0.0493, + "step": 3618 + }, + { + "epoch": 8.348327566320647, + "grad_norm": 0.0, + "learning_rate": 2.6573139914439104e-07, + "loss": 0.0439, + "step": 3619 + }, + { + "epoch": 8.350634371395618, + "grad_norm": 0.0, + "learning_rate": 2.6383153676543537e-07, + "loss": 0.0433, + "step": 3620 + }, + { + "epoch": 8.352941176470589, + "grad_norm": 0.0, + "learning_rate": 2.6193839957093683e-07, + "loss": 0.0524, + "step": 3621 + }, + { + "epoch": 8.35524798154556, + "grad_norm": 0.0, + "learning_rate": 2.6005198886856486e-07, + "loss": 0.0676, + "step": 3622 + }, + { + "epoch": 8.35755478662053, + "grad_norm": 0.0, + "learning_rate": 2.581723059613428e-07, + "loss": 0.0551, + "step": 3623 + }, + { + "epoch": 8.359861591695502, + "grad_norm": 0.0, + "learning_rate": 2.5629935214764866e-07, + "loss": 0.0528, + "step": 3624 + }, + { + "epoch": 8.362168396770473, + "grad_norm": 0.0, + "learning_rate": 2.5443312872120763e-07, + "loss": 0.0622, + "step": 3625 + }, + { + "epoch": 8.364475201845444, + "grad_norm": 0.0, + "learning_rate": 2.5257363697110406e-07, + "loss": 0.0417, + "step": 3626 + }, + { + "epoch": 8.366782006920415, + "grad_norm": 0.0, + "learning_rate": 2.507208781817638e-07, + "loss": 0.0681, + "step": 3627 + }, + { + "epoch": 8.369088811995386, + "grad_norm": 0.0, + "learning_rate": 2.4887485363296883e-07, + "loss": 0.0397, + "step": 3628 + }, + { + "epoch": 8.371395617070357, + "grad_norm": 0.0, + "learning_rate": 2.4703556459984456e-07, + "loss": 0.0186, + "step": 3629 + }, + { + "epoch": 8.373702422145328, + "grad_norm": 0.0, + "learning_rate": 2.45203012352867e-07, + "loss": 0.0466, + "step": 3630 + }, + { + "epoch": 8.3760092272203, + "grad_norm": 0.0, + "learning_rate": 2.433771981578581e-07, + "loss": 0.0663, + "step": 3631 + }, + { + "epoch": 8.37831603229527, + "grad_norm": 0.0, + "learning_rate": 2.4155812327598337e-07, + "loss": 0.0394, + "step": 3632 + }, + { + "epoch": 8.380622837370241, + "grad_norm": 0.0, + "learning_rate": 2.3974578896375555e-07, + "loss": 0.052, + "step": 3633 + }, + { + "epoch": 8.382929642445214, + "grad_norm": 0.0, + "learning_rate": 2.3794019647303325e-07, + "loss": 0.0452, + "step": 3634 + }, + { + "epoch": 8.385236447520185, + "grad_norm": 0.0, + "learning_rate": 2.361413470510121e-07, + "loss": 0.0378, + "step": 3635 + }, + { + "epoch": 8.387543252595156, + "grad_norm": 0.0, + "learning_rate": 2.3434924194023712e-07, + "loss": 0.0776, + "step": 3636 + }, + { + "epoch": 8.389850057670127, + "grad_norm": 0.0, + "learning_rate": 2.3256388237858806e-07, + "loss": 0.0532, + "step": 3637 + }, + { + "epoch": 8.392156862745098, + "grad_norm": 0.0, + "learning_rate": 2.307852695992907e-07, + "loss": 0.0653, + "step": 3638 + }, + { + "epoch": 8.39446366782007, + "grad_norm": 0.0, + "learning_rate": 2.2901340483090785e-07, + "loss": 0.0465, + "step": 3639 + }, + { + "epoch": 8.39677047289504, + "grad_norm": 0.0, + "learning_rate": 2.2724828929734156e-07, + "loss": 0.0754, + "step": 3640 + }, + { + "epoch": 8.399077277970012, + "grad_norm": 0.0, + "learning_rate": 2.25489924217831e-07, + "loss": 0.0579, + "step": 3641 + }, + { + "epoch": 8.401384083044983, + "grad_norm": 0.0, + "learning_rate": 2.2373831080695463e-07, + "loss": 0.0261, + "step": 3642 + }, + { + "epoch": 8.403690888119954, + "grad_norm": 0.0, + "learning_rate": 2.2199345027462572e-07, + "loss": 0.0806, + "step": 3643 + }, + { + "epoch": 8.405997693194925, + "grad_norm": 0.0, + "learning_rate": 2.202553438260946e-07, + "loss": 0.0538, + "step": 3644 + }, + { + "epoch": 8.408304498269896, + "grad_norm": 0.0, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.0474, + "step": 3645 + }, + { + "epoch": 8.410611303344867, + "grad_norm": 0.0, + "learning_rate": 2.1679939797809024e-07, + "loss": 0.0601, + "step": 3646 + }, + { + "epoch": 8.412918108419838, + "grad_norm": 0.0, + "learning_rate": 2.1508156096578748e-07, + "loss": 0.0686, + "step": 3647 + }, + { + "epoch": 8.415224913494809, + "grad_norm": 0.0, + "learning_rate": 2.1337048281161565e-07, + "loss": 0.059, + "step": 3648 + }, + { + "epoch": 8.41753171856978, + "grad_norm": 0.0, + "learning_rate": 2.1166616469749047e-07, + "loss": 0.0093, + "step": 3649 + }, + { + "epoch": 8.419838523644753, + "grad_norm": 0.0, + "learning_rate": 2.0996860780065575e-07, + "loss": 0.0509, + "step": 3650 + }, + { + "epoch": 8.422145328719724, + "grad_norm": 0.0, + "learning_rate": 2.082778132936858e-07, + "loss": 0.0346, + "step": 3651 + }, + { + "epoch": 8.424452133794695, + "grad_norm": 0.0, + "learning_rate": 2.0659378234448524e-07, + "loss": 0.0575, + "step": 3652 + }, + { + "epoch": 8.426758938869666, + "grad_norm": 0.0, + "learning_rate": 2.0491651611628582e-07, + "loss": 0.049, + "step": 3653 + }, + { + "epoch": 8.429065743944637, + "grad_norm": 0.0, + "learning_rate": 2.0324601576764525e-07, + "loss": 0.0489, + "step": 3654 + }, + { + "epoch": 8.431372549019608, + "grad_norm": 0.0, + "learning_rate": 2.0158228245244826e-07, + "loss": 0.0622, + "step": 3655 + }, + { + "epoch": 8.43367935409458, + "grad_norm": 0.0, + "learning_rate": 1.9992531731991005e-07, + "loss": 0.0422, + "step": 3656 + }, + { + "epoch": 8.43598615916955, + "grad_norm": 0.0, + "learning_rate": 1.9827512151456175e-07, + "loss": 0.0513, + "step": 3657 + }, + { + "epoch": 8.438292964244521, + "grad_norm": 0.0, + "learning_rate": 1.96631696176266e-07, + "loss": 0.0459, + "step": 3658 + }, + { + "epoch": 8.440599769319492, + "grad_norm": 0.0, + "learning_rate": 1.9499504244020694e-07, + "loss": 0.0429, + "step": 3659 + }, + { + "epoch": 8.442906574394463, + "grad_norm": 0.0, + "learning_rate": 1.933651614368892e-07, + "loss": 0.0404, + "step": 3660 + }, + { + "epoch": 8.445213379469434, + "grad_norm": 0.0, + "learning_rate": 1.917420542921433e-07, + "loss": 0.0518, + "step": 3661 + }, + { + "epoch": 8.447520184544405, + "grad_norm": 0.0, + "learning_rate": 1.9012572212711467e-07, + "loss": 0.0562, + "step": 3662 + }, + { + "epoch": 8.449826989619377, + "grad_norm": 0.0, + "learning_rate": 1.885161660582746e-07, + "loss": 0.037, + "step": 3663 + }, + { + "epoch": 8.452133794694348, + "grad_norm": 0.0, + "learning_rate": 1.8691338719741048e-07, + "loss": 0.03, + "step": 3664 + }, + { + "epoch": 8.45444059976932, + "grad_norm": 0.0, + "learning_rate": 1.8531738665163112e-07, + "loss": 0.0562, + "step": 3665 + }, + { + "epoch": 8.456747404844291, + "grad_norm": 0.0, + "learning_rate": 1.8372816552336025e-07, + "loss": 0.0785, + "step": 3666 + }, + { + "epoch": 8.459054209919262, + "grad_norm": 0.0, + "learning_rate": 1.82145724910342e-07, + "loss": 0.0763, + "step": 3667 + }, + { + "epoch": 8.461361014994234, + "grad_norm": 0.0, + "learning_rate": 1.8057006590563419e-07, + "loss": 0.0488, + "step": 3668 + }, + { + "epoch": 8.463667820069205, + "grad_norm": 0.0, + "learning_rate": 1.7900118959761181e-07, + "loss": 0.0448, + "step": 3669 + }, + { + "epoch": 8.465974625144176, + "grad_norm": 0.0, + "learning_rate": 1.7743909706996242e-07, + "loss": 0.0847, + "step": 3670 + }, + { + "epoch": 8.468281430219147, + "grad_norm": 0.0, + "learning_rate": 1.7588378940169293e-07, + "loss": 0.0688, + "step": 3671 + }, + { + "epoch": 8.470588235294118, + "grad_norm": 0.0, + "learning_rate": 1.7433526766711727e-07, + "loss": 0.0592, + "step": 3672 + }, + { + "epoch": 8.472895040369089, + "grad_norm": 0.0, + "learning_rate": 1.7279353293586765e-07, + "loss": 0.0327, + "step": 3673 + }, + { + "epoch": 8.47520184544406, + "grad_norm": 0.0, + "learning_rate": 1.7125858627288328e-07, + "loss": 0.041, + "step": 3674 + }, + { + "epoch": 8.477508650519031, + "grad_norm": 0.0, + "learning_rate": 1.6973042873841827e-07, + "loss": 0.0528, + "step": 3675 + }, + { + "epoch": 8.479815455594002, + "grad_norm": 0.0, + "learning_rate": 1.6820906138803384e-07, + "loss": 0.035, + "step": 3676 + }, + { + "epoch": 8.482122260668973, + "grad_norm": 0.0, + "learning_rate": 1.6669448527260602e-07, + "loss": 0.0282, + "step": 3677 + }, + { + "epoch": 8.484429065743944, + "grad_norm": 0.0, + "learning_rate": 1.651867014383146e-07, + "loss": 0.0589, + "step": 3678 + }, + { + "epoch": 8.486735870818915, + "grad_norm": 0.0, + "learning_rate": 1.6368571092665098e-07, + "loss": 0.0452, + "step": 3679 + }, + { + "epoch": 8.489042675893886, + "grad_norm": 0.0, + "learning_rate": 1.6219151477441243e-07, + "loss": 0.0537, + "step": 3680 + }, + { + "epoch": 8.491349480968857, + "grad_norm": 0.0, + "learning_rate": 1.6070411401370335e-07, + "loss": 0.0574, + "step": 3681 + }, + { + "epoch": 8.49365628604383, + "grad_norm": 0.0, + "learning_rate": 1.5922350967193524e-07, + "loss": 0.0508, + "step": 3682 + }, + { + "epoch": 8.495963091118801, + "grad_norm": 0.0, + "learning_rate": 1.5774970277182333e-07, + "loss": 0.0788, + "step": 3683 + }, + { + "epoch": 8.498269896193772, + "grad_norm": 0.0, + "learning_rate": 1.5628269433139e-07, + "loss": 0.0739, + "step": 3684 + }, + { + "epoch": 8.500576701268743, + "grad_norm": 0.0, + "learning_rate": 1.5482248536395904e-07, + "loss": 0.0577, + "step": 3685 + }, + { + "epoch": 8.502883506343714, + "grad_norm": 0.0, + "learning_rate": 1.5336907687815817e-07, + "loss": 0.0515, + "step": 3686 + }, + { + "epoch": 8.505190311418685, + "grad_norm": 0.0, + "learning_rate": 1.519224698779198e-07, + "loss": 0.042, + "step": 3687 + }, + { + "epoch": 8.507497116493656, + "grad_norm": 0.0, + "learning_rate": 1.504826653624758e-07, + "loss": 0.0478, + "step": 3688 + }, + { + "epoch": 8.509803921568627, + "grad_norm": 0.0, + "learning_rate": 1.4904966432635947e-07, + "loss": 0.0551, + "step": 3689 + }, + { + "epoch": 8.512110726643598, + "grad_norm": 0.0, + "learning_rate": 1.4762346775940794e-07, + "loss": 0.0471, + "step": 3690 + }, + { + "epoch": 8.51441753171857, + "grad_norm": 0.0, + "learning_rate": 1.4620407664675319e-07, + "loss": 0.0351, + "step": 3691 + }, + { + "epoch": 8.51672433679354, + "grad_norm": 0.0, + "learning_rate": 1.447914919688298e-07, + "loss": 0.0439, + "step": 3692 + }, + { + "epoch": 8.519031141868512, + "grad_norm": 0.0, + "learning_rate": 1.4338571470137063e-07, + "loss": 0.0632, + "step": 3693 + }, + { + "epoch": 8.521337946943483, + "grad_norm": 0.0, + "learning_rate": 1.419867458154034e-07, + "loss": 0.047, + "step": 3694 + }, + { + "epoch": 8.523644752018454, + "grad_norm": 0.0, + "learning_rate": 1.405945862772573e-07, + "loss": 0.082, + "step": 3695 + }, + { + "epoch": 8.525951557093425, + "grad_norm": 0.0, + "learning_rate": 1.3920923704855648e-07, + "loss": 0.0179, + "step": 3696 + }, + { + "epoch": 8.528258362168398, + "grad_norm": 0.0, + "learning_rate": 1.3783069908621772e-07, + "loss": 0.0698, + "step": 3697 + }, + { + "epoch": 8.530565167243369, + "grad_norm": 0.0, + "learning_rate": 1.3645897334245817e-07, + "loss": 0.0695, + "step": 3698 + }, + { + "epoch": 8.53287197231834, + "grad_norm": 0.0, + "learning_rate": 1.350940607647866e-07, + "loss": 0.0427, + "step": 3699 + }, + { + "epoch": 8.53517877739331, + "grad_norm": 0.0, + "learning_rate": 1.337359622960044e-07, + "loss": 0.0732, + "step": 3700 + }, + { + "epoch": 8.537485582468282, + "grad_norm": 0.0, + "learning_rate": 1.323846788742078e-07, + "loss": 0.0537, + "step": 3701 + }, + { + "epoch": 8.539792387543253, + "grad_norm": 0.0, + "learning_rate": 1.3104021143278911e-07, + "loss": 0.0578, + "step": 3702 + }, + { + "epoch": 8.542099192618224, + "grad_norm": 0.0, + "learning_rate": 1.2970256090042432e-07, + "loss": 0.0512, + "step": 3703 + }, + { + "epoch": 8.544405997693195, + "grad_norm": 0.0, + "learning_rate": 1.2837172820108769e-07, + "loss": 0.0335, + "step": 3704 + }, + { + "epoch": 8.546712802768166, + "grad_norm": 0.0, + "learning_rate": 1.2704771425404382e-07, + "loss": 0.0445, + "step": 3705 + }, + { + "epoch": 8.549019607843137, + "grad_norm": 0.0, + "learning_rate": 1.2573051997384122e-07, + "loss": 0.0621, + "step": 3706 + }, + { + "epoch": 8.551326412918108, + "grad_norm": 0.0, + "learning_rate": 1.2442014627032318e-07, + "loss": 0.0537, + "step": 3707 + }, + { + "epoch": 8.55363321799308, + "grad_norm": 0.0, + "learning_rate": 1.231165940486234e-07, + "loss": 0.0701, + "step": 3708 + }, + { + "epoch": 8.55594002306805, + "grad_norm": 0.0, + "learning_rate": 1.2181986420915615e-07, + "loss": 0.0784, + "step": 3709 + }, + { + "epoch": 8.558246828143021, + "grad_norm": 0.0, + "learning_rate": 1.2052995764763042e-07, + "loss": 0.0334, + "step": 3710 + }, + { + "epoch": 8.560553633217992, + "grad_norm": 0.0, + "learning_rate": 1.192468752550402e-07, + "loss": 0.0545, + "step": 3711 + }, + { + "epoch": 8.562860438292965, + "grad_norm": 0.0, + "learning_rate": 1.1797061791766207e-07, + "loss": 0.03, + "step": 3712 + }, + { + "epoch": 8.565167243367936, + "grad_norm": 0.0, + "learning_rate": 1.1670118651706197e-07, + "loss": 0.04, + "step": 3713 + }, + { + "epoch": 8.567474048442907, + "grad_norm": 0.0, + "learning_rate": 1.1543858193009183e-07, + "loss": 0.0356, + "step": 3714 + }, + { + "epoch": 8.569780853517878, + "grad_norm": 0.0, + "learning_rate": 1.1418280502888401e-07, + "loss": 0.0513, + "step": 3715 + }, + { + "epoch": 8.57208765859285, + "grad_norm": 0.0, + "learning_rate": 1.1293385668085688e-07, + "loss": 0.0596, + "step": 3716 + }, + { + "epoch": 8.57439446366782, + "grad_norm": 0.0, + "learning_rate": 1.1169173774871478e-07, + "loss": 0.059, + "step": 3717 + }, + { + "epoch": 8.576701268742791, + "grad_norm": 0.0, + "learning_rate": 1.1045644909043917e-07, + "loss": 0.035, + "step": 3718 + }, + { + "epoch": 8.579008073817763, + "grad_norm": 0.0, + "learning_rate": 1.0922799155929753e-07, + "loss": 0.0669, + "step": 3719 + }, + { + "epoch": 8.581314878892734, + "grad_norm": 0.0, + "learning_rate": 1.0800636600383662e-07, + "loss": 0.014, + "step": 3720 + }, + { + "epoch": 8.583621683967705, + "grad_norm": 0.0, + "learning_rate": 1.0679157326788592e-07, + "loss": 0.0525, + "step": 3721 + }, + { + "epoch": 8.585928489042676, + "grad_norm": 0.0, + "learning_rate": 1.055836141905553e-07, + "loss": 0.0443, + "step": 3722 + }, + { + "epoch": 8.588235294117647, + "grad_norm": 0.0, + "learning_rate": 1.0438248960623065e-07, + "loss": 0.0509, + "step": 3723 + }, + { + "epoch": 8.590542099192618, + "grad_norm": 0.0, + "learning_rate": 1.0318820034458165e-07, + "loss": 0.045, + "step": 3724 + }, + { + "epoch": 8.592848904267589, + "grad_norm": 0.0, + "learning_rate": 1.0200074723055397e-07, + "loss": 0.0686, + "step": 3725 + }, + { + "epoch": 8.59515570934256, + "grad_norm": 0.0, + "learning_rate": 1.0082013108437038e-07, + "loss": 0.0316, + "step": 3726 + }, + { + "epoch": 8.597462514417531, + "grad_norm": 0.0, + "learning_rate": 9.964635272153633e-08, + "loss": 0.0675, + "step": 3727 + }, + { + "epoch": 8.599769319492502, + "grad_norm": 0.0, + "learning_rate": 9.84794129528266e-08, + "loss": 0.069, + "step": 3728 + }, + { + "epoch": 8.602076124567475, + "grad_norm": 0.0, + "learning_rate": 9.731931258429638e-08, + "loss": 0.0713, + "step": 3729 + }, + { + "epoch": 8.604382929642446, + "grad_norm": 0.0, + "learning_rate": 9.616605241727917e-08, + "loss": 0.0709, + "step": 3730 + }, + { + "epoch": 8.606689734717417, + "grad_norm": 0.0, + "learning_rate": 9.50196332483766e-08, + "loss": 0.0451, + "step": 3731 + }, + { + "epoch": 8.608996539792388, + "grad_norm": 0.0, + "learning_rate": 9.388005586947191e-08, + "loss": 0.0461, + "step": 3732 + }, + { + "epoch": 8.611303344867359, + "grad_norm": 0.0, + "learning_rate": 9.274732106771989e-08, + "loss": 0.0423, + "step": 3733 + }, + { + "epoch": 8.61361014994233, + "grad_norm": 0.0, + "learning_rate": 9.162142962554576e-08, + "loss": 0.0422, + "step": 3734 + }, + { + "epoch": 8.615916955017301, + "grad_norm": 0.0, + "learning_rate": 9.0502382320653e-08, + "loss": 0.0529, + "step": 3735 + }, + { + "epoch": 8.618223760092272, + "grad_norm": 0.0, + "learning_rate": 8.939017992601329e-08, + "loss": 0.0478, + "step": 3736 + }, + { + "epoch": 8.620530565167243, + "grad_norm": 0.0, + "learning_rate": 8.82848232098732e-08, + "loss": 0.0453, + "step": 3737 + }, + { + "epoch": 8.622837370242214, + "grad_norm": 0.0, + "learning_rate": 8.718631293574753e-08, + "loss": 0.0475, + "step": 3738 + }, + { + "epoch": 8.625144175317185, + "grad_norm": 0.0, + "learning_rate": 8.609464986242711e-08, + "loss": 0.0317, + "step": 3739 + }, + { + "epoch": 8.627450980392156, + "grad_norm": 0.0, + "learning_rate": 8.500983474396762e-08, + "loss": 0.0308, + "step": 3740 + }, + { + "epoch": 8.629757785467127, + "grad_norm": 0.0, + "learning_rate": 8.393186832969746e-08, + "loss": 0.0585, + "step": 3741 + }, + { + "epoch": 8.632064590542099, + "grad_norm": 0.0, + "learning_rate": 8.286075136421435e-08, + "loss": 0.0764, + "step": 3742 + }, + { + "epoch": 8.63437139561707, + "grad_norm": 0.0, + "learning_rate": 8.179648458738309e-08, + "loss": 0.054, + "step": 3743 + }, + { + "epoch": 8.636678200692042, + "grad_norm": 0.0, + "learning_rate": 8.07390687343379e-08, + "loss": 0.0274, + "step": 3744 + }, + { + "epoch": 8.638985005767013, + "grad_norm": 0.0, + "learning_rate": 7.968850453548227e-08, + "loss": 0.0487, + "step": 3745 + }, + { + "epoch": 8.641291810841984, + "grad_norm": 0.0, + "learning_rate": 7.864479271648462e-08, + "loss": 0.049, + "step": 3746 + }, + { + "epoch": 8.643598615916956, + "grad_norm": 0.0, + "learning_rate": 7.760793399827937e-08, + "loss": 0.0169, + "step": 3747 + }, + { + "epoch": 8.645905420991927, + "grad_norm": 0.0, + "learning_rate": 7.65779290970714e-08, + "loss": 0.0374, + "step": 3748 + }, + { + "epoch": 8.648212226066898, + "grad_norm": 0.0, + "learning_rate": 7.555477872432715e-08, + "loss": 0.0454, + "step": 3749 + }, + { + "epoch": 8.650519031141869, + "grad_norm": 0.0, + "learning_rate": 7.453848358678018e-08, + "loss": 0.0651, + "step": 3750 + }, + { + "epoch": 8.65282583621684, + "grad_norm": 0.0, + "learning_rate": 7.352904438642893e-08, + "loss": 0.0817, + "step": 3751 + }, + { + "epoch": 8.65513264129181, + "grad_norm": 0.0, + "learning_rate": 7.25264618205357e-08, + "loss": 0.0477, + "step": 3752 + }, + { + "epoch": 8.657439446366782, + "grad_norm": 0.0, + "learning_rate": 7.153073658162646e-08, + "loss": 0.0903, + "step": 3753 + }, + { + "epoch": 8.659746251441753, + "grad_norm": 0.0, + "learning_rate": 7.054186935749219e-08, + "loss": 0.0465, + "step": 3754 + }, + { + "epoch": 8.662053056516724, + "grad_norm": 0.0, + "learning_rate": 6.955986083118426e-08, + "loss": 0.037, + "step": 3755 + }, + { + "epoch": 8.664359861591695, + "grad_norm": 0.0, + "learning_rate": 6.858471168101788e-08, + "loss": 0.0456, + "step": 3756 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.0, + "learning_rate": 6.761642258056977e-08, + "loss": 0.0485, + "step": 3757 + }, + { + "epoch": 8.668973471741637, + "grad_norm": 0.0, + "learning_rate": 6.665499419867937e-08, + "loss": 0.0538, + "step": 3758 + }, + { + "epoch": 8.671280276816608, + "grad_norm": 0.0, + "learning_rate": 6.570042719944436e-08, + "loss": 0.055, + "step": 3759 + }, + { + "epoch": 8.67358708189158, + "grad_norm": 0.0, + "learning_rate": 6.475272224222507e-08, + "loss": 0.0294, + "step": 3760 + }, + { + "epoch": 8.675893886966552, + "grad_norm": 0.0, + "learning_rate": 6.381187998164229e-08, + "loss": 0.0259, + "step": 3761 + }, + { + "epoch": 8.678200692041523, + "grad_norm": 0.0, + "learning_rate": 6.287790106757396e-08, + "loss": 0.0298, + "step": 3762 + }, + { + "epoch": 8.680507497116494, + "grad_norm": 0.0, + "learning_rate": 6.19507861451607e-08, + "loss": 0.0427, + "step": 3763 + }, + { + "epoch": 8.682814302191465, + "grad_norm": 0.0, + "learning_rate": 6.103053585480023e-08, + "loss": 0.0535, + "step": 3764 + }, + { + "epoch": 8.685121107266436, + "grad_norm": 0.0, + "learning_rate": 6.011715083214742e-08, + "loss": 0.0358, + "step": 3765 + }, + { + "epoch": 8.687427912341407, + "grad_norm": 0.0, + "learning_rate": 5.921063170811647e-08, + "loss": 0.0274, + "step": 3766 + }, + { + "epoch": 8.689734717416378, + "grad_norm": 0.0, + "learning_rate": 5.831097910887873e-08, + "loss": 0.0286, + "step": 3767 + }, + { + "epoch": 8.69204152249135, + "grad_norm": 0.0, + "learning_rate": 5.7418193655861545e-08, + "loss": 0.0371, + "step": 3768 + }, + { + "epoch": 8.69434832756632, + "grad_norm": 0.0, + "learning_rate": 5.6532275965751614e-08, + "loss": 0.0626, + "step": 3769 + }, + { + "epoch": 8.696655132641292, + "grad_norm": 0.0, + "learning_rate": 5.5653226650487225e-08, + "loss": 0.0761, + "step": 3770 + }, + { + "epoch": 8.698961937716263, + "grad_norm": 0.0, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.0517, + "step": 3771 + }, + { + "epoch": 8.701268742791234, + "grad_norm": 0.0, + "learning_rate": 5.391573556854157e-08, + "loss": 0.0544, + "step": 3772 + }, + { + "epoch": 8.703575547866205, + "grad_norm": 0.0, + "learning_rate": 5.305729500201917e-08, + "loss": 0.0455, + "step": 3773 + }, + { + "epoch": 8.705882352941176, + "grad_norm": 0.0, + "learning_rate": 5.220572521066003e-08, + "loss": 0.0851, + "step": 3774 + }, + { + "epoch": 8.708189158016147, + "grad_norm": 0.0, + "learning_rate": 5.136102678268029e-08, + "loss": 0.0421, + "step": 3775 + }, + { + "epoch": 8.71049596309112, + "grad_norm": 0.0, + "learning_rate": 5.052320030154767e-08, + "loss": 0.0531, + "step": 3776 + }, + { + "epoch": 8.71280276816609, + "grad_norm": 0.0, + "learning_rate": 4.9692246345985905e-08, + "loss": 0.0491, + "step": 3777 + }, + { + "epoch": 8.715109573241062, + "grad_norm": 0.0, + "learning_rate": 4.88681654899692e-08, + "loss": 0.0546, + "step": 3778 + }, + { + "epoch": 8.717416378316033, + "grad_norm": 0.0, + "learning_rate": 4.8050958302726655e-08, + "loss": 0.0575, + "step": 3779 + }, + { + "epoch": 8.719723183391004, + "grad_norm": 0.0, + "learning_rate": 4.7240625348735636e-08, + "loss": 0.042, + "step": 3780 + }, + { + "epoch": 8.722029988465975, + "grad_norm": 0.0, + "learning_rate": 4.643716718772839e-08, + "loss": 0.0495, + "step": 3781 + }, + { + "epoch": 8.724336793540946, + "grad_norm": 0.0, + "learning_rate": 4.564058437468877e-08, + "loss": 0.0416, + "step": 3782 + }, + { + "epoch": 8.726643598615917, + "grad_norm": 0.0, + "learning_rate": 4.485087745984884e-08, + "loss": 0.0392, + "step": 3783 + }, + { + "epoch": 8.728950403690888, + "grad_norm": 0.0, + "learning_rate": 4.406804698869338e-08, + "loss": 0.0665, + "step": 3784 + }, + { + "epoch": 8.731257208765859, + "grad_norm": 0.0, + "learning_rate": 4.329209350195651e-08, + "loss": 0.0424, + "step": 3785 + }, + { + "epoch": 8.73356401384083, + "grad_norm": 0.0, + "learning_rate": 4.252301753562171e-08, + "loss": 0.0569, + "step": 3786 + }, + { + "epoch": 8.735870818915801, + "grad_norm": 0.0, + "learning_rate": 4.176081962092182e-08, + "loss": 0.0381, + "step": 3787 + }, + { + "epoch": 8.738177623990772, + "grad_norm": 0.0, + "learning_rate": 4.100550028434125e-08, + "loss": 0.0428, + "step": 3788 + }, + { + "epoch": 8.740484429065743, + "grad_norm": 0.0, + "learning_rate": 4.025706004760932e-08, + "loss": 0.0629, + "step": 3789 + }, + { + "epoch": 8.742791234140714, + "grad_norm": 0.0, + "learning_rate": 3.951549942770694e-08, + "loss": 0.0428, + "step": 3790 + }, + { + "epoch": 8.745098039215687, + "grad_norm": 0.0, + "learning_rate": 3.878081893685992e-08, + "loss": 0.0391, + "step": 3791 + }, + { + "epoch": 8.747404844290658, + "grad_norm": 0.0, + "learning_rate": 3.805301908254455e-08, + "loss": 0.0449, + "step": 3792 + }, + { + "epoch": 8.74971164936563, + "grad_norm": 0.0, + "learning_rate": 3.7332100367482027e-08, + "loss": 0.0383, + "step": 3793 + }, + { + "epoch": 8.7520184544406, + "grad_norm": 0.0, + "learning_rate": 3.6618063289642904e-08, + "loss": 0.0366, + "step": 3794 + }, + { + "epoch": 8.754325259515571, + "grad_norm": 0.0, + "learning_rate": 3.591090834224153e-08, + "loss": 0.04, + "step": 3795 + }, + { + "epoch": 8.756632064590542, + "grad_norm": 0.0, + "learning_rate": 3.521063601373942e-08, + "loss": 0.0465, + "step": 3796 + }, + { + "epoch": 8.758938869665513, + "grad_norm": 0.0, + "learning_rate": 3.451724678784518e-08, + "loss": 0.0501, + "step": 3797 + }, + { + "epoch": 8.761245674740485, + "grad_norm": 0.0, + "learning_rate": 3.383074114351237e-08, + "loss": 0.0495, + "step": 3798 + }, + { + "epoch": 8.763552479815456, + "grad_norm": 0.0, + "learning_rate": 3.315111955493944e-08, + "loss": 0.0644, + "step": 3799 + }, + { + "epoch": 8.765859284890427, + "grad_norm": 0.0, + "learning_rate": 3.247838249156976e-08, + "loss": 0.0526, + "step": 3800 + }, + { + "epoch": 8.768166089965398, + "grad_norm": 0.0, + "learning_rate": 3.181253041809052e-08, + "loss": 0.0469, + "step": 3801 + }, + { + "epoch": 8.770472895040369, + "grad_norm": 0.0, + "learning_rate": 3.115356379443601e-08, + "loss": 0.054, + "step": 3802 + }, + { + "epoch": 8.77277970011534, + "grad_norm": 0.0, + "learning_rate": 3.0501483075779936e-08, + "loss": 0.0529, + "step": 3803 + }, + { + "epoch": 8.77508650519031, + "grad_norm": 0.0, + "learning_rate": 2.9856288712544204e-08, + "loss": 0.0523, + "step": 3804 + }, + { + "epoch": 8.777393310265282, + "grad_norm": 0.0, + "learning_rate": 2.9217981150390095e-08, + "loss": 0.0506, + "step": 3805 + }, + { + "epoch": 8.779700115340253, + "grad_norm": 0.0, + "learning_rate": 2.858656083022604e-08, + "loss": 0.0314, + "step": 3806 + }, + { + "epoch": 8.782006920415224, + "grad_norm": 0.0, + "learning_rate": 2.796202818819871e-08, + "loss": 0.0507, + "step": 3807 + }, + { + "epoch": 8.784313725490197, + "grad_norm": 0.0, + "learning_rate": 2.7344383655699692e-08, + "loss": 0.0575, + "step": 3808 + }, + { + "epoch": 8.786620530565168, + "grad_norm": 0.0, + "learning_rate": 2.6733627659363272e-08, + "loss": 0.0515, + "step": 3809 + }, + { + "epoch": 8.788927335640139, + "grad_norm": 0.0, + "learning_rate": 2.6129760621063095e-08, + "loss": 0.0359, + "step": 3810 + }, + { + "epoch": 8.79123414071511, + "grad_norm": 0.0, + "learning_rate": 2.5532782957917724e-08, + "loss": 0.07, + "step": 3811 + }, + { + "epoch": 8.793540945790081, + "grad_norm": 0.0, + "learning_rate": 2.4942695082281752e-08, + "loss": 0.0737, + "step": 3812 + }, + { + "epoch": 8.795847750865052, + "grad_norm": 0.0, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.0466, + "step": 3813 + }, + { + "epoch": 8.798154555940023, + "grad_norm": 0.0, + "learning_rate": 2.378319031918208e-08, + "loss": 0.0445, + "step": 3814 + }, + { + "epoch": 8.800461361014994, + "grad_norm": 0.0, + "learning_rate": 2.3213774232635487e-08, + "loss": 0.0343, + "step": 3815 + }, + { + "epoch": 8.802768166089965, + "grad_norm": 0.0, + "learning_rate": 2.265124953543918e-08, + "loss": 0.0638, + "step": 3816 + }, + { + "epoch": 8.805074971164936, + "grad_norm": 0.0, + "learning_rate": 2.2095616616150117e-08, + "loss": 0.0574, + "step": 3817 + }, + { + "epoch": 8.807381776239907, + "grad_norm": 0.0, + "learning_rate": 2.1546875858570182e-08, + "loss": 0.0248, + "step": 3818 + }, + { + "epoch": 8.809688581314878, + "grad_norm": 0.0, + "learning_rate": 2.1005027641736176e-08, + "loss": 0.0355, + "step": 3819 + }, + { + "epoch": 8.81199538638985, + "grad_norm": 0.0, + "learning_rate": 2.0470072339926482e-08, + "loss": 0.0497, + "step": 3820 + }, + { + "epoch": 8.81430219146482, + "grad_norm": 0.0, + "learning_rate": 1.9942010322655527e-08, + "loss": 0.0469, + "step": 3821 + }, + { + "epoch": 8.816608996539792, + "grad_norm": 0.0, + "learning_rate": 1.9420841954681525e-08, + "loss": 0.0696, + "step": 3822 + }, + { + "epoch": 8.818915801614764, + "grad_norm": 0.0, + "learning_rate": 1.8906567595994295e-08, + "loss": 0.0671, + "step": 3823 + }, + { + "epoch": 8.821222606689735, + "grad_norm": 0.0, + "learning_rate": 1.8399187601827462e-08, + "loss": 0.02, + "step": 3824 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 0.0, + "learning_rate": 1.7898702322648453e-08, + "loss": 0.0804, + "step": 3825 + }, + { + "epoch": 8.825836216839678, + "grad_norm": 0.0, + "learning_rate": 1.7405112104164067e-08, + "loss": 0.03, + "step": 3826 + }, + { + "epoch": 8.828143021914649, + "grad_norm": 0.0, + "learning_rate": 1.6918417287318245e-08, + "loss": 0.0701, + "step": 3827 + }, + { + "epoch": 8.83044982698962, + "grad_norm": 0.0, + "learning_rate": 1.6438618208290957e-08, + "loss": 0.0492, + "step": 3828 + }, + { + "epoch": 8.83275663206459, + "grad_norm": 0.0, + "learning_rate": 1.596571519850043e-08, + "loss": 0.0481, + "step": 3829 + }, + { + "epoch": 8.835063437139562, + "grad_norm": 0.0, + "learning_rate": 1.5499708584600924e-08, + "loss": 0.0575, + "step": 3830 + }, + { + "epoch": 8.837370242214533, + "grad_norm": 0.0, + "learning_rate": 1.5040598688482732e-08, + "loss": 0.0702, + "step": 3831 + }, + { + "epoch": 8.839677047289504, + "grad_norm": 0.0, + "learning_rate": 1.4588385827272178e-08, + "loss": 0.0487, + "step": 3832 + }, + { + "epoch": 8.841983852364475, + "grad_norm": 0.0, + "learning_rate": 1.414307031333273e-08, + "loss": 0.0686, + "step": 3833 + }, + { + "epoch": 8.844290657439446, + "grad_norm": 0.0, + "learning_rate": 1.370465245426167e-08, + "loss": 0.022, + "step": 3834 + }, + { + "epoch": 8.846597462514417, + "grad_norm": 0.0, + "learning_rate": 1.3273132552893419e-08, + "loss": 0.0449, + "step": 3835 + }, + { + "epoch": 8.848904267589388, + "grad_norm": 0.0, + "learning_rate": 1.2848510907296219e-08, + "loss": 0.0684, + "step": 3836 + }, + { + "epoch": 8.85121107266436, + "grad_norm": 0.0, + "learning_rate": 1.2430787810776556e-08, + "loss": 0.0274, + "step": 3837 + }, + { + "epoch": 8.853517877739332, + "grad_norm": 0.0, + "learning_rate": 1.2019963551871405e-08, + "loss": 0.047, + "step": 3838 + }, + { + "epoch": 8.855824682814303, + "grad_norm": 0.0, + "learning_rate": 1.161603841435488e-08, + "loss": 0.0406, + "step": 3839 + }, + { + "epoch": 8.858131487889274, + "grad_norm": 0.0, + "learning_rate": 1.1219012677234908e-08, + "loss": 0.0443, + "step": 3840 + }, + { + "epoch": 8.860438292964245, + "grad_norm": 0.0, + "learning_rate": 1.0828886614754342e-08, + "loss": 0.0609, + "step": 3841 + }, + { + "epoch": 8.862745098039216, + "grad_norm": 0.0, + "learning_rate": 1.0445660496390952e-08, + "loss": 0.0612, + "step": 3842 + }, + { + "epoch": 8.865051903114187, + "grad_norm": 0.0, + "learning_rate": 1.0069334586854106e-08, + "loss": 0.0184, + "step": 3843 + }, + { + "epoch": 8.867358708189158, + "grad_norm": 0.0, + "learning_rate": 9.699909146086983e-09, + "loss": 0.0416, + "step": 3844 + }, + { + "epoch": 8.86966551326413, + "grad_norm": 0.0, + "learning_rate": 9.337384429269903e-09, + "loss": 0.0467, + "step": 3845 + }, + { + "epoch": 8.8719723183391, + "grad_norm": 0.0, + "learning_rate": 8.981760686811448e-09, + "loss": 0.0423, + "step": 3846 + }, + { + "epoch": 8.874279123414071, + "grad_norm": 0.0, + "learning_rate": 8.633038164358454e-09, + "loss": 0.0558, + "step": 3847 + }, + { + "epoch": 8.876585928489042, + "grad_norm": 0.0, + "learning_rate": 8.29121710278713e-09, + "loss": 0.0414, + "step": 3848 + }, + { + "epoch": 8.878892733564014, + "grad_norm": 0.0, + "learning_rate": 7.956297738207496e-09, + "loss": 0.0519, + "step": 3849 + }, + { + "epoch": 8.881199538638985, + "grad_norm": 0.0, + "learning_rate": 7.628280301963387e-09, + "loss": 0.0487, + "step": 3850 + }, + { + "epoch": 8.883506343713956, + "grad_norm": 0.0, + "learning_rate": 7.3071650206291145e-09, + "loss": 0.0668, + "step": 3851 + }, + { + "epoch": 8.885813148788927, + "grad_norm": 0.0, + "learning_rate": 6.992952116013918e-09, + "loss": 0.056, + "step": 3852 + }, + { + "epoch": 8.888119953863898, + "grad_norm": 0.0, + "learning_rate": 6.685641805158627e-09, + "loss": 0.0361, + "step": 3853 + }, + { + "epoch": 8.890426758938869, + "grad_norm": 0.0, + "learning_rate": 6.385234300332332e-09, + "loss": 0.029, + "step": 3854 + }, + { + "epoch": 8.892733564013842, + "grad_norm": 0.0, + "learning_rate": 6.091729809042379e-09, + "loss": 0.0551, + "step": 3855 + }, + { + "epoch": 8.895040369088813, + "grad_norm": 0.0, + "learning_rate": 5.805128534024373e-09, + "loss": 0.0743, + "step": 3856 + }, + { + "epoch": 8.897347174163784, + "grad_norm": 0.0, + "learning_rate": 5.525430673244403e-09, + "loss": 0.0317, + "step": 3857 + }, + { + "epoch": 8.899653979238755, + "grad_norm": 0.0, + "learning_rate": 5.252636419902368e-09, + "loss": 0.04, + "step": 3858 + }, + { + "epoch": 8.901960784313726, + "grad_norm": 0.0, + "learning_rate": 4.986745962428652e-09, + "loss": 0.0331, + "step": 3859 + }, + { + "epoch": 8.904267589388697, + "grad_norm": 0.0, + "learning_rate": 4.727759484486338e-09, + "loss": 0.0233, + "step": 3860 + }, + { + "epoch": 8.906574394463668, + "grad_norm": 0.0, + "learning_rate": 4.475677164966774e-09, + "loss": 0.0524, + "step": 3861 + }, + { + "epoch": 8.908881199538639, + "grad_norm": 0.0, + "learning_rate": 4.230499177994007e-09, + "loss": 0.0442, + "step": 3862 + }, + { + "epoch": 8.91118800461361, + "grad_norm": 0.0, + "learning_rate": 3.9922256929247895e-09, + "loss": 0.0691, + "step": 3863 + }, + { + "epoch": 8.913494809688581, + "grad_norm": 0.0, + "learning_rate": 3.760856874341912e-09, + "loss": 0.0736, + "step": 3864 + }, + { + "epoch": 8.915801614763552, + "grad_norm": 0.0, + "learning_rate": 3.536392882064199e-09, + "loss": 0.0648, + "step": 3865 + }, + { + "epoch": 8.918108419838523, + "grad_norm": 0.0, + "learning_rate": 3.3188338711365175e-09, + "loss": 0.0514, + "step": 3866 + }, + { + "epoch": 8.920415224913494, + "grad_norm": 0.0, + "learning_rate": 3.1081799918375454e-09, + "loss": 0.0382, + "step": 3867 + }, + { + "epoch": 8.922722029988465, + "grad_norm": 0.0, + "learning_rate": 2.9044313896731126e-09, + "loss": 0.0419, + "step": 3868 + }, + { + "epoch": 8.925028835063436, + "grad_norm": 0.0, + "learning_rate": 2.7075882053828605e-09, + "loss": 0.0269, + "step": 3869 + }, + { + "epoch": 8.92733564013841, + "grad_norm": 0.0, + "learning_rate": 2.5176505749346937e-09, + "loss": 0.0328, + "step": 3870 + }, + { + "epoch": 8.92964244521338, + "grad_norm": 0.0, + "learning_rate": 2.3346186295247763e-09, + "loss": 0.0458, + "step": 3871 + }, + { + "epoch": 8.931949250288351, + "grad_norm": 0.0, + "learning_rate": 2.1584924955819763e-09, + "loss": 0.04, + "step": 3872 + }, + { + "epoch": 8.934256055363322, + "grad_norm": 0.0, + "learning_rate": 1.9892722947645328e-09, + "loss": 0.0318, + "step": 3873 + }, + { + "epoch": 8.936562860438293, + "grad_norm": 0.0, + "learning_rate": 1.8269581439600559e-09, + "loss": 0.0616, + "step": 3874 + }, + { + "epoch": 8.938869665513264, + "grad_norm": 0.0, + "learning_rate": 1.6715501552855285e-09, + "loss": 0.0592, + "step": 3875 + }, + { + "epoch": 8.941176470588236, + "grad_norm": 0.0, + "learning_rate": 1.5230484360873043e-09, + "loss": 0.0343, + "step": 3876 + }, + { + "epoch": 8.943483275663207, + "grad_norm": 0.0, + "learning_rate": 1.3814530889433298e-09, + "loss": 0.0836, + "step": 3877 + }, + { + "epoch": 8.945790080738178, + "grad_norm": 0.0, + "learning_rate": 1.2467642116575919e-09, + "loss": 0.0385, + "step": 3878 + }, + { + "epoch": 8.948096885813149, + "grad_norm": 0.0, + "learning_rate": 1.1189818972656697e-09, + "loss": 0.0335, + "step": 3879 + }, + { + "epoch": 8.95040369088812, + "grad_norm": 0.0, + "learning_rate": 9.981062340336246e-10, + "loss": 0.0468, + "step": 3880 + }, + { + "epoch": 8.95271049596309, + "grad_norm": 0.0, + "learning_rate": 8.841373054546687e-10, + "loss": 0.0504, + "step": 3881 + }, + { + "epoch": 8.955017301038062, + "grad_norm": 0.0, + "learning_rate": 7.770751902513862e-10, + "loss": 0.0481, + "step": 3882 + }, + { + "epoch": 8.957324106113033, + "grad_norm": 0.0, + "learning_rate": 6.769199623779532e-10, + "loss": 0.049, + "step": 3883 + }, + { + "epoch": 8.959630911188004, + "grad_norm": 0.0, + "learning_rate": 5.836716910134766e-10, + "loss": 0.0435, + "step": 3884 + }, + { + "epoch": 8.961937716262975, + "grad_norm": 0.0, + "learning_rate": 4.973304405697654e-10, + "loss": 0.0492, + "step": 3885 + }, + { + "epoch": 8.964244521337946, + "grad_norm": 0.0, + "learning_rate": 4.178962706857803e-10, + "loss": 0.0373, + "step": 3886 + }, + { + "epoch": 8.966551326412919, + "grad_norm": 0.0, + "learning_rate": 3.4536923623096353e-10, + "loss": 0.0651, + "step": 3887 + }, + { + "epoch": 8.96885813148789, + "grad_norm": 0.0, + "learning_rate": 2.797493873019086e-10, + "loss": 0.0579, + "step": 3888 + }, + { + "epoch": 8.971164936562861, + "grad_norm": 0.0, + "learning_rate": 2.2103676922680117e-10, + "loss": 0.0408, + "step": 3889 + }, + { + "epoch": 8.973471741637832, + "grad_norm": 0.0, + "learning_rate": 1.6923142255764745e-10, + "loss": 0.0368, + "step": 3890 + }, + { + "epoch": 8.975778546712803, + "grad_norm": 0.0, + "learning_rate": 1.2433338308137645e-10, + "loss": 0.0217, + "step": 3891 + }, + { + "epoch": 8.978085351787774, + "grad_norm": 0.0, + "learning_rate": 8.634268181095806e-11, + "loss": 0.0379, + "step": 3892 + }, + { + "epoch": 8.980392156862745, + "grad_norm": 0.0, + "learning_rate": 5.525934498651353e-11, + "loss": 0.0243, + "step": 3893 + }, + { + "epoch": 8.982698961937716, + "grad_norm": 0.0, + "learning_rate": 3.108339407975613e-11, + "loss": 0.0125, + "step": 3894 + }, + { + "epoch": 8.985005767012687, + "grad_norm": 0.0, + "learning_rate": 1.381484578955039e-11, + "loss": 0.0306, + "step": 3895 + }, + { + "epoch": 8.987312572087658, + "grad_norm": 0.0, + "learning_rate": 3.4537120441324733e-12, + "loss": 0.0753, + "step": 3896 + }, + { + "epoch": 8.98961937716263, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 0.0315, + "step": 3897 + }, + { + "epoch": 8.98961937716263, + "step": 3897, + "total_flos": 5.517435394662072e+17, + "train_loss": 0.40523916085876577, + "train_runtime": 135721.8739, + "train_samples_per_second": 1.839, + "train_steps_per_second": 0.029 + } + ], + "logging_steps": 1.0, + "max_steps": 3897, + "num_input_tokens_seen": 0, + "num_train_epochs": 9, + "save_steps": 1000, + "total_flos": 5.517435394662072e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}