{ "best_metric": 0.2108319103717804, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.05700441784238278, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002850220892119139, "grad_norm": 1.226487159729004, "learning_rate": 7e-06, "loss": 0.5069, "step": 1 }, { "epoch": 0.0002850220892119139, "eval_loss": 0.809647262096405, "eval_runtime": 447.2686, "eval_samples_per_second": 13.211, "eval_steps_per_second": 3.305, "step": 1 }, { "epoch": 0.0005700441784238278, "grad_norm": 1.7584460973739624, "learning_rate": 1.4e-05, "loss": 0.5609, "step": 2 }, { "epoch": 0.0008550662676357417, "grad_norm": 1.7689909934997559, "learning_rate": 2.1e-05, "loss": 0.5384, "step": 3 }, { "epoch": 0.0011400883568476556, "grad_norm": 1.5021331310272217, "learning_rate": 2.8e-05, "loss": 0.5658, "step": 4 }, { "epoch": 0.0014251104460595695, "grad_norm": 0.9700350165367126, "learning_rate": 3.5e-05, "loss": 0.4866, "step": 5 }, { "epoch": 0.0017101325352714834, "grad_norm": 1.116253137588501, "learning_rate": 4.2e-05, "loss": 0.4975, "step": 6 }, { "epoch": 0.0019951546244833976, "grad_norm": 0.7912647128105164, "learning_rate": 4.899999999999999e-05, "loss": 0.4075, "step": 7 }, { "epoch": 0.0022801767136953113, "grad_norm": 0.615479052066803, "learning_rate": 5.6e-05, "loss": 0.3164, "step": 8 }, { "epoch": 0.0025651988029072254, "grad_norm": 0.6070367693901062, "learning_rate": 6.3e-05, "loss": 0.3441, "step": 9 }, { "epoch": 0.002850220892119139, "grad_norm": 0.6294856667518616, "learning_rate": 7e-05, "loss": 0.3463, "step": 10 }, { "epoch": 0.003135242981331053, "grad_norm": 0.6025909185409546, "learning_rate": 6.999521567473641e-05, "loss": 0.3461, "step": 11 }, { "epoch": 0.003420265070542967, "grad_norm": 0.543045163154602, "learning_rate": 6.998086400693241e-05, "loss": 0.2557, "step": 12 }, { "epoch": 0.003705287159754881, "grad_norm": 0.5994967222213745, "learning_rate": 6.995694892019065e-05, "loss": 0.2976, "step": 13 }, { "epoch": 0.003990309248966795, "grad_norm": 0.5487139821052551, "learning_rate": 6.99234769526571e-05, "loss": 0.2835, "step": 14 }, { "epoch": 0.004275331338178709, "grad_norm": 0.46559107303619385, "learning_rate": 6.988045725523343e-05, "loss": 0.2154, "step": 15 }, { "epoch": 0.0045603534273906225, "grad_norm": 0.510286808013916, "learning_rate": 6.982790158907539e-05, "loss": 0.2841, "step": 16 }, { "epoch": 0.004845375516602537, "grad_norm": 0.4607190489768982, "learning_rate": 6.976582432237733e-05, "loss": 0.2381, "step": 17 }, { "epoch": 0.005130397605814451, "grad_norm": 0.4366554617881775, "learning_rate": 6.969424242644413e-05, "loss": 0.1936, "step": 18 }, { "epoch": 0.005415419695026365, "grad_norm": 0.460168719291687, "learning_rate": 6.961317547105138e-05, "loss": 0.2087, "step": 19 }, { "epoch": 0.005700441784238278, "grad_norm": 0.4425370991230011, "learning_rate": 6.952264561909527e-05, "loss": 0.2109, "step": 20 }, { "epoch": 0.005985463873450192, "grad_norm": 0.4460863173007965, "learning_rate": 6.942267762053337e-05, "loss": 0.2675, "step": 21 }, { "epoch": 0.006270485962662106, "grad_norm": 0.3876311480998993, "learning_rate": 6.931329880561832e-05, "loss": 0.1999, "step": 22 }, { "epoch": 0.0065555080518740205, "grad_norm": 0.49625661969184875, "learning_rate": 6.919453907742597e-05, "loss": 0.2386, "step": 23 }, { "epoch": 0.006840530141085934, "grad_norm": 0.4720734655857086, "learning_rate": 6.90664309036802e-05, "loss": 0.2111, "step": 24 }, { "epoch": 0.007125552230297848, "grad_norm": 0.4751650094985962, "learning_rate": 6.892900930787656e-05, "loss": 0.2675, "step": 25 }, { "epoch": 0.007410574319509762, "grad_norm": 0.41327837109565735, "learning_rate": 6.87823118597072e-05, "loss": 0.2188, "step": 26 }, { "epoch": 0.007695596408721676, "grad_norm": 0.49241769313812256, "learning_rate": 6.862637866478969e-05, "loss": 0.2325, "step": 27 }, { "epoch": 0.00798061849793359, "grad_norm": 0.4301894009113312, "learning_rate": 6.846125235370252e-05, "loss": 0.2319, "step": 28 }, { "epoch": 0.008265640587145504, "grad_norm": 0.49408209323883057, "learning_rate": 6.828697807033038e-05, "loss": 0.2112, "step": 29 }, { "epoch": 0.008550662676357419, "grad_norm": 0.46577221155166626, "learning_rate": 6.81036034595222e-05, "loss": 0.2289, "step": 30 }, { "epoch": 0.008835684765569332, "grad_norm": 0.5830202102661133, "learning_rate": 6.791117865406564e-05, "loss": 0.2229, "step": 31 }, { "epoch": 0.009120706854781245, "grad_norm": 0.4341415464878082, "learning_rate": 6.770975626098112e-05, "loss": 0.2392, "step": 32 }, { "epoch": 0.00940572894399316, "grad_norm": 0.4623303711414337, "learning_rate": 6.749939134713974e-05, "loss": 0.2101, "step": 33 }, { "epoch": 0.009690751033205073, "grad_norm": 0.3854859173297882, "learning_rate": 6.728014142420846e-05, "loss": 0.1529, "step": 34 }, { "epoch": 0.009975773122416987, "grad_norm": 0.5663196444511414, "learning_rate": 6.7052066432927e-05, "loss": 0.1961, "step": 35 }, { "epoch": 0.010260795211628902, "grad_norm": 0.4905116856098175, "learning_rate": 6.681522872672069e-05, "loss": 0.1864, "step": 36 }, { "epoch": 0.010545817300840815, "grad_norm": 0.4281812906265259, "learning_rate": 6.656969305465356e-05, "loss": 0.1443, "step": 37 }, { "epoch": 0.01083083939005273, "grad_norm": 0.4878508746623993, "learning_rate": 6.631552654372672e-05, "loss": 0.2296, "step": 38 }, { "epoch": 0.011115861479264643, "grad_norm": 0.5167247653007507, "learning_rate": 6.60527986805264e-05, "loss": 0.1857, "step": 39 }, { "epoch": 0.011400883568476556, "grad_norm": 0.4258979856967926, "learning_rate": 6.578158129222711e-05, "loss": 0.1598, "step": 40 }, { "epoch": 0.011685905657688471, "grad_norm": 0.4491550922393799, "learning_rate": 6.550194852695469e-05, "loss": 0.172, "step": 41 }, { "epoch": 0.011970927746900385, "grad_norm": 0.43145951628685, "learning_rate": 6.521397683351509e-05, "loss": 0.1712, "step": 42 }, { "epoch": 0.0122559498361123, "grad_norm": 0.4238392114639282, "learning_rate": 6.491774494049386e-05, "loss": 0.1997, "step": 43 }, { "epoch": 0.012540971925324213, "grad_norm": 0.481771856546402, "learning_rate": 6.461333383473272e-05, "loss": 0.1749, "step": 44 }, { "epoch": 0.012825994014536126, "grad_norm": 0.48285970091819763, "learning_rate": 6.430082673918849e-05, "loss": 0.1769, "step": 45 }, { "epoch": 0.013111016103748041, "grad_norm": 0.40149685740470886, "learning_rate": 6.398030909018069e-05, "loss": 0.1434, "step": 46 }, { "epoch": 0.013396038192959954, "grad_norm": 0.5220509171485901, "learning_rate": 6.365186851403423e-05, "loss": 0.1639, "step": 47 }, { "epoch": 0.013681060282171868, "grad_norm": 0.4152841567993164, "learning_rate": 6.331559480312315e-05, "loss": 0.1424, "step": 48 }, { "epoch": 0.013966082371383783, "grad_norm": 0.5210403203964233, "learning_rate": 6.297157989132236e-05, "loss": 0.1681, "step": 49 }, { "epoch": 0.014251104460595696, "grad_norm": 0.48842835426330566, "learning_rate": 6.261991782887377e-05, "loss": 0.1512, "step": 50 }, { "epoch": 0.014251104460595696, "eval_loss": 0.23581571877002716, "eval_runtime": 450.3716, "eval_samples_per_second": 13.12, "eval_steps_per_second": 3.282, "step": 50 }, { "epoch": 0.01453612654980761, "grad_norm": 0.36896973848342896, "learning_rate": 6.226070475667393e-05, "loss": 0.4531, "step": 51 }, { "epoch": 0.014821148639019524, "grad_norm": 0.45151451230049133, "learning_rate": 6.189403887999006e-05, "loss": 0.3726, "step": 52 }, { "epoch": 0.015106170728231437, "grad_norm": 0.4101298153400421, "learning_rate": 6.152002044161171e-05, "loss": 0.3514, "step": 53 }, { "epoch": 0.015391192817443352, "grad_norm": 0.40480175614356995, "learning_rate": 6.113875169444539e-05, "loss": 0.3095, "step": 54 }, { "epoch": 0.015676214906655266, "grad_norm": 0.4009363651275635, "learning_rate": 6.0750336873559605e-05, "loss": 0.3189, "step": 55 }, { "epoch": 0.01596123699586718, "grad_norm": 0.41769325733184814, "learning_rate": 6.035488216768811e-05, "loss": 0.2666, "step": 56 }, { "epoch": 0.016246259085079092, "grad_norm": 0.35941702127456665, "learning_rate": 5.9952495690198894e-05, "loss": 0.2998, "step": 57 }, { "epoch": 0.016531281174291007, "grad_norm": 0.4221387803554535, "learning_rate": 5.954328744953709e-05, "loss": 0.2548, "step": 58 }, { "epoch": 0.016816303263502922, "grad_norm": 0.4352424144744873, "learning_rate": 5.91273693191498e-05, "loss": 0.2917, "step": 59 }, { "epoch": 0.017101325352714837, "grad_norm": 0.3971022963523865, "learning_rate": 5.870485500690094e-05, "loss": 0.2802, "step": 60 }, { "epoch": 0.01738634744192675, "grad_norm": 0.3995015621185303, "learning_rate": 5.827586002398468e-05, "loss": 0.2378, "step": 61 }, { "epoch": 0.017671369531138664, "grad_norm": 0.4417852759361267, "learning_rate": 5.784050165334589e-05, "loss": 0.2057, "step": 62 }, { "epoch": 0.01795639162035058, "grad_norm": 0.4803115427494049, "learning_rate": 5.739889891761608e-05, "loss": 0.273, "step": 63 }, { "epoch": 0.01824141370956249, "grad_norm": 0.44846677780151367, "learning_rate": 5.6951172546573794e-05, "loss": 0.2206, "step": 64 }, { "epoch": 0.018526435798774405, "grad_norm": 0.6063640713691711, "learning_rate": 5.6497444944138376e-05, "loss": 0.2994, "step": 65 }, { "epoch": 0.01881145788798632, "grad_norm": 0.42268800735473633, "learning_rate": 5.603784015490587e-05, "loss": 0.2388, "step": 66 }, { "epoch": 0.01909647997719823, "grad_norm": 0.42498213052749634, "learning_rate": 5.557248383023655e-05, "loss": 0.2624, "step": 67 }, { "epoch": 0.019381502066410147, "grad_norm": 0.3898569345474243, "learning_rate": 5.510150319390302e-05, "loss": 0.2306, "step": 68 }, { "epoch": 0.01966652415562206, "grad_norm": 0.39460426568984985, "learning_rate": 5.4625027007308546e-05, "loss": 0.2648, "step": 69 }, { "epoch": 0.019951546244833973, "grad_norm": 0.39320170879364014, "learning_rate": 5.414318553428494e-05, "loss": 0.2096, "step": 70 }, { "epoch": 0.020236568334045888, "grad_norm": 0.393012672662735, "learning_rate": 5.3656110505479776e-05, "loss": 0.2313, "step": 71 }, { "epoch": 0.020521590423257803, "grad_norm": 0.3816857933998108, "learning_rate": 5.316393508234253e-05, "loss": 0.2602, "step": 72 }, { "epoch": 0.020806612512469718, "grad_norm": 0.38629385828971863, "learning_rate": 5.266679382071953e-05, "loss": 0.2195, "step": 73 }, { "epoch": 0.02109163460168163, "grad_norm": 0.34672990441322327, "learning_rate": 5.216482263406778e-05, "loss": 0.2134, "step": 74 }, { "epoch": 0.021376656690893545, "grad_norm": 0.35570091009140015, "learning_rate": 5.1658158756297576e-05, "loss": 0.2078, "step": 75 }, { "epoch": 0.02166167878010546, "grad_norm": 0.36889493465423584, "learning_rate": 5.114694070425407e-05, "loss": 0.1841, "step": 76 }, { "epoch": 0.02194670086931737, "grad_norm": 0.34534069895744324, "learning_rate": 5.063130823984823e-05, "loss": 0.1867, "step": 77 }, { "epoch": 0.022231722958529286, "grad_norm": 0.5198400020599365, "learning_rate": 5.011140233184724e-05, "loss": 0.1982, "step": 78 }, { "epoch": 0.0225167450477412, "grad_norm": 0.3951840400695801, "learning_rate": 4.958736511733516e-05, "loss": 0.1904, "step": 79 }, { "epoch": 0.022801767136953113, "grad_norm": 0.4039648175239563, "learning_rate": 4.905933986285393e-05, "loss": 0.2068, "step": 80 }, { "epoch": 0.023086789226165028, "grad_norm": 0.3911183476448059, "learning_rate": 4.8527470925235824e-05, "loss": 0.2222, "step": 81 }, { "epoch": 0.023371811315376943, "grad_norm": 0.4442287087440491, "learning_rate": 4.799190371213772e-05, "loss": 0.1945, "step": 82 }, { "epoch": 0.023656833404588854, "grad_norm": 0.35697343945503235, "learning_rate": 4.745278464228808e-05, "loss": 0.1627, "step": 83 }, { "epoch": 0.02394185549380077, "grad_norm": 0.3543776571750641, "learning_rate": 4.69102611054575e-05, "loss": 0.1628, "step": 84 }, { "epoch": 0.024226877583012684, "grad_norm": 0.369007408618927, "learning_rate": 4.6364481422163926e-05, "loss": 0.1372, "step": 85 }, { "epoch": 0.0245118996722246, "grad_norm": 0.3529796302318573, "learning_rate": 4.581559480312316e-05, "loss": 0.1587, "step": 86 }, { "epoch": 0.02479692176143651, "grad_norm": 0.3713398575782776, "learning_rate": 4.526375130845627e-05, "loss": 0.1727, "step": 87 }, { "epoch": 0.025081943850648426, "grad_norm": 0.4033448398113251, "learning_rate": 4.4709101806664554e-05, "loss": 0.173, "step": 88 }, { "epoch": 0.02536696593986034, "grad_norm": 0.4021795690059662, "learning_rate": 4.4151797933383685e-05, "loss": 0.2043, "step": 89 }, { "epoch": 0.025651988029072252, "grad_norm": 0.42848914861679077, "learning_rate": 4.359199204992797e-05, "loss": 0.1973, "step": 90 }, { "epoch": 0.025937010118284167, "grad_norm": 0.4157668948173523, "learning_rate": 4.30298372016363e-05, "loss": 0.2053, "step": 91 }, { "epoch": 0.026222032207496082, "grad_norm": 0.4267609119415283, "learning_rate": 4.246548707603114e-05, "loss": 0.1497, "step": 92 }, { "epoch": 0.026507054296707994, "grad_norm": 0.43226972222328186, "learning_rate": 4.1899095960801805e-05, "loss": 0.1466, "step": 93 }, { "epoch": 0.02679207638591991, "grad_norm": 0.4445212483406067, "learning_rate": 4.133081870162385e-05, "loss": 0.1947, "step": 94 }, { "epoch": 0.027077098475131824, "grad_norm": 0.473412424325943, "learning_rate": 4.076081065982569e-05, "loss": 0.1696, "step": 95 }, { "epoch": 0.027362120564343735, "grad_norm": 0.46042153239250183, "learning_rate": 4.018922766991447e-05, "loss": 0.1877, "step": 96 }, { "epoch": 0.02764714265355565, "grad_norm": 0.3730030059814453, "learning_rate": 3.961622599697241e-05, "loss": 0.1714, "step": 97 }, { "epoch": 0.027932164742767565, "grad_norm": 0.37114113569259644, "learning_rate": 3.9041962293935516e-05, "loss": 0.1438, "step": 98 }, { "epoch": 0.02821718683197948, "grad_norm": 0.39887505769729614, "learning_rate": 3.84665935587662e-05, "loss": 0.1029, "step": 99 }, { "epoch": 0.02850220892119139, "grad_norm": 0.6135851144790649, "learning_rate": 3.7890277091531636e-05, "loss": 0.1606, "step": 100 }, { "epoch": 0.02850220892119139, "eval_loss": 0.22643481194972992, "eval_runtime": 450.3081, "eval_samples_per_second": 13.122, "eval_steps_per_second": 3.282, "step": 100 }, { "epoch": 0.028787231010403307, "grad_norm": 0.3865916430950165, "learning_rate": 3.7313170451399475e-05, "loss": 0.5026, "step": 101 }, { "epoch": 0.02907225309961522, "grad_norm": 0.35874301195144653, "learning_rate": 3.673543141356278e-05, "loss": 0.3046, "step": 102 }, { "epoch": 0.029357275188827133, "grad_norm": 0.39656662940979004, "learning_rate": 3.6157217926105783e-05, "loss": 0.3368, "step": 103 }, { "epoch": 0.029642297278039048, "grad_norm": 0.374759703874588, "learning_rate": 3.557868806682255e-05, "loss": 0.3207, "step": 104 }, { "epoch": 0.029927319367250963, "grad_norm": 0.37248775362968445, "learning_rate": 3.5e-05, "loss": 0.3293, "step": 105 }, { "epoch": 0.030212341456462875, "grad_norm": 0.3889812231063843, "learning_rate": 3.442131193317745e-05, "loss": 0.3961, "step": 106 }, { "epoch": 0.03049736354567479, "grad_norm": 0.3763032853603363, "learning_rate": 3.384278207389421e-05, "loss": 0.3335, "step": 107 }, { "epoch": 0.030782385634886705, "grad_norm": 0.339491069316864, "learning_rate": 3.3264568586437216e-05, "loss": 0.2283, "step": 108 }, { "epoch": 0.031067407724098616, "grad_norm": 0.37553462386131287, "learning_rate": 3.268682954860052e-05, "loss": 0.3024, "step": 109 }, { "epoch": 0.03135242981331053, "grad_norm": 0.3646965026855469, "learning_rate": 3.210972290846837e-05, "loss": 0.2577, "step": 110 }, { "epoch": 0.03163745190252244, "grad_norm": 0.4289148449897766, "learning_rate": 3.15334064412338e-05, "loss": 0.2685, "step": 111 }, { "epoch": 0.03192247399173436, "grad_norm": 0.3751957416534424, "learning_rate": 3.0958037706064485e-05, "loss": 0.2758, "step": 112 }, { "epoch": 0.03220749608094627, "grad_norm": 0.35714560747146606, "learning_rate": 3.038377400302758e-05, "loss": 0.2517, "step": 113 }, { "epoch": 0.032492518170158184, "grad_norm": 0.375448077917099, "learning_rate": 2.9810772330085524e-05, "loss": 0.2422, "step": 114 }, { "epoch": 0.0327775402593701, "grad_norm": 0.32101136445999146, "learning_rate": 2.9239189340174306e-05, "loss": 0.2002, "step": 115 }, { "epoch": 0.033062562348582014, "grad_norm": 0.34094324707984924, "learning_rate": 2.8669181298376163e-05, "loss": 0.2201, "step": 116 }, { "epoch": 0.033347584437793926, "grad_norm": 0.3905237913131714, "learning_rate": 2.8100904039198193e-05, "loss": 0.2291, "step": 117 }, { "epoch": 0.033632606527005844, "grad_norm": 0.3219447731971741, "learning_rate": 2.7534512923968863e-05, "loss": 0.189, "step": 118 }, { "epoch": 0.033917628616217756, "grad_norm": 0.3261171281337738, "learning_rate": 2.6970162798363695e-05, "loss": 0.257, "step": 119 }, { "epoch": 0.034202650705429674, "grad_norm": 0.5045903921127319, "learning_rate": 2.640800795007203e-05, "loss": 0.2139, "step": 120 }, { "epoch": 0.034487672794641586, "grad_norm": 0.3959028422832489, "learning_rate": 2.5848202066616305e-05, "loss": 0.2139, "step": 121 }, { "epoch": 0.0347726948838535, "grad_norm": 0.3482423722743988, "learning_rate": 2.5290898193335446e-05, "loss": 0.1893, "step": 122 }, { "epoch": 0.035057716973065416, "grad_norm": 0.3951508700847626, "learning_rate": 2.4736248691543736e-05, "loss": 0.2544, "step": 123 }, { "epoch": 0.03534273906227733, "grad_norm": 0.3452610373497009, "learning_rate": 2.4184405196876842e-05, "loss": 0.2525, "step": 124 }, { "epoch": 0.03562776115148924, "grad_norm": 0.4159131348133087, "learning_rate": 2.363551857783608e-05, "loss": 0.252, "step": 125 }, { "epoch": 0.03591278324070116, "grad_norm": 0.3287859261035919, "learning_rate": 2.308973889454249e-05, "loss": 0.177, "step": 126 }, { "epoch": 0.03619780532991307, "grad_norm": 0.32193413376808167, "learning_rate": 2.2547215357711918e-05, "loss": 0.1704, "step": 127 }, { "epoch": 0.03648282741912498, "grad_norm": 0.3499327003955841, "learning_rate": 2.2008096287862266e-05, "loss": 0.1827, "step": 128 }, { "epoch": 0.0367678495083369, "grad_norm": 0.3674137592315674, "learning_rate": 2.1472529074764177e-05, "loss": 0.1834, "step": 129 }, { "epoch": 0.03705287159754881, "grad_norm": 0.4516706168651581, "learning_rate": 2.0940660137146074e-05, "loss": 0.2258, "step": 130 }, { "epoch": 0.03733789368676072, "grad_norm": 0.39166364073753357, "learning_rate": 2.041263488266484e-05, "loss": 0.1813, "step": 131 }, { "epoch": 0.03762291577597264, "grad_norm": 0.3784855008125305, "learning_rate": 1.988859766815275e-05, "loss": 0.2024, "step": 132 }, { "epoch": 0.03790793786518455, "grad_norm": 0.3947729766368866, "learning_rate": 1.9368691760151773e-05, "loss": 0.1663, "step": 133 }, { "epoch": 0.03819295995439646, "grad_norm": 0.36646705865859985, "learning_rate": 1.885305929574593e-05, "loss": 0.1643, "step": 134 }, { "epoch": 0.03847798204360838, "grad_norm": 0.33460554480552673, "learning_rate": 1.8341841243702424e-05, "loss": 0.1819, "step": 135 }, { "epoch": 0.03876300413282029, "grad_norm": 0.35374724864959717, "learning_rate": 1.7835177365932225e-05, "loss": 0.1475, "step": 136 }, { "epoch": 0.039048026222032205, "grad_norm": 0.34457525610923767, "learning_rate": 1.7333206179280478e-05, "loss": 0.1321, "step": 137 }, { "epoch": 0.03933304831124412, "grad_norm": 0.3537154793739319, "learning_rate": 1.6836064917657478e-05, "loss": 0.13, "step": 138 }, { "epoch": 0.039618070400456035, "grad_norm": 0.3567149341106415, "learning_rate": 1.6343889494520224e-05, "loss": 0.1604, "step": 139 }, { "epoch": 0.039903092489667946, "grad_norm": 0.3486016094684601, "learning_rate": 1.5856814465715064e-05, "loss": 0.1709, "step": 140 }, { "epoch": 0.040188114578879865, "grad_norm": 0.41061216592788696, "learning_rate": 1.5374972992691458e-05, "loss": 0.1601, "step": 141 }, { "epoch": 0.040473136668091776, "grad_norm": 0.35205891728401184, "learning_rate": 1.4898496806096974e-05, "loss": 0.1441, "step": 142 }, { "epoch": 0.04075815875730369, "grad_norm": 0.3466411232948303, "learning_rate": 1.4427516169763444e-05, "loss": 0.1806, "step": 143 }, { "epoch": 0.041043180846515606, "grad_norm": 0.37699663639068604, "learning_rate": 1.396215984509412e-05, "loss": 0.1357, "step": 144 }, { "epoch": 0.04132820293572752, "grad_norm": 0.38561469316482544, "learning_rate": 1.3502555055861625e-05, "loss": 0.1794, "step": 145 }, { "epoch": 0.041613225024939436, "grad_norm": 0.33420026302337646, "learning_rate": 1.3048827453426203e-05, "loss": 0.1292, "step": 146 }, { "epoch": 0.04189824711415135, "grad_norm": 0.4164336919784546, "learning_rate": 1.2601101082383917e-05, "loss": 0.1478, "step": 147 }, { "epoch": 0.04218326920336326, "grad_norm": 0.3779417872428894, "learning_rate": 1.2159498346654094e-05, "loss": 0.1662, "step": 148 }, { "epoch": 0.04246829129257518, "grad_norm": 0.43226301670074463, "learning_rate": 1.1724139976015306e-05, "loss": 0.1497, "step": 149 }, { "epoch": 0.04275331338178709, "grad_norm": 0.4153372347354889, "learning_rate": 1.1295144993099068e-05, "loss": 0.1364, "step": 150 }, { "epoch": 0.04275331338178709, "eval_loss": 0.21325631439685822, "eval_runtime": 450.5894, "eval_samples_per_second": 13.114, "eval_steps_per_second": 3.28, "step": 150 }, { "epoch": 0.043038335470999, "grad_norm": 0.281355082988739, "learning_rate": 1.0872630680850196e-05, "loss": 0.306, "step": 151 }, { "epoch": 0.04332335756021092, "grad_norm": 0.3525282144546509, "learning_rate": 1.0456712550462898e-05, "loss": 0.4085, "step": 152 }, { "epoch": 0.04360837964942283, "grad_norm": 0.3436671197414398, "learning_rate": 1.0047504309801104e-05, "loss": 0.3022, "step": 153 }, { "epoch": 0.04389340173863474, "grad_norm": 0.3466765284538269, "learning_rate": 9.645117832311886e-06, "loss": 0.3076, "step": 154 }, { "epoch": 0.04417842382784666, "grad_norm": 0.3465212285518646, "learning_rate": 9.249663126440394e-06, "loss": 0.2955, "step": 155 }, { "epoch": 0.04446344591705857, "grad_norm": 0.35070881247520447, "learning_rate": 8.861248305554624e-06, "loss": 0.2727, "step": 156 }, { "epoch": 0.044748468006270484, "grad_norm": 0.36707839369773865, "learning_rate": 8.47997955838829e-06, "loss": 0.285, "step": 157 }, { "epoch": 0.0450334900954824, "grad_norm": 0.3160535395145416, "learning_rate": 8.10596112000994e-06, "loss": 0.2678, "step": 158 }, { "epoch": 0.045318512184694314, "grad_norm": 0.3351013958454132, "learning_rate": 7.739295243326067e-06, "loss": 0.2542, "step": 159 }, { "epoch": 0.045603534273906225, "grad_norm": 0.34178030490875244, "learning_rate": 7.380082171126228e-06, "loss": 0.2527, "step": 160 }, { "epoch": 0.045888556363118144, "grad_norm": 0.34601011872291565, "learning_rate": 7.028420108677635e-06, "loss": 0.2287, "step": 161 }, { "epoch": 0.046173578452330055, "grad_norm": 0.3409976065158844, "learning_rate": 6.684405196876842e-06, "loss": 0.2697, "step": 162 }, { "epoch": 0.04645860054154197, "grad_norm": 0.32013270258903503, "learning_rate": 6.3481314859657675e-06, "loss": 0.1984, "step": 163 }, { "epoch": 0.046743622630753885, "grad_norm": 0.3430671691894531, "learning_rate": 6.019690909819298e-06, "loss": 0.2599, "step": 164 }, { "epoch": 0.0470286447199658, "grad_norm": 0.306186705827713, "learning_rate": 5.6991732608115e-06, "loss": 0.2162, "step": 165 }, { "epoch": 0.04731366680917771, "grad_norm": 0.359751433134079, "learning_rate": 5.386666165267256e-06, "loss": 0.223, "step": 166 }, { "epoch": 0.04759868889838963, "grad_norm": 0.3521772623062134, "learning_rate": 5.08225505950613e-06, "loss": 0.2374, "step": 167 }, { "epoch": 0.04788371098760154, "grad_norm": 0.3709127604961395, "learning_rate": 4.786023166484913e-06, "loss": 0.2737, "step": 168 }, { "epoch": 0.04816873307681345, "grad_norm": 0.3508308529853821, "learning_rate": 4.498051473045291e-06, "loss": 0.2643, "step": 169 }, { "epoch": 0.04845375516602537, "grad_norm": 0.3326634466648102, "learning_rate": 4.218418707772886e-06, "loss": 0.2183, "step": 170 }, { "epoch": 0.04873877725523728, "grad_norm": 0.39563068747520447, "learning_rate": 3.947201319473587e-06, "loss": 0.2757, "step": 171 }, { "epoch": 0.0490237993444492, "grad_norm": 0.3203083574771881, "learning_rate": 3.684473456273278e-06, "loss": 0.1658, "step": 172 }, { "epoch": 0.04930882143366111, "grad_norm": 0.3669796884059906, "learning_rate": 3.4303069453464383e-06, "loss": 0.2462, "step": 173 }, { "epoch": 0.04959384352287302, "grad_norm": 0.3505097031593323, "learning_rate": 3.184771273279312e-06, "loss": 0.1769, "step": 174 }, { "epoch": 0.04987886561208494, "grad_norm": 0.33034375309944153, "learning_rate": 2.947933567072987e-06, "loss": 0.2077, "step": 175 }, { "epoch": 0.05016388770129685, "grad_norm": 0.31688281893730164, "learning_rate": 2.719858575791534e-06, "loss": 0.18, "step": 176 }, { "epoch": 0.05044890979050876, "grad_norm": 0.3017065227031708, "learning_rate": 2.500608652860256e-06, "loss": 0.18, "step": 177 }, { "epoch": 0.05073393187972068, "grad_norm": 0.4137319326400757, "learning_rate": 2.2902437390188737e-06, "loss": 0.235, "step": 178 }, { "epoch": 0.05101895396893259, "grad_norm": 0.3511267900466919, "learning_rate": 2.0888213459343587e-06, "loss": 0.1845, "step": 179 }, { "epoch": 0.051303976058144504, "grad_norm": 0.32434019446372986, "learning_rate": 1.8963965404777875e-06, "loss": 0.133, "step": 180 }, { "epoch": 0.05158899814735642, "grad_norm": 0.35862478613853455, "learning_rate": 1.7130219296696263e-06, "loss": 0.1806, "step": 181 }, { "epoch": 0.051874020236568334, "grad_norm": 0.36328455805778503, "learning_rate": 1.5387476462974824e-06, "loss": 0.1922, "step": 182 }, { "epoch": 0.052159042325780246, "grad_norm": 0.38937562704086304, "learning_rate": 1.3736213352103147e-06, "loss": 0.2177, "step": 183 }, { "epoch": 0.052444064414992164, "grad_norm": 0.2989131212234497, "learning_rate": 1.2176881402928002e-06, "loss": 0.1575, "step": 184 }, { "epoch": 0.052729086504204076, "grad_norm": 0.3432283103466034, "learning_rate": 1.0709906921234367e-06, "loss": 0.1829, "step": 185 }, { "epoch": 0.05301410859341599, "grad_norm": 0.3284221291542053, "learning_rate": 9.33569096319799e-07, "loss": 0.1379, "step": 186 }, { "epoch": 0.053299130682627906, "grad_norm": 0.3415944278240204, "learning_rate": 8.054609225740255e-07, "loss": 0.1432, "step": 187 }, { "epoch": 0.05358415277183982, "grad_norm": 0.30469295382499695, "learning_rate": 6.867011943816724e-07, "loss": 0.1457, "step": 188 }, { "epoch": 0.05386917486105173, "grad_norm": 0.3219231963157654, "learning_rate": 5.77322379466617e-07, "loss": 0.1493, "step": 189 }, { "epoch": 0.05415419695026365, "grad_norm": 0.3684932589530945, "learning_rate": 4.773543809047186e-07, "loss": 0.1963, "step": 190 }, { "epoch": 0.05443921903947556, "grad_norm": 0.30849573016166687, "learning_rate": 3.868245289486027e-07, "loss": 0.1481, "step": 191 }, { "epoch": 0.05472424112868747, "grad_norm": 0.40836629271507263, "learning_rate": 3.0575757355586817e-07, "loss": 0.2133, "step": 192 }, { "epoch": 0.05500926321789939, "grad_norm": 0.3660025894641876, "learning_rate": 2.3417567762266497e-07, "loss": 0.1697, "step": 193 }, { "epoch": 0.0552942853071113, "grad_norm": 0.37223172187805176, "learning_rate": 1.7209841092460043e-07, "loss": 0.1186, "step": 194 }, { "epoch": 0.05557930739632321, "grad_norm": 0.38398411870002747, "learning_rate": 1.1954274476655534e-07, "loss": 0.1734, "step": 195 }, { "epoch": 0.05586432948553513, "grad_norm": 0.3642955422401428, "learning_rate": 7.652304734289127e-08, "loss": 0.1738, "step": 196 }, { "epoch": 0.05614935157474704, "grad_norm": 0.3336471915245056, "learning_rate": 4.30510798093342e-08, "loss": 0.1381, "step": 197 }, { "epoch": 0.05643437366395896, "grad_norm": 0.4195612668991089, "learning_rate": 1.9135993067588284e-08, "loss": 0.17, "step": 198 }, { "epoch": 0.05671939575317087, "grad_norm": 0.4140312969684601, "learning_rate": 4.784325263584854e-09, "loss": 0.1507, "step": 199 }, { "epoch": 0.05700441784238278, "grad_norm": 0.4781356453895569, "learning_rate": 0.0, "loss": 0.143, "step": 200 }, { "epoch": 0.05700441784238278, "eval_loss": 0.2108319103717804, "eval_runtime": 450.142, "eval_samples_per_second": 13.127, "eval_steps_per_second": 3.283, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.024569725385441e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }