{ "best_metric": 0.07271432131528854, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.03939528241493081, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019697641207465407, "grad_norm": 3.348949670791626, "learning_rate": 2.9999999999999997e-05, "loss": 1.0606, "step": 1 }, { "epoch": 0.00019697641207465407, "eval_loss": 0.360129714012146, "eval_runtime": 10.5317, "eval_samples_per_second": 4.748, "eval_steps_per_second": 0.665, "step": 1 }, { "epoch": 0.00039395282414930814, "grad_norm": 3.371903657913208, "learning_rate": 5.9999999999999995e-05, "loss": 1.0354, "step": 2 }, { "epoch": 0.0005909292362239622, "grad_norm": 2.8037610054016113, "learning_rate": 8.999999999999999e-05, "loss": 0.9175, "step": 3 }, { "epoch": 0.0007879056482986163, "grad_norm": 1.5464385747909546, "learning_rate": 0.00011999999999999999, "loss": 0.5709, "step": 4 }, { "epoch": 0.0009848820603732703, "grad_norm": 1.5053009986877441, "learning_rate": 0.00015, "loss": 0.5139, "step": 5 }, { "epoch": 0.0011818584724479244, "grad_norm": 1.4200072288513184, "learning_rate": 0.00017999999999999998, "loss": 0.5191, "step": 6 }, { "epoch": 0.0013788348845225785, "grad_norm": 1.2713923454284668, "learning_rate": 0.00020999999999999998, "loss": 0.5693, "step": 7 }, { "epoch": 0.0015758112965972325, "grad_norm": 2.9913175106048584, "learning_rate": 0.00023999999999999998, "loss": 0.4701, "step": 8 }, { "epoch": 0.0017727877086718866, "grad_norm": 3.1078169345855713, "learning_rate": 0.00027, "loss": 0.4946, "step": 9 }, { "epoch": 0.0019697641207465405, "grad_norm": 1.0796478986740112, "learning_rate": 0.0003, "loss": 0.552, "step": 10 }, { "epoch": 0.0021667405328211946, "grad_norm": 0.8546186685562134, "learning_rate": 0.0002999794957488703, "loss": 0.4502, "step": 11 }, { "epoch": 0.0023637169448958487, "grad_norm": 1.210360050201416, "learning_rate": 0.0002999179886011389, "loss": 0.4655, "step": 12 }, { "epoch": 0.002560693356970503, "grad_norm": 1.6150785684585571, "learning_rate": 0.0002998154953722457, "loss": 0.4661, "step": 13 }, { "epoch": 0.002757669769045157, "grad_norm": 0.7436849474906921, "learning_rate": 0.00029967204408281613, "loss": 0.366, "step": 14 }, { "epoch": 0.002954646181119811, "grad_norm": 0.7962251901626587, "learning_rate": 0.00029948767395100045, "loss": 0.4605, "step": 15 }, { "epoch": 0.003151622593194465, "grad_norm": 0.9352697134017944, "learning_rate": 0.0002992624353817517, "loss": 0.382, "step": 16 }, { "epoch": 0.003348599005269119, "grad_norm": 0.9323279857635498, "learning_rate": 0.0002989963899530457, "loss": 0.5812, "step": 17 }, { "epoch": 0.0035455754173437733, "grad_norm": 0.8441579937934875, "learning_rate": 0.00029868961039904624, "loss": 0.4787, "step": 18 }, { "epoch": 0.003742551829418427, "grad_norm": 1.0008450746536255, "learning_rate": 0.00029834218059022024, "loss": 0.3968, "step": 19 }, { "epoch": 0.003939528241493081, "grad_norm": 0.7684171199798584, "learning_rate": 0.00029795419551040833, "loss": 0.3796, "step": 20 }, { "epoch": 0.004136504653567736, "grad_norm": 1.0816248655319214, "learning_rate": 0.00029752576123085736, "loss": 0.4181, "step": 21 }, { "epoch": 0.004333481065642389, "grad_norm": 0.8110284209251404, "learning_rate": 0.0002970569948812214, "loss": 0.3604, "step": 22 }, { "epoch": 0.004530457477717044, "grad_norm": 2.4406509399414062, "learning_rate": 0.0002965480246175399, "loss": 0.4562, "step": 23 }, { "epoch": 0.004727433889791697, "grad_norm": 1.13535475730896, "learning_rate": 0.0002959989895872009, "loss": 0.4112, "step": 24 }, { "epoch": 0.004924410301866351, "grad_norm": 0.845367968082428, "learning_rate": 0.0002954100398908995, "loss": 0.4399, "step": 25 }, { "epoch": 0.004924410301866351, "eval_loss": 0.11439956724643707, "eval_runtime": 12.8261, "eval_samples_per_second": 3.898, "eval_steps_per_second": 0.546, "step": 25 }, { "epoch": 0.005121386713941006, "grad_norm": 0.7668648362159729, "learning_rate": 0.0002947813365416023, "loss": 0.4863, "step": 26 }, { "epoch": 0.005318363126015659, "grad_norm": 0.8414048552513123, "learning_rate": 0.0002941130514205272, "loss": 0.4708, "step": 27 }, { "epoch": 0.005515339538090314, "grad_norm": 0.9618905782699585, "learning_rate": 0.0002934053672301536, "loss": 0.4237, "step": 28 }, { "epoch": 0.0057123159501649675, "grad_norm": 0.7617946267127991, "learning_rate": 0.00029265847744427303, "loss": 0.3429, "step": 29 }, { "epoch": 0.005909292362239622, "grad_norm": 1.3151721954345703, "learning_rate": 0.00029187258625509513, "loss": 0.5637, "step": 30 }, { "epoch": 0.006106268774314276, "grad_norm": 1.2373560667037964, "learning_rate": 0.00029104790851742417, "loss": 0.4711, "step": 31 }, { "epoch": 0.00630324518638893, "grad_norm": 0.9699512124061584, "learning_rate": 0.0002901846696899191, "loss": 0.3827, "step": 32 }, { "epoch": 0.006500221598463584, "grad_norm": 1.1460570096969604, "learning_rate": 0.00028928310577345606, "loss": 0.4132, "step": 33 }, { "epoch": 0.006697198010538238, "grad_norm": 1.2482142448425293, "learning_rate": 0.0002883434632466077, "loss": 0.4066, "step": 34 }, { "epoch": 0.006894174422612892, "grad_norm": 1.0130342245101929, "learning_rate": 0.00028736599899825856, "loss": 0.4401, "step": 35 }, { "epoch": 0.007091150834687547, "grad_norm": 1.153891921043396, "learning_rate": 0.00028635098025737434, "loss": 0.3825, "step": 36 }, { "epoch": 0.0072881272467622, "grad_norm": 1.4242093563079834, "learning_rate": 0.00028529868451994384, "loss": 0.3514, "step": 37 }, { "epoch": 0.007485103658836854, "grad_norm": 0.8260576725006104, "learning_rate": 0.0002842093994731145, "loss": 0.38, "step": 38 }, { "epoch": 0.007682080070911508, "grad_norm": 1.6134790182113647, "learning_rate": 0.00028308342291654174, "loss": 0.3237, "step": 39 }, { "epoch": 0.007879056482986162, "grad_norm": 1.4775500297546387, "learning_rate": 0.00028192106268097334, "loss": 0.503, "step": 40 }, { "epoch": 0.008076032895060816, "grad_norm": 1.2248170375823975, "learning_rate": 0.00028072263654409154, "loss": 0.3367, "step": 41 }, { "epoch": 0.008273009307135471, "grad_norm": 5.347848415374756, "learning_rate": 0.0002794884721436361, "loss": 0.3759, "step": 42 }, { "epoch": 0.008469985719210125, "grad_norm": 1.2572146654129028, "learning_rate": 0.00027821890688783083, "loss": 0.3936, "step": 43 }, { "epoch": 0.008666962131284778, "grad_norm": 1.1421171426773071, "learning_rate": 0.0002769142878631403, "loss": 0.2144, "step": 44 }, { "epoch": 0.008863938543359432, "grad_norm": 0.9987139701843262, "learning_rate": 0.00027557497173937923, "loss": 0.217, "step": 45 }, { "epoch": 0.009060914955434088, "grad_norm": 1.9025821685791016, "learning_rate": 0.000274201324672203, "loss": 0.2224, "step": 46 }, { "epoch": 0.009257891367508741, "grad_norm": 0.7751299738883972, "learning_rate": 0.00027279372220300385, "loss": 0.1837, "step": 47 }, { "epoch": 0.009454867779583395, "grad_norm": 2.193005084991455, "learning_rate": 0.0002713525491562421, "loss": 0.1896, "step": 48 }, { "epoch": 0.009651844191658049, "grad_norm": 1.596818447113037, "learning_rate": 0.00026987819953423867, "loss": 0.2651, "step": 49 }, { "epoch": 0.009848820603732702, "grad_norm": 10.099899291992188, "learning_rate": 0.00026837107640945905, "loss": 0.4042, "step": 50 }, { "epoch": 0.009848820603732702, "eval_loss": 0.14771892130374908, "eval_runtime": 10.3847, "eval_samples_per_second": 4.815, "eval_steps_per_second": 0.674, "step": 50 }, { "epoch": 0.010045797015807358, "grad_norm": 2.2570748329162598, "learning_rate": 0.0002668315918143169, "loss": 0.6271, "step": 51 }, { "epoch": 0.010242773427882011, "grad_norm": 1.3872201442718506, "learning_rate": 0.00026526016662852886, "loss": 0.47, "step": 52 }, { "epoch": 0.010439749839956665, "grad_norm": 0.9401403665542603, "learning_rate": 0.00026365723046405023, "loss": 0.443, "step": 53 }, { "epoch": 0.010636726252031319, "grad_norm": 0.6595192551612854, "learning_rate": 0.0002620232215476231, "loss": 0.3299, "step": 54 }, { "epoch": 0.010833702664105974, "grad_norm": 0.527516782283783, "learning_rate": 0.0002603585866009697, "loss": 0.2561, "step": 55 }, { "epoch": 0.011030679076180628, "grad_norm": 0.5898721814155579, "learning_rate": 0.00025866378071866334, "loss": 0.3284, "step": 56 }, { "epoch": 0.011227655488255281, "grad_norm": 0.6567277908325195, "learning_rate": 0.00025693926724370956, "loss": 0.3464, "step": 57 }, { "epoch": 0.011424631900329935, "grad_norm": 0.6930991411209106, "learning_rate": 0.00025518551764087326, "loss": 0.3655, "step": 58 }, { "epoch": 0.01162160831240459, "grad_norm": 0.6276407241821289, "learning_rate": 0.00025340301136778483, "loss": 0.3158, "step": 59 }, { "epoch": 0.011818584724479244, "grad_norm": 0.6224534511566162, "learning_rate": 0.00025159223574386114, "loss": 0.3519, "step": 60 }, { "epoch": 0.012015561136553898, "grad_norm": 0.6579044461250305, "learning_rate": 0.0002497536858170772, "loss": 0.3508, "step": 61 }, { "epoch": 0.012212537548628551, "grad_norm": 0.5916846394538879, "learning_rate": 0.00024788786422862526, "loss": 0.3069, "step": 62 }, { "epoch": 0.012409513960703205, "grad_norm": 0.6183111071586609, "learning_rate": 0.00024599528107549745, "loss": 0.2878, "step": 63 }, { "epoch": 0.01260649037277786, "grad_norm": 0.6713694334030151, "learning_rate": 0.00024407645377103054, "loss": 0.3213, "step": 64 }, { "epoch": 0.012803466784852514, "grad_norm": 0.8130861520767212, "learning_rate": 0.00024213190690345018, "loss": 0.3197, "step": 65 }, { "epoch": 0.013000443196927168, "grad_norm": 0.4906407594680786, "learning_rate": 0.00024016217209245374, "loss": 0.2504, "step": 66 }, { "epoch": 0.013197419609001821, "grad_norm": 0.6218925714492798, "learning_rate": 0.00023816778784387094, "loss": 0.2868, "step": 67 }, { "epoch": 0.013394396021076477, "grad_norm": 0.6536829471588135, "learning_rate": 0.0002361492994024415, "loss": 0.2905, "step": 68 }, { "epoch": 0.01359137243315113, "grad_norm": 0.5276692509651184, "learning_rate": 0.0002341072586027509, "loss": 0.2757, "step": 69 }, { "epoch": 0.013788348845225784, "grad_norm": 0.8190099000930786, "learning_rate": 0.00023204222371836405, "loss": 0.3012, "step": 70 }, { "epoch": 0.013985325257300438, "grad_norm": 0.7881957292556763, "learning_rate": 0.00022995475930919905, "loss": 0.2993, "step": 71 }, { "epoch": 0.014182301669375093, "grad_norm": 0.8249359726905823, "learning_rate": 0.00022784543606718227, "loss": 0.3406, "step": 72 }, { "epoch": 0.014379278081449747, "grad_norm": 0.9758163690567017, "learning_rate": 0.00022571483066022657, "loss": 0.2938, "step": 73 }, { "epoch": 0.0145762544935244, "grad_norm": 0.6984056234359741, "learning_rate": 0.0002235635255745762, "loss": 0.2965, "step": 74 }, { "epoch": 0.014773230905599054, "grad_norm": 0.8371394276618958, "learning_rate": 0.00022139210895556104, "loss": 0.3191, "step": 75 }, { "epoch": 0.014773230905599054, "eval_loss": 0.0825795829296112, "eval_runtime": 10.3801, "eval_samples_per_second": 4.817, "eval_steps_per_second": 0.674, "step": 75 }, { "epoch": 0.014970207317673708, "grad_norm": 0.6708223223686218, "learning_rate": 0.00021920117444680317, "loss": 0.3168, "step": 76 }, { "epoch": 0.015167183729748363, "grad_norm": 0.6277792453765869, "learning_rate": 0.00021699132102792097, "loss": 0.2699, "step": 77 }, { "epoch": 0.015364160141823017, "grad_norm": 1.0930883884429932, "learning_rate": 0.0002147631528507739, "loss": 0.3786, "step": 78 }, { "epoch": 0.01556113655389767, "grad_norm": 0.6358778476715088, "learning_rate": 0.00021251727907429355, "loss": 0.3144, "step": 79 }, { "epoch": 0.015758112965972324, "grad_norm": 0.7981488108634949, "learning_rate": 0.0002102543136979454, "loss": 0.3314, "step": 80 }, { "epoch": 0.015955089378046978, "grad_norm": 0.7823123335838318, "learning_rate": 0.0002079748753938678, "loss": 0.2656, "step": 81 }, { "epoch": 0.01615206579012163, "grad_norm": 0.6537957787513733, "learning_rate": 0.0002056795873377331, "loss": 0.316, "step": 82 }, { "epoch": 0.01634904220219629, "grad_norm": 0.6539929509162903, "learning_rate": 0.00020336907703837748, "loss": 0.2713, "step": 83 }, { "epoch": 0.016546018614270942, "grad_norm": 1.0729496479034424, "learning_rate": 0.00020104397616624645, "loss": 0.2545, "step": 84 }, { "epoch": 0.016742995026345596, "grad_norm": 0.7264039516448975, "learning_rate": 0.00019870492038070252, "loss": 0.3414, "step": 85 }, { "epoch": 0.01693997143842025, "grad_norm": 0.7242591977119446, "learning_rate": 0.0001963525491562421, "loss": 0.275, "step": 86 }, { "epoch": 0.017136947850494903, "grad_norm": 0.860544741153717, "learning_rate": 0.0001939875056076697, "loss": 0.2932, "step": 87 }, { "epoch": 0.017333924262569557, "grad_norm": 0.851104199886322, "learning_rate": 0.00019161043631427666, "loss": 0.2768, "step": 88 }, { "epoch": 0.01753090067464421, "grad_norm": 0.8001173138618469, "learning_rate": 0.00018922199114307294, "loss": 0.2581, "step": 89 }, { "epoch": 0.017727877086718864, "grad_norm": 1.0649043321609497, "learning_rate": 0.00018682282307111987, "loss": 0.3063, "step": 90 }, { "epoch": 0.017924853498793518, "grad_norm": 0.90488201379776, "learning_rate": 0.00018441358800701273, "loss": 0.2846, "step": 91 }, { "epoch": 0.018121829910868175, "grad_norm": 0.955570638179779, "learning_rate": 0.00018199494461156203, "loss": 0.3494, "step": 92 }, { "epoch": 0.01831880632294283, "grad_norm": 1.2587333917617798, "learning_rate": 0.000179567554117722, "loss": 0.2484, "step": 93 }, { "epoch": 0.018515782735017482, "grad_norm": 0.7618815898895264, "learning_rate": 0.00017713208014981648, "loss": 0.2205, "step": 94 }, { "epoch": 0.018712759147092136, "grad_norm": 0.8597466945648193, "learning_rate": 0.00017468918854211007, "loss": 0.2238, "step": 95 }, { "epoch": 0.01890973555916679, "grad_norm": 1.390031099319458, "learning_rate": 0.00017223954715677627, "loss": 0.2533, "step": 96 }, { "epoch": 0.019106711971241443, "grad_norm": 0.9729688763618469, "learning_rate": 0.00016978382570131034, "loss": 0.1609, "step": 97 }, { "epoch": 0.019303688383316097, "grad_norm": 0.6992880702018738, "learning_rate": 0.00016732269554543794, "loss": 0.1844, "step": 98 }, { "epoch": 0.01950066479539075, "grad_norm": 1.1221824884414673, "learning_rate": 0.00016485682953756942, "loss": 0.2346, "step": 99 }, { "epoch": 0.019697641207465404, "grad_norm": 2.233081817626953, "learning_rate": 0.00016238690182084986, "loss": 0.4078, "step": 100 }, { "epoch": 0.019697641207465404, "eval_loss": 0.11340178549289703, "eval_runtime": 12.7864, "eval_samples_per_second": 3.91, "eval_steps_per_second": 0.547, "step": 100 }, { "epoch": 0.01989461761954006, "grad_norm": 1.3194266557693481, "learning_rate": 0.0001599135876488549, "loss": 0.4517, "step": 101 }, { "epoch": 0.020091594031614715, "grad_norm": 1.0158177614212036, "learning_rate": 0.00015743756320098332, "loss": 0.4222, "step": 102 }, { "epoch": 0.02028857044368937, "grad_norm": 0.84644615650177, "learning_rate": 0.0001549595053975962, "loss": 0.3635, "step": 103 }, { "epoch": 0.020485546855764022, "grad_norm": 0.6677044630050659, "learning_rate": 0.00015248009171495378, "loss": 0.2531, "step": 104 }, { "epoch": 0.020682523267838676, "grad_norm": 0.5319753289222717, "learning_rate": 0.00015, "loss": 0.2562, "step": 105 }, { "epoch": 0.02087949967991333, "grad_norm": 0.5926326513290405, "learning_rate": 0.00014751990828504622, "loss": 0.3177, "step": 106 }, { "epoch": 0.021076476091987983, "grad_norm": 0.8615707755088806, "learning_rate": 0.00014504049460240375, "loss": 0.2798, "step": 107 }, { "epoch": 0.021273452504062637, "grad_norm": 0.5553646683692932, "learning_rate": 0.00014256243679901663, "loss": 0.3154, "step": 108 }, { "epoch": 0.021470428916137294, "grad_norm": 0.42339223623275757, "learning_rate": 0.00014008641235114508, "loss": 0.2716, "step": 109 }, { "epoch": 0.021667405328211948, "grad_norm": 0.46130532026290894, "learning_rate": 0.00013761309817915014, "loss": 0.217, "step": 110 }, { "epoch": 0.0218643817402866, "grad_norm": 0.42555496096611023, "learning_rate": 0.00013514317046243058, "loss": 0.2988, "step": 111 }, { "epoch": 0.022061358152361255, "grad_norm": 0.47253310680389404, "learning_rate": 0.00013267730445456208, "loss": 0.2824, "step": 112 }, { "epoch": 0.02225833456443591, "grad_norm": 0.7298460602760315, "learning_rate": 0.00013021617429868963, "loss": 0.2986, "step": 113 }, { "epoch": 0.022455310976510563, "grad_norm": 0.575012743473053, "learning_rate": 0.00012776045284322368, "loss": 0.2342, "step": 114 }, { "epoch": 0.022652287388585216, "grad_norm": 0.5378597974777222, "learning_rate": 0.00012531081145788987, "loss": 0.2058, "step": 115 }, { "epoch": 0.02284926380065987, "grad_norm": 0.8967413902282715, "learning_rate": 0.00012286791985018355, "loss": 0.3315, "step": 116 }, { "epoch": 0.023046240212734524, "grad_norm": 0.6629744172096252, "learning_rate": 0.00012043244588227796, "loss": 0.3345, "step": 117 }, { "epoch": 0.02324321662480918, "grad_norm": 0.5112919807434082, "learning_rate": 0.00011800505538843798, "loss": 0.2614, "step": 118 }, { "epoch": 0.023440193036883834, "grad_norm": 0.34445106983184814, "learning_rate": 0.00011558641199298727, "loss": 0.2409, "step": 119 }, { "epoch": 0.023637169448958488, "grad_norm": 0.5876243710517883, "learning_rate": 0.00011317717692888012, "loss": 0.2454, "step": 120 }, { "epoch": 0.02383414586103314, "grad_norm": 0.5282692909240723, "learning_rate": 0.00011077800885692702, "loss": 0.283, "step": 121 }, { "epoch": 0.024031122273107795, "grad_norm": 0.8354734778404236, "learning_rate": 0.00010838956368572334, "loss": 0.3019, "step": 122 }, { "epoch": 0.02422809868518245, "grad_norm": 0.5572185516357422, "learning_rate": 0.0001060124943923303, "loss": 0.2843, "step": 123 }, { "epoch": 0.024425075097257103, "grad_norm": 0.4937724173069, "learning_rate": 0.0001036474508437579, "loss": 0.2486, "step": 124 }, { "epoch": 0.024622051509331756, "grad_norm": 0.658233106136322, "learning_rate": 0.00010129507961929748, "loss": 0.2915, "step": 125 }, { "epoch": 0.024622051509331756, "eval_loss": 0.07672200351953506, "eval_runtime": 10.6957, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.654, "step": 125 }, { "epoch": 0.02481902792140641, "grad_norm": 0.6942847967147827, "learning_rate": 9.895602383375353e-05, "loss": 0.3618, "step": 126 }, { "epoch": 0.025016004333481067, "grad_norm": 0.9075064063072205, "learning_rate": 9.663092296162251e-05, "loss": 0.3206, "step": 127 }, { "epoch": 0.02521298074555572, "grad_norm": 0.5630618333816528, "learning_rate": 9.432041266226686e-05, "loss": 0.2901, "step": 128 }, { "epoch": 0.025409957157630374, "grad_norm": 0.5655243396759033, "learning_rate": 9.202512460613219e-05, "loss": 0.2592, "step": 129 }, { "epoch": 0.025606933569705028, "grad_norm": 0.6408135890960693, "learning_rate": 8.97456863020546e-05, "loss": 0.3255, "step": 130 }, { "epoch": 0.02580390998177968, "grad_norm": 0.4605986177921295, "learning_rate": 8.748272092570646e-05, "loss": 0.2667, "step": 131 }, { "epoch": 0.026000886393854335, "grad_norm": 0.6698881387710571, "learning_rate": 8.523684714922608e-05, "loss": 0.2609, "step": 132 }, { "epoch": 0.02619786280592899, "grad_norm": 0.5498625636100769, "learning_rate": 8.300867897207903e-05, "loss": 0.2256, "step": 133 }, { "epoch": 0.026394839218003643, "grad_norm": 0.5972995162010193, "learning_rate": 8.079882555319684e-05, "loss": 0.2337, "step": 134 }, { "epoch": 0.0265918156300783, "grad_norm": 0.9262932538986206, "learning_rate": 7.860789104443896e-05, "loss": 0.4045, "step": 135 }, { "epoch": 0.026788792042152954, "grad_norm": 0.7597740888595581, "learning_rate": 7.643647442542382e-05, "loss": 0.3238, "step": 136 }, { "epoch": 0.026985768454227607, "grad_norm": 0.6598590612411499, "learning_rate": 7.428516933977347e-05, "loss": 0.2864, "step": 137 }, { "epoch": 0.02718274486630226, "grad_norm": 1.9483425617218018, "learning_rate": 7.215456393281776e-05, "loss": 0.2428, "step": 138 }, { "epoch": 0.027379721278376914, "grad_norm": 0.6512977480888367, "learning_rate": 7.004524069080096e-05, "loss": 0.2453, "step": 139 }, { "epoch": 0.027576697690451568, "grad_norm": 0.864540159702301, "learning_rate": 6.795777628163599e-05, "loss": 0.2991, "step": 140 }, { "epoch": 0.027773674102526222, "grad_norm": 0.7853538393974304, "learning_rate": 6.58927413972491e-05, "loss": 0.2653, "step": 141 }, { "epoch": 0.027970650514600875, "grad_norm": 0.6818575263023376, "learning_rate": 6.385070059755846e-05, "loss": 0.2376, "step": 142 }, { "epoch": 0.02816762692667553, "grad_norm": 0.8303399085998535, "learning_rate": 6.183221215612904e-05, "loss": 0.2446, "step": 143 }, { "epoch": 0.028364603338750186, "grad_norm": 0.4680931270122528, "learning_rate": 5.983782790754623e-05, "loss": 0.1331, "step": 144 }, { "epoch": 0.02856157975082484, "grad_norm": 0.6611976623535156, "learning_rate": 5.786809309654982e-05, "loss": 0.1828, "step": 145 }, { "epoch": 0.028758556162899494, "grad_norm": 0.5369855761528015, "learning_rate": 5.592354622896944e-05, "loss": 0.1183, "step": 146 }, { "epoch": 0.028955532574974147, "grad_norm": 0.8444812893867493, "learning_rate": 5.40047189245025e-05, "loss": 0.1247, "step": 147 }, { "epoch": 0.0291525089870488, "grad_norm": 0.8747226595878601, "learning_rate": 5.211213577137469e-05, "loss": 0.1633, "step": 148 }, { "epoch": 0.029349485399123455, "grad_norm": 0.8830420970916748, "learning_rate": 5.024631418292274e-05, "loss": 0.1816, "step": 149 }, { "epoch": 0.029546461811198108, "grad_norm": 3.1225264072418213, "learning_rate": 4.840776425613886e-05, "loss": 0.2376, "step": 150 }, { "epoch": 0.029546461811198108, "eval_loss": 0.08567950874567032, "eval_runtime": 9.1661, "eval_samples_per_second": 5.455, "eval_steps_per_second": 0.764, "step": 150 }, { "epoch": 0.029743438223272762, "grad_norm": 0.9880196452140808, "learning_rate": 4.659698863221513e-05, "loss": 0.3546, "step": 151 }, { "epoch": 0.029940414635347416, "grad_norm": 0.7226404547691345, "learning_rate": 4.481448235912671e-05, "loss": 0.2702, "step": 152 }, { "epoch": 0.030137391047422073, "grad_norm": 0.8038386702537537, "learning_rate": 4.306073275629044e-05, "loss": 0.3077, "step": 153 }, { "epoch": 0.030334367459496726, "grad_norm": 0.6242251396179199, "learning_rate": 4.133621928133665e-05, "loss": 0.2612, "step": 154 }, { "epoch": 0.03053134387157138, "grad_norm": 0.6672581434249878, "learning_rate": 3.964141339903026e-05, "loss": 0.2917, "step": 155 }, { "epoch": 0.030728320283646034, "grad_norm": 0.5822924375534058, "learning_rate": 3.797677845237696e-05, "loss": 0.3038, "step": 156 }, { "epoch": 0.030925296695720687, "grad_norm": 0.5603398680686951, "learning_rate": 3.634276953594982e-05, "loss": 0.2806, "step": 157 }, { "epoch": 0.03112227310779534, "grad_norm": 0.526652991771698, "learning_rate": 3.473983337147118e-05, "loss": 0.3352, "step": 158 }, { "epoch": 0.031319249519869995, "grad_norm": 0.5369526147842407, "learning_rate": 3.316840818568315e-05, "loss": 0.2806, "step": 159 }, { "epoch": 0.03151622593194465, "grad_norm": 0.4179346263408661, "learning_rate": 3.162892359054098e-05, "loss": 0.2744, "step": 160 }, { "epoch": 0.0317132023440193, "grad_norm": 0.6778124570846558, "learning_rate": 3.0121800465761293e-05, "loss": 0.2826, "step": 161 }, { "epoch": 0.031910178756093956, "grad_norm": 0.37569910287857056, "learning_rate": 2.8647450843757897e-05, "loss": 0.2533, "step": 162 }, { "epoch": 0.03210715516816861, "grad_norm": 0.33460506796836853, "learning_rate": 2.7206277796996144e-05, "loss": 0.2703, "step": 163 }, { "epoch": 0.03230413158024326, "grad_norm": 0.5205643177032471, "learning_rate": 2.5798675327796993e-05, "loss": 0.3269, "step": 164 }, { "epoch": 0.03250110799231792, "grad_norm": 0.8476812243461609, "learning_rate": 2.4425028260620715e-05, "loss": 0.3097, "step": 165 }, { "epoch": 0.03269808440439258, "grad_norm": 0.48694828152656555, "learning_rate": 2.3085712136859668e-05, "loss": 0.2699, "step": 166 }, { "epoch": 0.03289506081646723, "grad_norm": 0.7992863655090332, "learning_rate": 2.178109311216913e-05, "loss": 0.2547, "step": 167 }, { "epoch": 0.033092037228541885, "grad_norm": 0.489131361246109, "learning_rate": 2.0511527856363912e-05, "loss": 0.2847, "step": 168 }, { "epoch": 0.03328901364061654, "grad_norm": 0.7267995476722717, "learning_rate": 1.927736345590839e-05, "loss": 0.1622, "step": 169 }, { "epoch": 0.03348599005269119, "grad_norm": 0.4803732633590698, "learning_rate": 1.8078937319026654e-05, "loss": 0.2689, "step": 170 }, { "epoch": 0.033682966464765846, "grad_norm": 0.4199903905391693, "learning_rate": 1.6916577083458228e-05, "loss": 0.299, "step": 171 }, { "epoch": 0.0338799428768405, "grad_norm": 0.36284616589546204, "learning_rate": 1.579060052688548e-05, "loss": 0.262, "step": 172 }, { "epoch": 0.03407691928891515, "grad_norm": 0.4913172423839569, "learning_rate": 1.4701315480056164e-05, "loss": 0.2382, "step": 173 }, { "epoch": 0.03427389570098981, "grad_norm": 0.701080322265625, "learning_rate": 1.3649019742625623e-05, "loss": 0.2878, "step": 174 }, { "epoch": 0.03447087211306446, "grad_norm": 0.4284681975841522, "learning_rate": 1.2634001001741373e-05, "loss": 0.2711, "step": 175 }, { "epoch": 0.03447087211306446, "eval_loss": 0.07281436771154404, "eval_runtime": 11.8755, "eval_samples_per_second": 4.21, "eval_steps_per_second": 0.589, "step": 175 }, { "epoch": 0.034667848525139114, "grad_norm": 0.5271365642547607, "learning_rate": 1.1656536753392287e-05, "loss": 0.2571, "step": 176 }, { "epoch": 0.03486482493721377, "grad_norm": 0.4727443754673004, "learning_rate": 1.0716894226543953e-05, "loss": 0.2677, "step": 177 }, { "epoch": 0.03506180134928842, "grad_norm": 0.37858378887176514, "learning_rate": 9.815330310080887e-06, "loss": 0.2534, "step": 178 }, { "epoch": 0.035258777761363075, "grad_norm": 0.4432028830051422, "learning_rate": 8.952091482575824e-06, "loss": 0.3306, "step": 179 }, { "epoch": 0.03545575417343773, "grad_norm": 0.4843367636203766, "learning_rate": 8.127413744904804e-06, "loss": 0.2407, "step": 180 }, { "epoch": 0.03565273058551238, "grad_norm": 0.5236724615097046, "learning_rate": 7.34152255572697e-06, "loss": 0.2688, "step": 181 }, { "epoch": 0.035849706997587036, "grad_norm": 0.688205897808075, "learning_rate": 6.594632769846353e-06, "loss": 0.3668, "step": 182 }, { "epoch": 0.036046683409661696, "grad_norm": 0.4552861452102661, "learning_rate": 5.886948579472778e-06, "loss": 0.275, "step": 183 }, { "epoch": 0.03624365982173635, "grad_norm": 0.5080707669258118, "learning_rate": 5.218663458397715e-06, "loss": 0.2851, "step": 184 }, { "epoch": 0.036440636233811004, "grad_norm": 0.43050211668014526, "learning_rate": 4.589960109100444e-06, "loss": 0.2722, "step": 185 }, { "epoch": 0.03663761264588566, "grad_norm": 0.548950731754303, "learning_rate": 4.001010412799138e-06, "loss": 0.2045, "step": 186 }, { "epoch": 0.03683458905796031, "grad_norm": 0.42442962527275085, "learning_rate": 3.451975382460109e-06, "loss": 0.3006, "step": 187 }, { "epoch": 0.037031565470034965, "grad_norm": 0.5989577174186707, "learning_rate": 2.9430051187785962e-06, "loss": 0.1922, "step": 188 }, { "epoch": 0.03722854188210962, "grad_norm": 0.528300404548645, "learning_rate": 2.4742387691426445e-06, "loss": 0.2248, "step": 189 }, { "epoch": 0.03742551829418427, "grad_norm": 0.5548564195632935, "learning_rate": 2.0458044895916513e-06, "loss": 0.2838, "step": 190 }, { "epoch": 0.037622494706258926, "grad_norm": 0.6061025857925415, "learning_rate": 1.6578194097797258e-06, "loss": 0.2634, "step": 191 }, { "epoch": 0.03781947111833358, "grad_norm": 0.6724241971969604, "learning_rate": 1.3103896009537207e-06, "loss": 0.3241, "step": 192 }, { "epoch": 0.03801644753040823, "grad_norm": 0.629009485244751, "learning_rate": 1.0036100469542786e-06, "loss": 0.2181, "step": 193 }, { "epoch": 0.03821342394248289, "grad_norm": 0.6329449415206909, "learning_rate": 7.375646182482875e-07, "loss": 0.1653, "step": 194 }, { "epoch": 0.03841040035455754, "grad_norm": 0.5899990200996399, "learning_rate": 5.123260489995229e-07, "loss": 0.2021, "step": 195 }, { "epoch": 0.038607376766632194, "grad_norm": 0.6992418169975281, "learning_rate": 3.2795591718381975e-07, "loss": 0.1811, "step": 196 }, { "epoch": 0.03880435317870685, "grad_norm": 0.7541390657424927, "learning_rate": 1.8450462775428942e-07, "loss": 0.1525, "step": 197 }, { "epoch": 0.0390013295907815, "grad_norm": 0.5883744955062866, "learning_rate": 8.201139886109264e-08, "loss": 0.143, "step": 198 }, { "epoch": 0.039198306002856155, "grad_norm": 0.8709452748298645, "learning_rate": 2.0504251129649374e-08, "loss": 0.1841, "step": 199 }, { "epoch": 0.03939528241493081, "grad_norm": 0.9281623363494873, "learning_rate": 0.0, "loss": 0.2595, "step": 200 }, { "epoch": 0.03939528241493081, "eval_loss": 0.07271432131528854, "eval_runtime": 12.7749, "eval_samples_per_second": 3.914, "eval_steps_per_second": 0.548, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.47763455488426e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }