{
  "best_metric": 0.6604854464530945,
  "best_model_checkpoint": "miner_id_24/checkpoint-50",
  "epoch": 1.0027855153203342,
  "eval_steps": 50,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011142061281337047,
      "grad_norm": 1.0420100688934326,
      "learning_rate": 1e-05,
      "loss": 1.1642,
      "step": 1
    },
    {
      "epoch": 0.011142061281337047,
      "eval_loss": 1.8091527223587036,
      "eval_runtime": 17.0987,
      "eval_samples_per_second": 8.89,
      "eval_steps_per_second": 2.222,
      "step": 1
    },
    {
      "epoch": 0.022284122562674095,
      "grad_norm": 1.1613749265670776,
      "learning_rate": 2e-05,
      "loss": 1.2725,
      "step": 2
    },
    {
      "epoch": 0.033426183844011144,
      "grad_norm": 1.1371344327926636,
      "learning_rate": 3e-05,
      "loss": 1.4025,
      "step": 3
    },
    {
      "epoch": 0.04456824512534819,
      "grad_norm": 1.4346470832824707,
      "learning_rate": 4e-05,
      "loss": 1.4288,
      "step": 4
    },
    {
      "epoch": 0.055710306406685235,
      "grad_norm": 1.3074644804000854,
      "learning_rate": 5e-05,
      "loss": 1.4659,
      "step": 5
    },
    {
      "epoch": 0.06685236768802229,
      "grad_norm": 1.416764497756958,
      "learning_rate": 6e-05,
      "loss": 1.439,
      "step": 6
    },
    {
      "epoch": 0.07799442896935933,
      "grad_norm": 1.8262981176376343,
      "learning_rate": 7e-05,
      "loss": 1.2719,
      "step": 7
    },
    {
      "epoch": 0.08913649025069638,
      "grad_norm": 1.8879832029342651,
      "learning_rate": 8e-05,
      "loss": 1.1828,
      "step": 8
    },
    {
      "epoch": 0.10027855153203342,
      "grad_norm": 1.7780221700668335,
      "learning_rate": 9e-05,
      "loss": 0.9261,
      "step": 9
    },
    {
      "epoch": 0.11142061281337047,
      "grad_norm": 1.4061527252197266,
      "learning_rate": 0.0001,
      "loss": 0.8449,
      "step": 10
    },
    {
      "epoch": 0.12256267409470752,
      "grad_norm": 1.4167653322219849,
      "learning_rate": 9.996145181203615e-05,
      "loss": 0.9189,
      "step": 11
    },
    {
      "epoch": 0.13370473537604458,
      "grad_norm": 1.4484316110610962,
      "learning_rate": 9.98458666866564e-05,
      "loss": 0.8569,
      "step": 12
    },
    {
      "epoch": 0.14484679665738162,
      "grad_norm": 1.3803526163101196,
      "learning_rate": 9.965342284774632e-05,
      "loss": 0.7734,
      "step": 13
    },
    {
      "epoch": 0.15598885793871867,
      "grad_norm": 1.325583815574646,
      "learning_rate": 9.938441702975689e-05,
      "loss": 0.8063,
      "step": 14
    },
    {
      "epoch": 0.1671309192200557,
      "grad_norm": 1.6950255632400513,
      "learning_rate": 9.903926402016153e-05,
      "loss": 0.9614,
      "step": 15
    },
    {
      "epoch": 0.17827298050139276,
      "grad_norm": 1.4282883405685425,
      "learning_rate": 9.861849601988383e-05,
      "loss": 0.7504,
      "step": 16
    },
    {
      "epoch": 0.1894150417827298,
      "grad_norm": 1.3364102840423584,
      "learning_rate": 9.812276182268236e-05,
      "loss": 0.7784,
      "step": 17
    },
    {
      "epoch": 0.20055710306406685,
      "grad_norm": 1.332399606704712,
      "learning_rate": 9.755282581475769e-05,
      "loss": 0.7068,
      "step": 18
    },
    {
      "epoch": 0.2116991643454039,
      "grad_norm": 1.4372507333755493,
      "learning_rate": 9.690956679612421e-05,
      "loss": 0.699,
      "step": 19
    },
    {
      "epoch": 0.22284122562674094,
      "grad_norm": 1.3753567934036255,
      "learning_rate": 9.619397662556435e-05,
      "loss": 0.8339,
      "step": 20
    },
    {
      "epoch": 0.233983286908078,
      "grad_norm": 1.4257323741912842,
      "learning_rate": 9.540715869125407e-05,
      "loss": 0.6704,
      "step": 21
    },
    {
      "epoch": 0.24512534818941503,
      "grad_norm": 2.0480964183807373,
      "learning_rate": 9.45503262094184e-05,
      "loss": 0.6739,
      "step": 22
    },
    {
      "epoch": 0.2562674094707521,
      "grad_norm": 0.6940401792526245,
      "learning_rate": 9.362480035363986e-05,
      "loss": 0.7361,
      "step": 23
    },
    {
      "epoch": 0.26740947075208915,
      "grad_norm": 0.8433642983436584,
      "learning_rate": 9.263200821770461e-05,
      "loss": 0.7383,
      "step": 24
    },
    {
      "epoch": 0.2785515320334262,
      "grad_norm": 0.8747265338897705,
      "learning_rate": 9.157348061512727e-05,
      "loss": 0.772,
      "step": 25
    },
    {
      "epoch": 0.28969359331476324,
      "grad_norm": 0.8755362629890442,
      "learning_rate": 9.045084971874738e-05,
      "loss": 0.747,
      "step": 26
    },
    {
      "epoch": 0.3008356545961003,
      "grad_norm": 1.0401290655136108,
      "learning_rate": 8.926584654403724e-05,
      "loss": 0.789,
      "step": 27
    },
    {
      "epoch": 0.31197771587743733,
      "grad_norm": 0.9953641295433044,
      "learning_rate": 8.802029828000156e-05,
      "loss": 0.8023,
      "step": 28
    },
    {
      "epoch": 0.3231197771587744,
      "grad_norm": 1.0071543455123901,
      "learning_rate": 8.671612547178428e-05,
      "loss": 0.6908,
      "step": 29
    },
    {
      "epoch": 0.3342618384401114,
      "grad_norm": 1.1228718757629395,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.7995,
      "step": 30
    },
    {
      "epoch": 0.34540389972144847,
      "grad_norm": 1.0646946430206299,
      "learning_rate": 8.39400372766471e-05,
      "loss": 0.6854,
      "step": 31
    },
    {
      "epoch": 0.3565459610027855,
      "grad_norm": 0.9896352291107178,
      "learning_rate": 8.247240241650918e-05,
      "loss": 0.6517,
      "step": 32
    },
    {
      "epoch": 0.36768802228412256,
      "grad_norm": 1.0033907890319824,
      "learning_rate": 8.095469746549172e-05,
      "loss": 0.6694,
      "step": 33
    },
    {
      "epoch": 0.3788300835654596,
      "grad_norm": 1.1896623373031616,
      "learning_rate": 7.938926261462366e-05,
      "loss": 0.7511,
      "step": 34
    },
    {
      "epoch": 0.38997214484679665,
      "grad_norm": 1.091591238975525,
      "learning_rate": 7.777851165098012e-05,
      "loss": 0.6982,
      "step": 35
    },
    {
      "epoch": 0.4011142061281337,
      "grad_norm": 1.131341814994812,
      "learning_rate": 7.612492823579745e-05,
      "loss": 0.761,
      "step": 36
    },
    {
      "epoch": 0.41225626740947074,
      "grad_norm": 1.158227801322937,
      "learning_rate": 7.443106207484776e-05,
      "loss": 0.8267,
      "step": 37
    },
    {
      "epoch": 0.4233983286908078,
      "grad_norm": 1.8035699129104614,
      "learning_rate": 7.269952498697734e-05,
      "loss": 0.6249,
      "step": 38
    },
    {
      "epoch": 0.43454038997214484,
      "grad_norm": 1.2867003679275513,
      "learning_rate": 7.09329868768714e-05,
      "loss": 0.891,
      "step": 39
    },
    {
      "epoch": 0.4456824512534819,
      "grad_norm": 1.0538527965545654,
      "learning_rate": 6.91341716182545e-05,
      "loss": 0.7137,
      "step": 40
    },
    {
      "epoch": 0.4568245125348189,
      "grad_norm": 1.4648083448410034,
      "learning_rate": 6.730585285387465e-05,
      "loss": 0.6051,
      "step": 41
    },
    {
      "epoch": 0.467966573816156,
      "grad_norm": 1.2210773229599,
      "learning_rate": 6.545084971874738e-05,
      "loss": 0.7698,
      "step": 42
    },
    {
      "epoch": 0.479108635097493,
      "grad_norm": 1.5098011493682861,
      "learning_rate": 6.357202249325371e-05,
      "loss": 0.7258,
      "step": 43
    },
    {
      "epoch": 0.49025069637883006,
      "grad_norm": 1.8282679319381714,
      "learning_rate": 6.167226819279528e-05,
      "loss": 0.5714,
      "step": 44
    },
    {
      "epoch": 0.5013927576601671,
      "grad_norm": 0.700736403465271,
      "learning_rate": 5.9754516100806423e-05,
      "loss": 0.6174,
      "step": 45
    },
    {
      "epoch": 0.5125348189415042,
      "grad_norm": 0.8653382062911987,
      "learning_rate": 5.782172325201155e-05,
      "loss": 0.6672,
      "step": 46
    },
    {
      "epoch": 0.5236768802228412,
      "grad_norm": 0.7241457104682922,
      "learning_rate": 5.587686987289189e-05,
      "loss": 0.6421,
      "step": 47
    },
    {
      "epoch": 0.5348189415041783,
      "grad_norm": 1.0107219219207764,
      "learning_rate": 5.392295478639225e-05,
      "loss": 0.7138,
      "step": 48
    },
    {
      "epoch": 0.5459610027855153,
      "grad_norm": 0.729120135307312,
      "learning_rate": 5.196299078795344e-05,
      "loss": 0.5774,
      "step": 49
    },
    {
      "epoch": 0.5571030640668524,
      "grad_norm": 0.8338662981987,
      "learning_rate": 5e-05,
      "loss": 0.6265,
      "step": 50
    },
    {
      "epoch": 0.5571030640668524,
      "eval_loss": 0.6604854464530945,
      "eval_runtime": 17.4209,
      "eval_samples_per_second": 8.725,
      "eval_steps_per_second": 2.181,
      "step": 50
    },
    {
      "epoch": 0.5682451253481894,
      "grad_norm": 0.8395673632621765,
      "learning_rate": 4.8037009212046586e-05,
      "loss": 0.5995,
      "step": 51
    },
    {
      "epoch": 0.5793871866295265,
      "grad_norm": 1.080180287361145,
      "learning_rate": 4.607704521360776e-05,
      "loss": 0.687,
      "step": 52
    },
    {
      "epoch": 0.5905292479108635,
      "grad_norm": 1.0408283472061157,
      "learning_rate": 4.412313012710813e-05,
      "loss": 0.6857,
      "step": 53
    },
    {
      "epoch": 0.6016713091922006,
      "grad_norm": 1.0837056636810303,
      "learning_rate": 4.2178276747988446e-05,
      "loss": 0.7375,
      "step": 54
    },
    {
      "epoch": 0.6128133704735376,
      "grad_norm": 1.1068450212478638,
      "learning_rate": 4.0245483899193595e-05,
      "loss": 0.6629,
      "step": 55
    },
    {
      "epoch": 0.6239554317548747,
      "grad_norm": 0.9491481184959412,
      "learning_rate": 3.832773180720475e-05,
      "loss": 0.6415,
      "step": 56
    },
    {
      "epoch": 0.6350974930362117,
      "grad_norm": 0.9771397709846497,
      "learning_rate": 3.642797750674629e-05,
      "loss": 0.6092,
      "step": 57
    },
    {
      "epoch": 0.6462395543175488,
      "grad_norm": 0.9483528733253479,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 0.6702,
      "step": 58
    },
    {
      "epoch": 0.6573816155988857,
      "grad_norm": 1.1656486988067627,
      "learning_rate": 3.2694147146125345e-05,
      "loss": 0.7169,
      "step": 59
    },
    {
      "epoch": 0.6685236768802229,
      "grad_norm": 1.362808346748352,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.7768,
      "step": 60
    },
    {
      "epoch": 0.6796657381615598,
      "grad_norm": 1.020258903503418,
      "learning_rate": 2.9067013123128613e-05,
      "loss": 0.6284,
      "step": 61
    },
    {
      "epoch": 0.6908077994428969,
      "grad_norm": 1.527144432067871,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 0.5853,
      "step": 62
    },
    {
      "epoch": 0.7019498607242339,
      "grad_norm": 1.3415321111679077,
      "learning_rate": 2.556893792515227e-05,
      "loss": 0.5872,
      "step": 63
    },
    {
      "epoch": 0.713091922005571,
      "grad_norm": 1.19660484790802,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 0.5428,
      "step": 64
    },
    {
      "epoch": 0.724233983286908,
      "grad_norm": 1.7170783281326294,
      "learning_rate": 2.2221488349019903e-05,
      "loss": 0.604,
      "step": 65
    },
    {
      "epoch": 0.7353760445682451,
      "grad_norm": 1.8448792695999146,
      "learning_rate": 2.061073738537635e-05,
      "loss": 0.6284,
      "step": 66
    },
    {
      "epoch": 0.7465181058495822,
      "grad_norm": 0.6753265261650085,
      "learning_rate": 1.9045302534508297e-05,
      "loss": 0.6319,
      "step": 67
    },
    {
      "epoch": 0.7576601671309192,
      "grad_norm": 0.8194894790649414,
      "learning_rate": 1.7527597583490822e-05,
      "loss": 0.5969,
      "step": 68
    },
    {
      "epoch": 0.7688022284122563,
      "grad_norm": 0.8043373227119446,
      "learning_rate": 1.605996272335291e-05,
      "loss": 0.6543,
      "step": 69
    },
    {
      "epoch": 0.7799442896935933,
      "grad_norm": 0.9129596948623657,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.7098,
      "step": 70
    },
    {
      "epoch": 0.7910863509749304,
      "grad_norm": 0.8895006775856018,
      "learning_rate": 1.3283874528215733e-05,
      "loss": 0.5837,
      "step": 71
    },
    {
      "epoch": 0.8022284122562674,
      "grad_norm": 0.7797386646270752,
      "learning_rate": 1.1979701719998453e-05,
      "loss": 0.6462,
      "step": 72
    },
    {
      "epoch": 0.8133704735376045,
      "grad_norm": 1.0365535020828247,
      "learning_rate": 1.0734153455962765e-05,
      "loss": 0.7263,
      "step": 73
    },
    {
      "epoch": 0.8245125348189415,
      "grad_norm": 0.8929746747016907,
      "learning_rate": 9.549150281252633e-06,
      "loss": 0.5768,
      "step": 74
    },
    {
      "epoch": 0.8356545961002786,
      "grad_norm": 0.9315441250801086,
      "learning_rate": 8.426519384872733e-06,
      "loss": 0.6044,
      "step": 75
    },
    {
      "epoch": 0.8467966573816156,
      "grad_norm": 0.9474170804023743,
      "learning_rate": 7.367991782295391e-06,
      "loss": 0.6207,
      "step": 76
    },
    {
      "epoch": 0.8579387186629527,
      "grad_norm": 1.0279390811920166,
      "learning_rate": 6.375199646360142e-06,
      "loss": 0.6206,
      "step": 77
    },
    {
      "epoch": 0.8690807799442897,
      "grad_norm": 1.0129948854446411,
      "learning_rate": 5.449673790581611e-06,
      "loss": 0.6151,
      "step": 78
    },
    {
      "epoch": 0.8802228412256268,
      "grad_norm": 0.9171512126922607,
      "learning_rate": 4.592841308745932e-06,
      "loss": 0.6025,
      "step": 79
    },
    {
      "epoch": 0.8913649025069638,
      "grad_norm": 1.0913035869598389,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 0.7344,
      "step": 80
    },
    {
      "epoch": 0.9025069637883009,
      "grad_norm": 1.4194581508636475,
      "learning_rate": 3.0904332038757977e-06,
      "loss": 0.7239,
      "step": 81
    },
    {
      "epoch": 0.9136490250696379,
      "grad_norm": 1.1939295530319214,
      "learning_rate": 2.4471741852423237e-06,
      "loss": 0.636,
      "step": 82
    },
    {
      "epoch": 0.924791086350975,
      "grad_norm": 1.0078566074371338,
      "learning_rate": 1.8772381773176417e-06,
      "loss": 0.5,
      "step": 83
    },
    {
      "epoch": 0.935933147632312,
      "grad_norm": 1.0714085102081299,
      "learning_rate": 1.3815039801161721e-06,
      "loss": 0.4576,
      "step": 84
    },
    {
      "epoch": 0.947075208913649,
      "grad_norm": 1.0616954565048218,
      "learning_rate": 9.607359798384785e-07,
      "loss": 0.4891,
      "step": 85
    },
    {
      "epoch": 0.958217270194986,
      "grad_norm": 1.1367872953414917,
      "learning_rate": 6.15582970243117e-07,
      "loss": 0.589,
      "step": 86
    },
    {
      "epoch": 0.9693593314763231,
      "grad_norm": 1.2933335304260254,
      "learning_rate": 3.465771522536854e-07,
      "loss": 0.5224,
      "step": 87
    },
    {
      "epoch": 0.9805013927576601,
      "grad_norm": 1.6042691469192505,
      "learning_rate": 1.5413331334360182e-07,
      "loss": 0.5322,
      "step": 88
    },
    {
      "epoch": 0.9916434540389972,
      "grad_norm": 0.7805180549621582,
      "learning_rate": 3.8548187963854956e-08,
      "loss": 0.6384,
      "step": 89
    },
    {
      "epoch": 1.0027855153203342,
      "grad_norm": 1.1597189903259277,
      "learning_rate": 0.0,
      "loss": 0.7541,
      "step": 90
    }
  ],
  "logging_steps": 1,
  "max_steps": 90,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3576622208712704e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}