|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9973216459702945, |
|
"eval_steps": 100, |
|
"global_step": 1026, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009739469198928659, |
|
"grad_norm": 5.16649192443276, |
|
"learning_rate": 4.854368932038835e-07, |
|
"loss": 0.956, |
|
"mean_token_accuracy": 0.7770209729671478, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.019478938397857318, |
|
"grad_norm": 4.487616874759018, |
|
"learning_rate": 9.70873786407767e-07, |
|
"loss": 0.9595, |
|
"mean_token_accuracy": 0.774706457555294, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029218407596785977, |
|
"grad_norm": 1.9918521253065147, |
|
"learning_rate": 1.4563106796116506e-06, |
|
"loss": 0.9087, |
|
"mean_token_accuracy": 0.7794785097241401, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.038957876795714635, |
|
"grad_norm": 2.140741610293528, |
|
"learning_rate": 1.941747572815534e-06, |
|
"loss": 0.8516, |
|
"mean_token_accuracy": 0.7845764443278312, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048697345994643294, |
|
"grad_norm": 1.577104941303547, |
|
"learning_rate": 2.427184466019418e-06, |
|
"loss": 0.7835, |
|
"mean_token_accuracy": 0.795551997423172, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05843681519357195, |
|
"grad_norm": 0.9851513931833208, |
|
"learning_rate": 2.912621359223301e-06, |
|
"loss": 0.7536, |
|
"mean_token_accuracy": 0.801626966893673, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0681762843925006, |
|
"grad_norm": 0.7665830684245213, |
|
"learning_rate": 3.398058252427185e-06, |
|
"loss": 0.7207, |
|
"mean_token_accuracy": 0.8073863789439202, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.07791575359142927, |
|
"grad_norm": 0.6623786102965908, |
|
"learning_rate": 3.883495145631068e-06, |
|
"loss": 0.6976, |
|
"mean_token_accuracy": 0.8124146014451981, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08765522279035792, |
|
"grad_norm": 0.5409465013517523, |
|
"learning_rate": 4.368932038834952e-06, |
|
"loss": 0.6791, |
|
"mean_token_accuracy": 0.8153241157531739, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09739469198928659, |
|
"grad_norm": 0.533911697336328, |
|
"learning_rate": 4.854368932038836e-06, |
|
"loss": 0.6494, |
|
"mean_token_accuracy": 0.8212776482105255, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10713416118821524, |
|
"grad_norm": 0.46335668910229383, |
|
"learning_rate": 5.3398058252427185e-06, |
|
"loss": 0.6374, |
|
"mean_token_accuracy": 0.8242872759699822, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1168736303871439, |
|
"grad_norm": 0.48547854119926936, |
|
"learning_rate": 5.825242718446602e-06, |
|
"loss": 0.6321, |
|
"mean_token_accuracy": 0.8252887204289436, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12661309958607256, |
|
"grad_norm": 0.4864652293922326, |
|
"learning_rate": 6.310679611650487e-06, |
|
"loss": 0.6246, |
|
"mean_token_accuracy": 0.8261281028389931, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1363525687850012, |
|
"grad_norm": 0.48690138801181443, |
|
"learning_rate": 6.79611650485437e-06, |
|
"loss": 0.6168, |
|
"mean_token_accuracy": 0.8278339207172394, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14609203798392986, |
|
"grad_norm": 0.4337713878880355, |
|
"learning_rate": 7.2815533980582534e-06, |
|
"loss": 0.5925, |
|
"mean_token_accuracy": 0.8333460614085197, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.15583150718285854, |
|
"grad_norm": 0.45660620623082415, |
|
"learning_rate": 7.766990291262136e-06, |
|
"loss": 0.5973, |
|
"mean_token_accuracy": 0.8320671111345291, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1655709763817872, |
|
"grad_norm": 0.4424374164027964, |
|
"learning_rate": 8.25242718446602e-06, |
|
"loss": 0.5873, |
|
"mean_token_accuracy": 0.8338468298316002, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.17531044558071585, |
|
"grad_norm": 0.4949541791856808, |
|
"learning_rate": 8.737864077669904e-06, |
|
"loss": 0.5846, |
|
"mean_token_accuracy": 0.8346328064799309, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1850499147796445, |
|
"grad_norm": 0.44716595203588544, |
|
"learning_rate": 9.223300970873788e-06, |
|
"loss": 0.574, |
|
"mean_token_accuracy": 0.8365551233291626, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.19478938397857318, |
|
"grad_norm": 0.40922461158607365, |
|
"learning_rate": 9.708737864077671e-06, |
|
"loss": 0.5745, |
|
"mean_token_accuracy": 0.8366570115089417, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.20452885317750183, |
|
"grad_norm": 0.5016773528604923, |
|
"learning_rate": 9.99988415036596e-06, |
|
"loss": 0.5649, |
|
"mean_token_accuracy": 0.8382768034934998, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.21426832237643048, |
|
"grad_norm": 0.5010899629730204, |
|
"learning_rate": 9.99858090363555e-06, |
|
"loss": 0.569, |
|
"mean_token_accuracy": 0.8377967774868011, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.22400779157535913, |
|
"grad_norm": 0.5010895934649102, |
|
"learning_rate": 9.995829976834402e-06, |
|
"loss": 0.5654, |
|
"mean_token_accuracy": 0.8383171066641808, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2337472607742878, |
|
"grad_norm": 0.5864787846174829, |
|
"learning_rate": 9.99163216668102e-06, |
|
"loss": 0.5703, |
|
"mean_token_accuracy": 0.8369988009333611, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24348672997321646, |
|
"grad_norm": 0.5502383423638442, |
|
"learning_rate": 9.985988688937684e-06, |
|
"loss": 0.5632, |
|
"mean_token_accuracy": 0.8387595832347869, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2532261991721451, |
|
"grad_norm": 0.4838385873501471, |
|
"learning_rate": 9.978901178058333e-06, |
|
"loss": 0.5472, |
|
"mean_token_accuracy": 0.8424718379974365, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.26296566837107377, |
|
"grad_norm": 0.5371306966614764, |
|
"learning_rate": 9.970371686715205e-06, |
|
"loss": 0.5431, |
|
"mean_token_accuracy": 0.8432716697454452, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2727051375700024, |
|
"grad_norm": 0.5584506793424302, |
|
"learning_rate": 9.960402685204347e-06, |
|
"loss": 0.5516, |
|
"mean_token_accuracy": 0.8408507108688354, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2824446067689311, |
|
"grad_norm": 0.5467853450677332, |
|
"learning_rate": 9.948997060730161e-06, |
|
"loss": 0.5464, |
|
"mean_token_accuracy": 0.8424971371889114, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2921840759678597, |
|
"grad_norm": 0.48818575924287444, |
|
"learning_rate": 9.936158116569231e-06, |
|
"loss": 0.5489, |
|
"mean_token_accuracy": 0.8420207649469376, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.30192354516678843, |
|
"grad_norm": 0.46723378629583207, |
|
"learning_rate": 9.921889571113629e-06, |
|
"loss": 0.5462, |
|
"mean_token_accuracy": 0.8419363871216774, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3116630143657171, |
|
"grad_norm": 0.561719955013033, |
|
"learning_rate": 9.906195556793996e-06, |
|
"loss": 0.546, |
|
"mean_token_accuracy": 0.8423524782061577, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.32140248356464574, |
|
"grad_norm": 0.625160714021079, |
|
"learning_rate": 9.889080618882719e-06, |
|
"loss": 0.5335, |
|
"mean_token_accuracy": 0.8451830595731735, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.3311419527635744, |
|
"grad_norm": 0.5878540451317104, |
|
"learning_rate": 9.870549714177538e-06, |
|
"loss": 0.5463, |
|
"mean_token_accuracy": 0.8417419150471688, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.34088142196250304, |
|
"grad_norm": 0.6154889939045929, |
|
"learning_rate": 9.850608209565967e-06, |
|
"loss": 0.5327, |
|
"mean_token_accuracy": 0.8450453072786331, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3506208911614317, |
|
"grad_norm": 0.4329465610848603, |
|
"learning_rate": 9.829261880470941e-06, |
|
"loss": 0.5392, |
|
"mean_token_accuracy": 0.8434989348053932, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.36036036036036034, |
|
"grad_norm": 0.5971707984256834, |
|
"learning_rate": 9.806516909178161e-06, |
|
"loss": 0.5324, |
|
"mean_token_accuracy": 0.84499292075634, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.370099829559289, |
|
"grad_norm": 0.44190322710346147, |
|
"learning_rate": 9.78237988304557e-06, |
|
"loss": 0.5332, |
|
"mean_token_accuracy": 0.845010556280613, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3798392987582177, |
|
"grad_norm": 0.4258724885095506, |
|
"learning_rate": 9.756857792595555e-06, |
|
"loss": 0.5319, |
|
"mean_token_accuracy": 0.845000034570694, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.38957876795714635, |
|
"grad_norm": 0.4729261486427116, |
|
"learning_rate": 9.729958029490353e-06, |
|
"loss": 0.5336, |
|
"mean_token_accuracy": 0.8447421163320541, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.399318237156075, |
|
"grad_norm": 0.4528083565277609, |
|
"learning_rate": 9.701688384391296e-06, |
|
"loss": 0.5347, |
|
"mean_token_accuracy": 0.8443498685956001, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.40905770635500366, |
|
"grad_norm": 0.5259149339810829, |
|
"learning_rate": 9.672057044702492e-06, |
|
"loss": 0.5199, |
|
"mean_token_accuracy": 0.848120279610157, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4187971755539323, |
|
"grad_norm": 0.46836949308672154, |
|
"learning_rate": 9.641072592199599e-06, |
|
"loss": 0.5219, |
|
"mean_token_accuracy": 0.8473641723394394, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.42853664475286096, |
|
"grad_norm": 0.5302502163183013, |
|
"learning_rate": 9.608744000544392e-06, |
|
"loss": 0.5174, |
|
"mean_token_accuracy": 0.8485160410404206, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4382761139517896, |
|
"grad_norm": 0.51343914785562, |
|
"learning_rate": 9.575080632685832e-06, |
|
"loss": 0.5239, |
|
"mean_token_accuracy": 0.846913392841816, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.44801558315071827, |
|
"grad_norm": 0.5925559680424145, |
|
"learning_rate": 9.54009223814837e-06, |
|
"loss": 0.5277, |
|
"mean_token_accuracy": 0.8454070091247559, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4577550523496469, |
|
"grad_norm": 0.426110283810703, |
|
"learning_rate": 9.503788950208324e-06, |
|
"loss": 0.5215, |
|
"mean_token_accuracy": 0.8473676040768623, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.4674945215485756, |
|
"grad_norm": 0.5303378941560718, |
|
"learning_rate": 9.466181282959083e-06, |
|
"loss": 0.5282, |
|
"mean_token_accuracy": 0.8465007901191711, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4772339907475043, |
|
"grad_norm": 0.4965315113021109, |
|
"learning_rate": 9.427280128266049e-06, |
|
"loss": 0.5179, |
|
"mean_token_accuracy": 0.8484022691845894, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.48697345994643293, |
|
"grad_norm": 0.44982624201366644, |
|
"learning_rate": 9.387096752612144e-06, |
|
"loss": 0.5224, |
|
"mean_token_accuracy": 0.8463509559631348, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4967129291453616, |
|
"grad_norm": 0.4623459721033935, |
|
"learning_rate": 9.345642793834825e-06, |
|
"loss": 0.5271, |
|
"mean_token_accuracy": 0.8463737353682518, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5064523983442902, |
|
"grad_norm": 0.5825923961149249, |
|
"learning_rate": 9.302930257755579e-06, |
|
"loss": 0.53, |
|
"mean_token_accuracy": 0.8450088694691658, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5161918675432189, |
|
"grad_norm": 0.45940800329264964, |
|
"learning_rate": 9.258971514702789e-06, |
|
"loss": 0.507, |
|
"mean_token_accuracy": 0.8508246764540672, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5259313367421475, |
|
"grad_norm": 0.4465364801256697, |
|
"learning_rate": 9.213779295929082e-06, |
|
"loss": 0.5087, |
|
"mean_token_accuracy": 0.8500014141201973, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5356708059410762, |
|
"grad_norm": 0.4686695606038613, |
|
"learning_rate": 9.167366689924116e-06, |
|
"loss": 0.5163, |
|
"mean_token_accuracy": 0.8484609499573708, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5454102751400048, |
|
"grad_norm": 0.5572145736502692, |
|
"learning_rate": 9.119747138623925e-06, |
|
"loss": 0.5221, |
|
"mean_token_accuracy": 0.8470426678657532, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5551497443389335, |
|
"grad_norm": 0.4765827682250088, |
|
"learning_rate": 9.070934433517872e-06, |
|
"loss": 0.5068, |
|
"mean_token_accuracy": 0.8509115263819694, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5648892135378621, |
|
"grad_norm": 0.49934327207360063, |
|
"learning_rate": 9.020942711654404e-06, |
|
"loss": 0.5106, |
|
"mean_token_accuracy": 0.8498208403587342, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5746286827367908, |
|
"grad_norm": 0.5101324493724546, |
|
"learning_rate": 8.969786451546691e-06, |
|
"loss": 0.5123, |
|
"mean_token_accuracy": 0.8496938437223435, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5843681519357194, |
|
"grad_norm": 0.5089021103596151, |
|
"learning_rate": 8.917480468979387e-06, |
|
"loss": 0.5128, |
|
"mean_token_accuracy": 0.8487787261605263, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5941076211346482, |
|
"grad_norm": 0.5501775930667125, |
|
"learning_rate": 8.864039912717713e-06, |
|
"loss": 0.5123, |
|
"mean_token_accuracy": 0.849444879591465, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.6038470903335769, |
|
"grad_norm": 0.41563520519489466, |
|
"learning_rate": 8.809480260120096e-06, |
|
"loss": 0.5048, |
|
"mean_token_accuracy": 0.8513683333992959, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6135865595325055, |
|
"grad_norm": 0.4999791105897695, |
|
"learning_rate": 8.753817312655642e-06, |
|
"loss": 0.514, |
|
"mean_token_accuracy": 0.8484693005681038, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6233260287314342, |
|
"grad_norm": 0.48088239973601826, |
|
"learning_rate": 8.697067191327748e-06, |
|
"loss": 0.5114, |
|
"mean_token_accuracy": 0.8495015501976013, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6330654979303628, |
|
"grad_norm": 0.4440141428026334, |
|
"learning_rate": 8.639246332005163e-06, |
|
"loss": 0.5064, |
|
"mean_token_accuracy": 0.8507678374648094, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6428049671292915, |
|
"grad_norm": 0.5288324873519791, |
|
"learning_rate": 8.580371480661857e-06, |
|
"loss": 0.5024, |
|
"mean_token_accuracy": 0.8514497712254524, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6525444363282201, |
|
"grad_norm": 0.4563934748637001, |
|
"learning_rate": 8.520459688527091e-06, |
|
"loss": 0.5108, |
|
"mean_token_accuracy": 0.849525935947895, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6622839055271488, |
|
"grad_norm": 0.4756063591717307, |
|
"learning_rate": 8.459528307147066e-06, |
|
"loss": 0.51, |
|
"mean_token_accuracy": 0.8501726359128952, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6720233747260774, |
|
"grad_norm": 0.5732019741491505, |
|
"learning_rate": 8.397594983359591e-06, |
|
"loss": 0.5062, |
|
"mean_token_accuracy": 0.850397090613842, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6817628439250061, |
|
"grad_norm": 0.5267653966986828, |
|
"learning_rate": 8.334677654183254e-06, |
|
"loss": 0.5065, |
|
"mean_token_accuracy": 0.8505988359451294, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6915023131239347, |
|
"grad_norm": 0.4796149747636043, |
|
"learning_rate": 8.27079454162252e-06, |
|
"loss": 0.5028, |
|
"mean_token_accuracy": 0.851087860763073, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7012417823228634, |
|
"grad_norm": 0.4809967856664383, |
|
"learning_rate": 8.205964147390313e-06, |
|
"loss": 0.5084, |
|
"mean_token_accuracy": 0.8496400877833367, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.710981251521792, |
|
"grad_norm": 0.4348534092140185, |
|
"learning_rate": 8.140205247549583e-06, |
|
"loss": 0.4983, |
|
"mean_token_accuracy": 0.8522587567567825, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7207207207207207, |
|
"grad_norm": 0.4799630529257426, |
|
"learning_rate": 8.073536887075417e-06, |
|
"loss": 0.5119, |
|
"mean_token_accuracy": 0.8493492469191551, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7304601899196493, |
|
"grad_norm": 0.5305052950944196, |
|
"learning_rate": 8.005978374339264e-06, |
|
"loss": 0.4946, |
|
"mean_token_accuracy": 0.8531364649534225, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.740199659118578, |
|
"grad_norm": 0.46401531374429855, |
|
"learning_rate": 7.937549275516882e-06, |
|
"loss": 0.493, |
|
"mean_token_accuracy": 0.8535278528928757, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7499391283175066, |
|
"grad_norm": 0.46753323308197925, |
|
"learning_rate": 7.868269408921614e-06, |
|
"loss": 0.504, |
|
"mean_token_accuracy": 0.8511819407343865, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7596785975164354, |
|
"grad_norm": 0.43897436674183826, |
|
"learning_rate": 7.798158839264645e-06, |
|
"loss": 0.4983, |
|
"mean_token_accuracy": 0.8521765768527985, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7694180667153641, |
|
"grad_norm": 0.4030095081507055, |
|
"learning_rate": 7.7272378718439e-06, |
|
"loss": 0.5092, |
|
"mean_token_accuracy": 0.8500049978494644, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7791575359142927, |
|
"grad_norm": 0.393465790048579, |
|
"learning_rate": 7.655527046663254e-06, |
|
"loss": 0.5022, |
|
"mean_token_accuracy": 0.8510702222585678, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7888970051132214, |
|
"grad_norm": 0.49269595855244275, |
|
"learning_rate": 7.5830471324837765e-06, |
|
"loss": 0.4945, |
|
"mean_token_accuracy": 0.8529314771294594, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.79863647431215, |
|
"grad_norm": 0.4729001525106615, |
|
"learning_rate": 7.5098191208087144e-06, |
|
"loss": 0.5, |
|
"mean_token_accuracy": 0.8525989070534706, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8083759435110787, |
|
"grad_norm": 0.44720074409020516, |
|
"learning_rate": 7.4358642198039835e-06, |
|
"loss": 0.4946, |
|
"mean_token_accuracy": 0.8530281245708465, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8181154127100073, |
|
"grad_norm": 0.4192773521625985, |
|
"learning_rate": 7.36120384815588e-06, |
|
"loss": 0.4927, |
|
"mean_token_accuracy": 0.8539465010166168, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.827854881908936, |
|
"grad_norm": 0.41037683317185314, |
|
"learning_rate": 7.285859628867851e-06, |
|
"loss": 0.4952, |
|
"mean_token_accuracy": 0.8532393842935562, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8375943511078646, |
|
"grad_norm": 0.4536782151813187, |
|
"learning_rate": 7.209853382998077e-06, |
|
"loss": 0.4983, |
|
"mean_token_accuracy": 0.8528945103287697, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8473338203067933, |
|
"grad_norm": 0.4895833290473578, |
|
"learning_rate": 7.133207123339689e-06, |
|
"loss": 0.4939, |
|
"mean_token_accuracy": 0.853129243850708, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8570732895057219, |
|
"grad_norm": 0.4771689532642086, |
|
"learning_rate": 7.055943048045476e-06, |
|
"loss": 0.5002, |
|
"mean_token_accuracy": 0.8518661975860595, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8668127587046506, |
|
"grad_norm": 0.4439177979495324, |
|
"learning_rate": 6.978083534198878e-06, |
|
"loss": 0.4977, |
|
"mean_token_accuracy": 0.8526063248515129, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.8765522279035792, |
|
"grad_norm": 0.45795986197220584, |
|
"learning_rate": 6.899651131333194e-06, |
|
"loss": 0.4876, |
|
"mean_token_accuracy": 0.8546572834253311, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8862916971025079, |
|
"grad_norm": 0.38084088770564956, |
|
"learning_rate": 6.82066855490081e-06, |
|
"loss": 0.5009, |
|
"mean_token_accuracy": 0.851148933172226, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8960311663014365, |
|
"grad_norm": 0.4552737931829716, |
|
"learning_rate": 6.741158679694403e-06, |
|
"loss": 0.4968, |
|
"mean_token_accuracy": 0.8524380102753639, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9057706355003652, |
|
"grad_norm": 0.38831398930391275, |
|
"learning_rate": 6.661144533221974e-06, |
|
"loss": 0.4897, |
|
"mean_token_accuracy": 0.8537176489830017, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.9155101046992938, |
|
"grad_norm": 0.42794585960634596, |
|
"learning_rate": 6.58064928903767e-06, |
|
"loss": 0.4942, |
|
"mean_token_accuracy": 0.8529506504535675, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9252495738982226, |
|
"grad_norm": 0.42890975667323755, |
|
"learning_rate": 6.499696260030297e-06, |
|
"loss": 0.5064, |
|
"mean_token_accuracy": 0.8502799227833748, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9349890430971513, |
|
"grad_norm": 0.4478332747744812, |
|
"learning_rate": 6.418308891671484e-06, |
|
"loss": 0.4855, |
|
"mean_token_accuracy": 0.8555838361382484, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9447285122960799, |
|
"grad_norm": 0.435211802978247, |
|
"learning_rate": 6.336510755225447e-06, |
|
"loss": 0.4835, |
|
"mean_token_accuracy": 0.8558964654803276, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9544679814950086, |
|
"grad_norm": 0.4462726381437146, |
|
"learning_rate": 6.25432554092232e-06, |
|
"loss": 0.4898, |
|
"mean_token_accuracy": 0.8544702440500259, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9642074506939372, |
|
"grad_norm": 0.44503915866443544, |
|
"learning_rate": 6.171777051097037e-06, |
|
"loss": 0.4858, |
|
"mean_token_accuracy": 0.8554443955421448, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9739469198928659, |
|
"grad_norm": 0.47966098493432496, |
|
"learning_rate": 6.088889193295738e-06, |
|
"loss": 0.4929, |
|
"mean_token_accuracy": 0.8535514727234841, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9836863890917945, |
|
"grad_norm": 0.5437585793823457, |
|
"learning_rate": 6.005685973351708e-06, |
|
"loss": 0.4931, |
|
"mean_token_accuracy": 0.8531181156635285, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9934258582907232, |
|
"grad_norm": 0.4124694681433172, |
|
"learning_rate": 5.922191488432857e-06, |
|
"loss": 0.4917, |
|
"mean_token_accuracy": 0.8535135626792908, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0019478938397857, |
|
"grad_norm": 0.803029283965446, |
|
"learning_rate": 5.838429920062734e-06, |
|
"loss": 0.4727, |
|
"mean_token_accuracy": 0.8567549926894051, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.0116873630387144, |
|
"grad_norm": 0.3927635667112885, |
|
"learning_rate": 5.754425527117118e-06, |
|
"loss": 0.4479, |
|
"mean_token_accuracy": 0.864441742002964, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.021426832237643, |
|
"grad_norm": 0.45928387456979786, |
|
"learning_rate": 5.670202638798213e-06, |
|
"loss": 0.4598, |
|
"mean_token_accuracy": 0.8615871027112008, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.0311663014365717, |
|
"grad_norm": 0.517389580126703, |
|
"learning_rate": 5.585785647588458e-06, |
|
"loss": 0.4572, |
|
"mean_token_accuracy": 0.8620010375976562, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0409057706355003, |
|
"grad_norm": 0.37886924917283415, |
|
"learning_rate": 5.501199002186024e-06, |
|
"loss": 0.455, |
|
"mean_token_accuracy": 0.8625968441367149, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.050645239834429, |
|
"grad_norm": 0.39773153996958316, |
|
"learning_rate": 5.416467200424032e-06, |
|
"loss": 0.45, |
|
"mean_token_accuracy": 0.8637859463691712, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0603847090333576, |
|
"grad_norm": 0.40508395528042257, |
|
"learning_rate": 5.33161478217552e-06, |
|
"loss": 0.4516, |
|
"mean_token_accuracy": 0.8634377360343933, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.0701241782322863, |
|
"grad_norm": 0.37429603209812456, |
|
"learning_rate": 5.246666322246267e-06, |
|
"loss": 0.4445, |
|
"mean_token_accuracy": 0.8651037693023682, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.079863647431215, |
|
"grad_norm": 0.47125189912053295, |
|
"learning_rate": 5.1616464232574635e-06, |
|
"loss": 0.4626, |
|
"mean_token_accuracy": 0.8602706581354141, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.0896031166301436, |
|
"grad_norm": 0.435383825468146, |
|
"learning_rate": 5.076579708520355e-06, |
|
"loss": 0.4497, |
|
"mean_token_accuracy": 0.8635805040597916, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0993425858290724, |
|
"grad_norm": 0.412421290706311, |
|
"learning_rate": 4.991490814904888e-06, |
|
"loss": 0.4378, |
|
"mean_token_accuracy": 0.8668369174003601, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.109082055028001, |
|
"grad_norm": 0.41209358309681204, |
|
"learning_rate": 4.906404385704402e-06, |
|
"loss": 0.4525, |
|
"mean_token_accuracy": 0.8637306377291679, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.1188215242269297, |
|
"grad_norm": 0.3821281290056491, |
|
"learning_rate": 4.82134506349851e-06, |
|
"loss": 0.4564, |
|
"mean_token_accuracy": 0.8625956058502198, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.1285609934258582, |
|
"grad_norm": 0.34980242368851916, |
|
"learning_rate": 4.736337483016138e-06, |
|
"loss": 0.4513, |
|
"mean_token_accuracy": 0.8634121060371399, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.138300462624787, |
|
"grad_norm": 0.39297932611221925, |
|
"learning_rate": 4.651406264000871e-06, |
|
"loss": 0.4512, |
|
"mean_token_accuracy": 0.8632175624370575, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.1480399318237156, |
|
"grad_norm": 0.4317512663581136, |
|
"learning_rate": 4.5665760040806174e-06, |
|
"loss": 0.4558, |
|
"mean_token_accuracy": 0.8621700823307037, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1577794010226443, |
|
"grad_norm": 0.4166519413744608, |
|
"learning_rate": 4.481871271643698e-06, |
|
"loss": 0.4543, |
|
"mean_token_accuracy": 0.8628953084349632, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.1675188702215729, |
|
"grad_norm": 0.3817664896689048, |
|
"learning_rate": 4.397316598723385e-06, |
|
"loss": 0.4599, |
|
"mean_token_accuracy": 0.8614401906728745, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1772583394205016, |
|
"grad_norm": 0.39223028227054235, |
|
"learning_rate": 4.312936473892984e-06, |
|
"loss": 0.4559, |
|
"mean_token_accuracy": 0.8621881052851676, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.1869978086194302, |
|
"grad_norm": 0.39870192645434044, |
|
"learning_rate": 4.228755335173488e-06, |
|
"loss": 0.4554, |
|
"mean_token_accuracy": 0.8622724115848541, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.196737277818359, |
|
"grad_norm": 0.3968827579618214, |
|
"learning_rate": 4.1447975629559e-06, |
|
"loss": 0.4496, |
|
"mean_token_accuracy": 0.8639340966939926, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.2064767470172875, |
|
"grad_norm": 0.3525974840125754, |
|
"learning_rate": 4.061087472940204e-06, |
|
"loss": 0.4468, |
|
"mean_token_accuracy": 0.8643500834703446, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 0.34784808537613615, |
|
"learning_rate": 3.977649309093113e-06, |
|
"loss": 0.4463, |
|
"mean_token_accuracy": 0.8645710095763206, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.225955685415145, |
|
"grad_norm": 0.3588527258157974, |
|
"learning_rate": 3.89450723662657e-06, |
|
"loss": 0.4517, |
|
"mean_token_accuracy": 0.863483439385891, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2356951546140735, |
|
"grad_norm": 0.495886348965792, |
|
"learning_rate": 3.8116853349990574e-06, |
|
"loss": 0.4507, |
|
"mean_token_accuracy": 0.8636451244354248, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.245434623813002, |
|
"grad_norm": 0.3842625748309975, |
|
"learning_rate": 3.729207590941753e-06, |
|
"loss": 0.4377, |
|
"mean_token_accuracy": 0.8670691177248955, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.2551740930119308, |
|
"grad_norm": 0.4426936471853252, |
|
"learning_rate": 3.647097891511536e-06, |
|
"loss": 0.4495, |
|
"mean_token_accuracy": 0.8640359625220299, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.2649135622108596, |
|
"grad_norm": 0.3707914127374137, |
|
"learning_rate": 3.565380017172854e-06, |
|
"loss": 0.4397, |
|
"mean_token_accuracy": 0.8666136890649796, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2746530314097881, |
|
"grad_norm": 0.35267532874755536, |
|
"learning_rate": 3.4840776349104755e-06, |
|
"loss": 0.4539, |
|
"mean_token_accuracy": 0.8626947477459908, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.284392500608717, |
|
"grad_norm": 0.3749743227529099, |
|
"learning_rate": 3.4032142913750956e-06, |
|
"loss": 0.4497, |
|
"mean_token_accuracy": 0.8637418314814568, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2941319698076454, |
|
"grad_norm": 0.37600006002277636, |
|
"learning_rate": 3.322813406063794e-06, |
|
"loss": 0.4559, |
|
"mean_token_accuracy": 0.8622224271297455, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.3038714390065742, |
|
"grad_norm": 0.4005804398308692, |
|
"learning_rate": 3.242898264537331e-06, |
|
"loss": 0.4521, |
|
"mean_token_accuracy": 0.8632362619042396, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.3136109082055027, |
|
"grad_norm": 0.38630202996272256, |
|
"learning_rate": 3.1634920116762175e-06, |
|
"loss": 0.4499, |
|
"mean_token_accuracy": 0.8635998621582985, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.3233503774044315, |
|
"grad_norm": 0.36180685239531096, |
|
"learning_rate": 3.0846176449775363e-06, |
|
"loss": 0.4508, |
|
"mean_token_accuracy": 0.8636634424328804, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.33308984660336, |
|
"grad_norm": 0.3357931590954893, |
|
"learning_rate": 3.0062980078944515e-06, |
|
"loss": 0.4379, |
|
"mean_token_accuracy": 0.8665053129196167, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.3428293158022888, |
|
"grad_norm": 0.35662293115272325, |
|
"learning_rate": 2.9285557832203328e-06, |
|
"loss": 0.4458, |
|
"mean_token_accuracy": 0.8648849859833717, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3525687850012174, |
|
"grad_norm": 0.35254913645932934, |
|
"learning_rate": 2.851413486519388e-06, |
|
"loss": 0.4413, |
|
"mean_token_accuracy": 0.8654858738183975, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.3623082542001461, |
|
"grad_norm": 0.3448432687225989, |
|
"learning_rate": 2.774893459605766e-06, |
|
"loss": 0.4431, |
|
"mean_token_accuracy": 0.86555365473032, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3720477233990747, |
|
"grad_norm": 0.35405617501926967, |
|
"learning_rate": 2.69901786407295e-06, |
|
"loss": 0.444, |
|
"mean_token_accuracy": 0.8651208564639091, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.3817871925980034, |
|
"grad_norm": 0.36859590132072323, |
|
"learning_rate": 2.6238086748753587e-06, |
|
"loss": 0.456, |
|
"mean_token_accuracy": 0.8627100110054016, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3915266617969322, |
|
"grad_norm": 0.35246470987862855, |
|
"learning_rate": 2.5492876739639912e-06, |
|
"loss": 0.4533, |
|
"mean_token_accuracy": 0.8628792524337768, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.4012661309958607, |
|
"grad_norm": 0.36351033252725057, |
|
"learning_rate": 2.475476443977996e-06, |
|
"loss": 0.4469, |
|
"mean_token_accuracy": 0.8642540082335473, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.4110056001947893, |
|
"grad_norm": 0.35938233389923585, |
|
"learning_rate": 2.40239636199393e-06, |
|
"loss": 0.4451, |
|
"mean_token_accuracy": 0.8650069192051888, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.420745069393718, |
|
"grad_norm": 0.3673326064574297, |
|
"learning_rate": 2.3300685933345656e-06, |
|
"loss": 0.4485, |
|
"mean_token_accuracy": 0.8642319470643998, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.4304845385926468, |
|
"grad_norm": 0.37166399663475924, |
|
"learning_rate": 2.2585140854390432e-06, |
|
"loss": 0.4496, |
|
"mean_token_accuracy": 0.8641144469380379, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.4402240077915753, |
|
"grad_norm": 0.34969080725493995, |
|
"learning_rate": 2.187753561796097e-06, |
|
"loss": 0.449, |
|
"mean_token_accuracy": 0.8638374775648117, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.449963476990504, |
|
"grad_norm": 0.33346605175419364, |
|
"learning_rate": 2.117807515942163e-06, |
|
"loss": 0.4487, |
|
"mean_token_accuracy": 0.8639461770653725, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.4597029461894326, |
|
"grad_norm": 0.3402282068550279, |
|
"learning_rate": 2.0486962055260744e-06, |
|
"loss": 0.4381, |
|
"mean_token_accuracy": 0.8668898791074753, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4694424153883614, |
|
"grad_norm": 0.3454615445905308, |
|
"learning_rate": 1.9804396464420798e-06, |
|
"loss": 0.4407, |
|
"mean_token_accuracy": 0.8662415385246277, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.47918188458729, |
|
"grad_norm": 0.3815154168732918, |
|
"learning_rate": 1.9130576070328695e-06, |
|
"loss": 0.4502, |
|
"mean_token_accuracy": 0.863922019302845, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.4889213537862187, |
|
"grad_norm": 0.35782414062256107, |
|
"learning_rate": 1.8465696023643115e-06, |
|
"loss": 0.4484, |
|
"mean_token_accuracy": 0.8640252217650414, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.4986608229851472, |
|
"grad_norm": 0.34966179386580715, |
|
"learning_rate": 1.7809948885735295e-06, |
|
"loss": 0.4476, |
|
"mean_token_accuracy": 0.8640663206577301, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.508400292184076, |
|
"grad_norm": 0.3283826370466805, |
|
"learning_rate": 1.7163524572919748e-06, |
|
"loss": 0.4535, |
|
"mean_token_accuracy": 0.8629004299640656, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.5181397613830048, |
|
"grad_norm": 0.3309412101835561, |
|
"learning_rate": 1.6526610301451028e-06, |
|
"loss": 0.4374, |
|
"mean_token_accuracy": 0.8666589662432671, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.5278792305819333, |
|
"grad_norm": 0.35820157894670385, |
|
"learning_rate": 1.5899390533302538e-06, |
|
"loss": 0.4387, |
|
"mean_token_accuracy": 0.8667290091514588, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.5376186997808619, |
|
"grad_norm": 0.34472450626785944, |
|
"learning_rate": 1.5282046922742876e-06, |
|
"loss": 0.4502, |
|
"mean_token_accuracy": 0.8635128363966942, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.5473581689797906, |
|
"grad_norm": 0.31690650065908227, |
|
"learning_rate": 1.4674758263725614e-06, |
|
"loss": 0.4461, |
|
"mean_token_accuracy": 0.8644041374325753, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.5570976381787194, |
|
"grad_norm": 0.3261992242400567, |
|
"learning_rate": 1.4077700438107183e-06, |
|
"loss": 0.445, |
|
"mean_token_accuracy": 0.8651425749063492, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.566837107377648, |
|
"grad_norm": 0.37132210271715693, |
|
"learning_rate": 1.3491046364708294e-06, |
|
"loss": 0.445, |
|
"mean_token_accuracy": 0.8648254871368408, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.5765765765765765, |
|
"grad_norm": 0.34739884194127096, |
|
"learning_rate": 1.2914965949233572e-06, |
|
"loss": 0.4474, |
|
"mean_token_accuracy": 0.8643729150295257, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5863160457755052, |
|
"grad_norm": 0.3092847696386134, |
|
"learning_rate": 1.2349626035063705e-06, |
|
"loss": 0.4389, |
|
"mean_token_accuracy": 0.8666358023881913, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.596055514974434, |
|
"grad_norm": 0.33368514787450476, |
|
"learning_rate": 1.1795190354934587e-06, |
|
"loss": 0.4635, |
|
"mean_token_accuracy": 0.8606240957975387, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6057949841733625, |
|
"grad_norm": 0.3355701607651583, |
|
"learning_rate": 1.1251819483517334e-06, |
|
"loss": 0.4469, |
|
"mean_token_accuracy": 0.8647612199187279, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.615534453372291, |
|
"grad_norm": 0.3215469441235877, |
|
"learning_rate": 1.0719670790912928e-06, |
|
"loss": 0.4479, |
|
"mean_token_accuracy": 0.8641064539551735, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.6252739225712198, |
|
"grad_norm": 0.3331920310980409, |
|
"learning_rate": 1.019889839707498e-06, |
|
"loss": 0.447, |
|
"mean_token_accuracy": 0.8645351231098175, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.6350133917701486, |
|
"grad_norm": 0.3311460501397384, |
|
"learning_rate": 9.689653127173743e-07, |
|
"loss": 0.4548, |
|
"mean_token_accuracy": 0.8624306350946427, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6447528609690771, |
|
"grad_norm": 0.3387701231769685, |
|
"learning_rate": 9.192082467914465e-07, |
|
"loss": 0.444, |
|
"mean_token_accuracy": 0.8649628892540931, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.654492330168006, |
|
"grad_norm": 0.33969198086970775, |
|
"learning_rate": 8.706330524822548e-07, |
|
"loss": 0.4413, |
|
"mean_token_accuracy": 0.865912164747715, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.6642317993669344, |
|
"grad_norm": 0.32626435288941413, |
|
"learning_rate": 8.232537980507848e-07, |
|
"loss": 0.4454, |
|
"mean_token_accuracy": 0.8650734156370163, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.6739712685658632, |
|
"grad_norm": 0.32534723430048623, |
|
"learning_rate": 7.770842053920585e-07, |
|
"loss": 0.4424, |
|
"mean_token_accuracy": 0.8653772249817848, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.683710737764792, |
|
"grad_norm": 0.33973250353406415, |
|
"learning_rate": 7.321376460610136e-07, |
|
"loss": 0.4398, |
|
"mean_token_accuracy": 0.865781269967556, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.6934502069637205, |
|
"grad_norm": 0.31477206750092435, |
|
"learning_rate": 6.884271373998608e-07, |
|
"loss": 0.4402, |
|
"mean_token_accuracy": 0.8660067468881607, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.703189676162649, |
|
"grad_norm": 0.3537664329765484, |
|
"learning_rate": 6.459653387680248e-07, |
|
"loss": 0.4426, |
|
"mean_token_accuracy": 0.8656087100505829, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.7129291453615778, |
|
"grad_norm": 0.32858889216418335, |
|
"learning_rate": 6.047645478757635e-07, |
|
"loss": 0.4367, |
|
"mean_token_accuracy": 0.8670719146728516, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.7226686145605066, |
|
"grad_norm": 0.32864732265316704, |
|
"learning_rate": 5.648366972225222e-07, |
|
"loss": 0.4527, |
|
"mean_token_accuracy": 0.8630405649542808, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.7324080837594351, |
|
"grad_norm": 0.3173505795175411, |
|
"learning_rate": 5.261933506410722e-07, |
|
"loss": 0.4401, |
|
"mean_token_accuracy": 0.8665132194757461, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.7421475529583637, |
|
"grad_norm": 0.36040952773775503, |
|
"learning_rate": 4.888456999484098e-07, |
|
"loss": 0.4465, |
|
"mean_token_accuracy": 0.8646954327821732, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.7518870221572924, |
|
"grad_norm": 0.31604383070212816, |
|
"learning_rate": 4.528045617044019e-07, |
|
"loss": 0.443, |
|
"mean_token_accuracy": 0.8652036920189857, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.7616264913562212, |
|
"grad_norm": 0.31389323834734445, |
|
"learning_rate": 4.180803740791156e-07, |
|
"loss": 0.4426, |
|
"mean_token_accuracy": 0.8656812936067582, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.7713659605551497, |
|
"grad_norm": 0.3097110381509705, |
|
"learning_rate": 3.846831938297324e-07, |
|
"loss": 0.4468, |
|
"mean_token_accuracy": 0.8643257409334183, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.7811054297540783, |
|
"grad_norm": 0.31179540512244613, |
|
"learning_rate": 3.5262269338792623e-07, |
|
"loss": 0.4447, |
|
"mean_token_accuracy": 0.8651932507753373, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.790844898953007, |
|
"grad_norm": 0.33502042463366694, |
|
"learning_rate": 3.219081580585548e-07, |
|
"loss": 0.4508, |
|
"mean_token_accuracy": 0.863429008424282, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.8005843681519358, |
|
"grad_norm": 0.3164739894198197, |
|
"learning_rate": 2.9254848333046817e-07, |
|
"loss": 0.4528, |
|
"mean_token_accuracy": 0.8630515649914742, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.8103238373508643, |
|
"grad_norm": 0.3030075688047627, |
|
"learning_rate": 2.645521723002037e-07, |
|
"loss": 0.4507, |
|
"mean_token_accuracy": 0.8635053560137749, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.8200633065497929, |
|
"grad_norm": 0.3371646202292577, |
|
"learning_rate": 2.3792733320934348e-07, |
|
"loss": 0.4441, |
|
"mean_token_accuracy": 0.8654543533921242, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.8298027757487216, |
|
"grad_norm": 0.3074575906934633, |
|
"learning_rate": 2.12681677096217e-07, |
|
"loss": 0.4351, |
|
"mean_token_accuracy": 0.8675669968128205, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.8395422449476504, |
|
"grad_norm": 0.30631898093344184, |
|
"learning_rate": 1.888225155626433e-07, |
|
"loss": 0.444, |
|
"mean_token_accuracy": 0.8653656959533691, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.8492817141465792, |
|
"grad_norm": 0.2958594503607962, |
|
"learning_rate": 1.6635675865635859e-07, |
|
"loss": 0.4568, |
|
"mean_token_accuracy": 0.8619549512863159, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8590211833455077, |
|
"grad_norm": 0.31258753396965294, |
|
"learning_rate": 1.4529091286973994e-07, |
|
"loss": 0.444, |
|
"mean_token_accuracy": 0.8656645834445953, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.8687606525444362, |
|
"grad_norm": 0.30999125345039114, |
|
"learning_rate": 1.2563107925540774e-07, |
|
"loss": 0.4444, |
|
"mean_token_accuracy": 0.8654705569148063, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.878500121743365, |
|
"grad_norm": 0.3160340098992061, |
|
"learning_rate": 1.0738295165924783e-07, |
|
"loss": 0.4459, |
|
"mean_token_accuracy": 0.8648449763655662, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.8882395909422938, |
|
"grad_norm": 0.3187758679651324, |
|
"learning_rate": 9.055181507137245e-08, |
|
"loss": 0.444, |
|
"mean_token_accuracy": 0.8654528453946113, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.8979790601412223, |
|
"grad_norm": 0.33617665839091787, |
|
"learning_rate": 7.514254409549005e-08, |
|
"loss": 0.4481, |
|
"mean_token_accuracy": 0.8641701668500901, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.9077185293401508, |
|
"grad_norm": 0.31232672260705024, |
|
"learning_rate": 6.115960153712963e-08, |
|
"loss": 0.4414, |
|
"mean_token_accuracy": 0.8660656422376632, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.9174579985390796, |
|
"grad_norm": 0.303119007372895, |
|
"learning_rate": 4.860703711113246e-08, |
|
"loss": 0.4362, |
|
"mean_token_accuracy": 0.8676238685846329, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.9271974677380084, |
|
"grad_norm": 0.30341116951092534, |
|
"learning_rate": 3.748848626878132e-08, |
|
"loss": 0.4508, |
|
"mean_token_accuracy": 0.8636912703514099, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.936936936936937, |
|
"grad_norm": 0.31515978754450974, |
|
"learning_rate": 2.7807169144906108e-08, |
|
"loss": 0.4528, |
|
"mean_token_accuracy": 0.8630853027105332, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.9466764061358655, |
|
"grad_norm": 0.30748596132213357, |
|
"learning_rate": 1.9565889625275945e-08, |
|
"loss": 0.4558, |
|
"mean_token_accuracy": 0.8628044292330742, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9564158753347942, |
|
"grad_norm": 0.30248992557073057, |
|
"learning_rate": 1.2767034534540978e-08, |
|
"loss": 0.4467, |
|
"mean_token_accuracy": 0.86439578384161, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.966155344533723, |
|
"grad_norm": 0.3256862744869616, |
|
"learning_rate": 7.412572944965335e-09, |
|
"loss": 0.4548, |
|
"mean_token_accuracy": 0.8625592529773712, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.9758948137326515, |
|
"grad_norm": 0.3147340514506444, |
|
"learning_rate": 3.5040556061483043e-09, |
|
"loss": 0.4422, |
|
"mean_token_accuracy": 0.8657747611403466, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.98563428293158, |
|
"grad_norm": 0.32764257560082577, |
|
"learning_rate": 1.0426144958985974e-09, |
|
"loss": 0.4419, |
|
"mean_token_accuracy": 0.8660721600055694, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.9953737521305088, |
|
"grad_norm": 0.3098108021812636, |
|
"learning_rate": 2.8962492393258546e-11, |
|
"loss": 0.4452, |
|
"mean_token_accuracy": 0.8650736406445503, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.9973216459702945, |
|
"mean_token_accuracy": 0.8681519404053688, |
|
"step": 1026, |
|
"total_flos": 1074983740637184.0, |
|
"train_loss": 0.49954891321022377, |
|
"train_runtime": 168401.8035, |
|
"train_samples_per_second": 0.78, |
|
"train_steps_per_second": 0.006 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1026, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1074983740637184.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|