{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973216459702945, "eval_steps": 100, "global_step": 1026, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009739469198928659, "grad_norm": 5.16649192443276, "learning_rate": 4.854368932038835e-07, "loss": 0.956, "mean_token_accuracy": 0.7770209729671478, "step": 5 }, { "epoch": 0.019478938397857318, "grad_norm": 4.487616874759018, "learning_rate": 9.70873786407767e-07, "loss": 0.9595, "mean_token_accuracy": 0.774706457555294, "step": 10 }, { "epoch": 0.029218407596785977, "grad_norm": 1.9918521253065147, "learning_rate": 1.4563106796116506e-06, "loss": 0.9087, "mean_token_accuracy": 0.7794785097241401, "step": 15 }, { "epoch": 0.038957876795714635, "grad_norm": 2.140741610293528, "learning_rate": 1.941747572815534e-06, "loss": 0.8516, "mean_token_accuracy": 0.7845764443278312, "step": 20 }, { "epoch": 0.048697345994643294, "grad_norm": 1.577104941303547, "learning_rate": 2.427184466019418e-06, "loss": 0.7835, "mean_token_accuracy": 0.795551997423172, "step": 25 }, { "epoch": 0.05843681519357195, "grad_norm": 0.9851513931833208, "learning_rate": 2.912621359223301e-06, "loss": 0.7536, "mean_token_accuracy": 0.801626966893673, "step": 30 }, { "epoch": 0.0681762843925006, "grad_norm": 0.7665830684245213, "learning_rate": 3.398058252427185e-06, "loss": 0.7207, "mean_token_accuracy": 0.8073863789439202, "step": 35 }, { "epoch": 0.07791575359142927, "grad_norm": 0.6623786102965908, "learning_rate": 3.883495145631068e-06, "loss": 0.6976, "mean_token_accuracy": 0.8124146014451981, "step": 40 }, { "epoch": 0.08765522279035792, "grad_norm": 0.5409465013517523, "learning_rate": 4.368932038834952e-06, "loss": 0.6791, "mean_token_accuracy": 0.8153241157531739, "step": 45 }, { "epoch": 0.09739469198928659, "grad_norm": 0.533911697336328, "learning_rate": 4.854368932038836e-06, "loss": 0.6494, "mean_token_accuracy": 0.8212776482105255, "step": 50 }, { "epoch": 0.10713416118821524, "grad_norm": 0.46335668910229383, "learning_rate": 5.3398058252427185e-06, "loss": 0.6374, "mean_token_accuracy": 0.8242872759699822, "step": 55 }, { "epoch": 0.1168736303871439, "grad_norm": 0.48547854119926936, "learning_rate": 5.825242718446602e-06, "loss": 0.6321, "mean_token_accuracy": 0.8252887204289436, "step": 60 }, { "epoch": 0.12661309958607256, "grad_norm": 0.4864652293922326, "learning_rate": 6.310679611650487e-06, "loss": 0.6246, "mean_token_accuracy": 0.8261281028389931, "step": 65 }, { "epoch": 0.1363525687850012, "grad_norm": 0.48690138801181443, "learning_rate": 6.79611650485437e-06, "loss": 0.6168, "mean_token_accuracy": 0.8278339207172394, "step": 70 }, { "epoch": 0.14609203798392986, "grad_norm": 0.4337713878880355, "learning_rate": 7.2815533980582534e-06, "loss": 0.5925, "mean_token_accuracy": 0.8333460614085197, "step": 75 }, { "epoch": 0.15583150718285854, "grad_norm": 0.45660620623082415, "learning_rate": 7.766990291262136e-06, "loss": 0.5973, "mean_token_accuracy": 0.8320671111345291, "step": 80 }, { "epoch": 0.1655709763817872, "grad_norm": 0.4424374164027964, "learning_rate": 8.25242718446602e-06, "loss": 0.5873, "mean_token_accuracy": 0.8338468298316002, "step": 85 }, { "epoch": 0.17531044558071585, "grad_norm": 0.4949541791856808, "learning_rate": 8.737864077669904e-06, "loss": 0.5846, "mean_token_accuracy": 0.8346328064799309, "step": 90 }, { "epoch": 0.1850499147796445, "grad_norm": 0.44716595203588544, "learning_rate": 9.223300970873788e-06, "loss": 0.574, "mean_token_accuracy": 0.8365551233291626, "step": 95 }, { "epoch": 0.19478938397857318, "grad_norm": 0.40922461158607365, "learning_rate": 9.708737864077671e-06, "loss": 0.5745, "mean_token_accuracy": 0.8366570115089417, "step": 100 }, { "epoch": 0.20452885317750183, "grad_norm": 0.5016773528604923, "learning_rate": 9.99988415036596e-06, "loss": 0.5649, "mean_token_accuracy": 0.8382768034934998, "step": 105 }, { "epoch": 0.21426832237643048, "grad_norm": 0.5010899629730204, "learning_rate": 9.99858090363555e-06, "loss": 0.569, "mean_token_accuracy": 0.8377967774868011, "step": 110 }, { "epoch": 0.22400779157535913, "grad_norm": 0.5010895934649102, "learning_rate": 9.995829976834402e-06, "loss": 0.5654, "mean_token_accuracy": 0.8383171066641808, "step": 115 }, { "epoch": 0.2337472607742878, "grad_norm": 0.5864787846174829, "learning_rate": 9.99163216668102e-06, "loss": 0.5703, "mean_token_accuracy": 0.8369988009333611, "step": 120 }, { "epoch": 0.24348672997321646, "grad_norm": 0.5502383423638442, "learning_rate": 9.985988688937684e-06, "loss": 0.5632, "mean_token_accuracy": 0.8387595832347869, "step": 125 }, { "epoch": 0.2532261991721451, "grad_norm": 0.4838385873501471, "learning_rate": 9.978901178058333e-06, "loss": 0.5472, "mean_token_accuracy": 0.8424718379974365, "step": 130 }, { "epoch": 0.26296566837107377, "grad_norm": 0.5371306966614764, "learning_rate": 9.970371686715205e-06, "loss": 0.5431, "mean_token_accuracy": 0.8432716697454452, "step": 135 }, { "epoch": 0.2727051375700024, "grad_norm": 0.5584506793424302, "learning_rate": 9.960402685204347e-06, "loss": 0.5516, "mean_token_accuracy": 0.8408507108688354, "step": 140 }, { "epoch": 0.2824446067689311, "grad_norm": 0.5467853450677332, "learning_rate": 9.948997060730161e-06, "loss": 0.5464, "mean_token_accuracy": 0.8424971371889114, "step": 145 }, { "epoch": 0.2921840759678597, "grad_norm": 0.48818575924287444, "learning_rate": 9.936158116569231e-06, "loss": 0.5489, "mean_token_accuracy": 0.8420207649469376, "step": 150 }, { "epoch": 0.30192354516678843, "grad_norm": 0.46723378629583207, "learning_rate": 9.921889571113629e-06, "loss": 0.5462, "mean_token_accuracy": 0.8419363871216774, "step": 155 }, { "epoch": 0.3116630143657171, "grad_norm": 0.561719955013033, "learning_rate": 9.906195556793996e-06, "loss": 0.546, "mean_token_accuracy": 0.8423524782061577, "step": 160 }, { "epoch": 0.32140248356464574, "grad_norm": 0.625160714021079, "learning_rate": 9.889080618882719e-06, "loss": 0.5335, "mean_token_accuracy": 0.8451830595731735, "step": 165 }, { "epoch": 0.3311419527635744, "grad_norm": 0.5878540451317104, "learning_rate": 9.870549714177538e-06, "loss": 0.5463, "mean_token_accuracy": 0.8417419150471688, "step": 170 }, { "epoch": 0.34088142196250304, "grad_norm": 0.6154889939045929, "learning_rate": 9.850608209565967e-06, "loss": 0.5327, "mean_token_accuracy": 0.8450453072786331, "step": 175 }, { "epoch": 0.3506208911614317, "grad_norm": 0.4329465610848603, "learning_rate": 9.829261880470941e-06, "loss": 0.5392, "mean_token_accuracy": 0.8434989348053932, "step": 180 }, { "epoch": 0.36036036036036034, "grad_norm": 0.5971707984256834, "learning_rate": 9.806516909178161e-06, "loss": 0.5324, "mean_token_accuracy": 0.84499292075634, "step": 185 }, { "epoch": 0.370099829559289, "grad_norm": 0.44190322710346147, "learning_rate": 9.78237988304557e-06, "loss": 0.5332, "mean_token_accuracy": 0.845010556280613, "step": 190 }, { "epoch": 0.3798392987582177, "grad_norm": 0.4258724885095506, "learning_rate": 9.756857792595555e-06, "loss": 0.5319, "mean_token_accuracy": 0.845000034570694, "step": 195 }, { "epoch": 0.38957876795714635, "grad_norm": 0.4729261486427116, "learning_rate": 9.729958029490353e-06, "loss": 0.5336, "mean_token_accuracy": 0.8447421163320541, "step": 200 }, { "epoch": 0.399318237156075, "grad_norm": 0.4528083565277609, "learning_rate": 9.701688384391296e-06, "loss": 0.5347, "mean_token_accuracy": 0.8443498685956001, "step": 205 }, { "epoch": 0.40905770635500366, "grad_norm": 0.5259149339810829, "learning_rate": 9.672057044702492e-06, "loss": 0.5199, "mean_token_accuracy": 0.848120279610157, "step": 210 }, { "epoch": 0.4187971755539323, "grad_norm": 0.46836949308672154, "learning_rate": 9.641072592199599e-06, "loss": 0.5219, "mean_token_accuracy": 0.8473641723394394, "step": 215 }, { "epoch": 0.42853664475286096, "grad_norm": 0.5302502163183013, "learning_rate": 9.608744000544392e-06, "loss": 0.5174, "mean_token_accuracy": 0.8485160410404206, "step": 220 }, { "epoch": 0.4382761139517896, "grad_norm": 0.51343914785562, "learning_rate": 9.575080632685832e-06, "loss": 0.5239, "mean_token_accuracy": 0.846913392841816, "step": 225 }, { "epoch": 0.44801558315071827, "grad_norm": 0.5925559680424145, "learning_rate": 9.54009223814837e-06, "loss": 0.5277, "mean_token_accuracy": 0.8454070091247559, "step": 230 }, { "epoch": 0.4577550523496469, "grad_norm": 0.426110283810703, "learning_rate": 9.503788950208324e-06, "loss": 0.5215, "mean_token_accuracy": 0.8473676040768623, "step": 235 }, { "epoch": 0.4674945215485756, "grad_norm": 0.5303378941560718, "learning_rate": 9.466181282959083e-06, "loss": 0.5282, "mean_token_accuracy": 0.8465007901191711, "step": 240 }, { "epoch": 0.4772339907475043, "grad_norm": 0.4965315113021109, "learning_rate": 9.427280128266049e-06, "loss": 0.5179, "mean_token_accuracy": 0.8484022691845894, "step": 245 }, { "epoch": 0.48697345994643293, "grad_norm": 0.44982624201366644, "learning_rate": 9.387096752612144e-06, "loss": 0.5224, "mean_token_accuracy": 0.8463509559631348, "step": 250 }, { "epoch": 0.4967129291453616, "grad_norm": 0.4623459721033935, "learning_rate": 9.345642793834825e-06, "loss": 0.5271, "mean_token_accuracy": 0.8463737353682518, "step": 255 }, { "epoch": 0.5064523983442902, "grad_norm": 0.5825923961149249, "learning_rate": 9.302930257755579e-06, "loss": 0.53, "mean_token_accuracy": 0.8450088694691658, "step": 260 }, { "epoch": 0.5161918675432189, "grad_norm": 0.45940800329264964, "learning_rate": 9.258971514702789e-06, "loss": 0.507, "mean_token_accuracy": 0.8508246764540672, "step": 265 }, { "epoch": 0.5259313367421475, "grad_norm": 0.4465364801256697, "learning_rate": 9.213779295929082e-06, "loss": 0.5087, "mean_token_accuracy": 0.8500014141201973, "step": 270 }, { "epoch": 0.5356708059410762, "grad_norm": 0.4686695606038613, "learning_rate": 9.167366689924116e-06, "loss": 0.5163, "mean_token_accuracy": 0.8484609499573708, "step": 275 }, { "epoch": 0.5454102751400048, "grad_norm": 0.5572145736502692, "learning_rate": 9.119747138623925e-06, "loss": 0.5221, "mean_token_accuracy": 0.8470426678657532, "step": 280 }, { "epoch": 0.5551497443389335, "grad_norm": 0.4765827682250088, "learning_rate": 9.070934433517872e-06, "loss": 0.5068, "mean_token_accuracy": 0.8509115263819694, "step": 285 }, { "epoch": 0.5648892135378621, "grad_norm": 0.49934327207360063, "learning_rate": 9.020942711654404e-06, "loss": 0.5106, "mean_token_accuracy": 0.8498208403587342, "step": 290 }, { "epoch": 0.5746286827367908, "grad_norm": 0.5101324493724546, "learning_rate": 8.969786451546691e-06, "loss": 0.5123, "mean_token_accuracy": 0.8496938437223435, "step": 295 }, { "epoch": 0.5843681519357194, "grad_norm": 0.5089021103596151, "learning_rate": 8.917480468979387e-06, "loss": 0.5128, "mean_token_accuracy": 0.8487787261605263, "step": 300 }, { "epoch": 0.5941076211346482, "grad_norm": 0.5501775930667125, "learning_rate": 8.864039912717713e-06, "loss": 0.5123, "mean_token_accuracy": 0.849444879591465, "step": 305 }, { "epoch": 0.6038470903335769, "grad_norm": 0.41563520519489466, "learning_rate": 8.809480260120096e-06, "loss": 0.5048, "mean_token_accuracy": 0.8513683333992959, "step": 310 }, { "epoch": 0.6135865595325055, "grad_norm": 0.4999791105897695, "learning_rate": 8.753817312655642e-06, "loss": 0.514, "mean_token_accuracy": 0.8484693005681038, "step": 315 }, { "epoch": 0.6233260287314342, "grad_norm": 0.48088239973601826, "learning_rate": 8.697067191327748e-06, "loss": 0.5114, "mean_token_accuracy": 0.8495015501976013, "step": 320 }, { "epoch": 0.6330654979303628, "grad_norm": 0.4440141428026334, "learning_rate": 8.639246332005163e-06, "loss": 0.5064, "mean_token_accuracy": 0.8507678374648094, "step": 325 }, { "epoch": 0.6428049671292915, "grad_norm": 0.5288324873519791, "learning_rate": 8.580371480661857e-06, "loss": 0.5024, "mean_token_accuracy": 0.8514497712254524, "step": 330 }, { "epoch": 0.6525444363282201, "grad_norm": 0.4563934748637001, "learning_rate": 8.520459688527091e-06, "loss": 0.5108, "mean_token_accuracy": 0.849525935947895, "step": 335 }, { "epoch": 0.6622839055271488, "grad_norm": 0.4756063591717307, "learning_rate": 8.459528307147066e-06, "loss": 0.51, "mean_token_accuracy": 0.8501726359128952, "step": 340 }, { "epoch": 0.6720233747260774, "grad_norm": 0.5732019741491505, "learning_rate": 8.397594983359591e-06, "loss": 0.5062, "mean_token_accuracy": 0.850397090613842, "step": 345 }, { "epoch": 0.6817628439250061, "grad_norm": 0.5267653966986828, "learning_rate": 8.334677654183254e-06, "loss": 0.5065, "mean_token_accuracy": 0.8505988359451294, "step": 350 }, { "epoch": 0.6915023131239347, "grad_norm": 0.4796149747636043, "learning_rate": 8.27079454162252e-06, "loss": 0.5028, "mean_token_accuracy": 0.851087860763073, "step": 355 }, { "epoch": 0.7012417823228634, "grad_norm": 0.4809967856664383, "learning_rate": 8.205964147390313e-06, "loss": 0.5084, "mean_token_accuracy": 0.8496400877833367, "step": 360 }, { "epoch": 0.710981251521792, "grad_norm": 0.4348534092140185, "learning_rate": 8.140205247549583e-06, "loss": 0.4983, "mean_token_accuracy": 0.8522587567567825, "step": 365 }, { "epoch": 0.7207207207207207, "grad_norm": 0.4799630529257426, "learning_rate": 8.073536887075417e-06, "loss": 0.5119, "mean_token_accuracy": 0.8493492469191551, "step": 370 }, { "epoch": 0.7304601899196493, "grad_norm": 0.5305052950944196, "learning_rate": 8.005978374339264e-06, "loss": 0.4946, "mean_token_accuracy": 0.8531364649534225, "step": 375 }, { "epoch": 0.740199659118578, "grad_norm": 0.46401531374429855, "learning_rate": 7.937549275516882e-06, "loss": 0.493, "mean_token_accuracy": 0.8535278528928757, "step": 380 }, { "epoch": 0.7499391283175066, "grad_norm": 0.46753323308197925, "learning_rate": 7.868269408921614e-06, "loss": 0.504, "mean_token_accuracy": 0.8511819407343865, "step": 385 }, { "epoch": 0.7596785975164354, "grad_norm": 0.43897436674183826, "learning_rate": 7.798158839264645e-06, "loss": 0.4983, "mean_token_accuracy": 0.8521765768527985, "step": 390 }, { "epoch": 0.7694180667153641, "grad_norm": 0.4030095081507055, "learning_rate": 7.7272378718439e-06, "loss": 0.5092, "mean_token_accuracy": 0.8500049978494644, "step": 395 }, { "epoch": 0.7791575359142927, "grad_norm": 0.393465790048579, "learning_rate": 7.655527046663254e-06, "loss": 0.5022, "mean_token_accuracy": 0.8510702222585678, "step": 400 }, { "epoch": 0.7888970051132214, "grad_norm": 0.49269595855244275, "learning_rate": 7.5830471324837765e-06, "loss": 0.4945, "mean_token_accuracy": 0.8529314771294594, "step": 405 }, { "epoch": 0.79863647431215, "grad_norm": 0.4729001525106615, "learning_rate": 7.5098191208087144e-06, "loss": 0.5, "mean_token_accuracy": 0.8525989070534706, "step": 410 }, { "epoch": 0.8083759435110787, "grad_norm": 0.44720074409020516, "learning_rate": 7.4358642198039835e-06, "loss": 0.4946, "mean_token_accuracy": 0.8530281245708465, "step": 415 }, { "epoch": 0.8181154127100073, "grad_norm": 0.4192773521625985, "learning_rate": 7.36120384815588e-06, "loss": 0.4927, "mean_token_accuracy": 0.8539465010166168, "step": 420 }, { "epoch": 0.827854881908936, "grad_norm": 0.41037683317185314, "learning_rate": 7.285859628867851e-06, "loss": 0.4952, "mean_token_accuracy": 0.8532393842935562, "step": 425 }, { "epoch": 0.8375943511078646, "grad_norm": 0.4536782151813187, "learning_rate": 7.209853382998077e-06, "loss": 0.4983, "mean_token_accuracy": 0.8528945103287697, "step": 430 }, { "epoch": 0.8473338203067933, "grad_norm": 0.4895833290473578, "learning_rate": 7.133207123339689e-06, "loss": 0.4939, "mean_token_accuracy": 0.853129243850708, "step": 435 }, { "epoch": 0.8570732895057219, "grad_norm": 0.4771689532642086, "learning_rate": 7.055943048045476e-06, "loss": 0.5002, "mean_token_accuracy": 0.8518661975860595, "step": 440 }, { "epoch": 0.8668127587046506, "grad_norm": 0.4439177979495324, "learning_rate": 6.978083534198878e-06, "loss": 0.4977, "mean_token_accuracy": 0.8526063248515129, "step": 445 }, { "epoch": 0.8765522279035792, "grad_norm": 0.45795986197220584, "learning_rate": 6.899651131333194e-06, "loss": 0.4876, "mean_token_accuracy": 0.8546572834253311, "step": 450 }, { "epoch": 0.8862916971025079, "grad_norm": 0.38084088770564956, "learning_rate": 6.82066855490081e-06, "loss": 0.5009, "mean_token_accuracy": 0.851148933172226, "step": 455 }, { "epoch": 0.8960311663014365, "grad_norm": 0.4552737931829716, "learning_rate": 6.741158679694403e-06, "loss": 0.4968, "mean_token_accuracy": 0.8524380102753639, "step": 460 }, { "epoch": 0.9057706355003652, "grad_norm": 0.38831398930391275, "learning_rate": 6.661144533221974e-06, "loss": 0.4897, "mean_token_accuracy": 0.8537176489830017, "step": 465 }, { "epoch": 0.9155101046992938, "grad_norm": 0.42794585960634596, "learning_rate": 6.58064928903767e-06, "loss": 0.4942, "mean_token_accuracy": 0.8529506504535675, "step": 470 }, { "epoch": 0.9252495738982226, "grad_norm": 0.42890975667323755, "learning_rate": 6.499696260030297e-06, "loss": 0.5064, "mean_token_accuracy": 0.8502799227833748, "step": 475 }, { "epoch": 0.9349890430971513, "grad_norm": 0.4478332747744812, "learning_rate": 6.418308891671484e-06, "loss": 0.4855, "mean_token_accuracy": 0.8555838361382484, "step": 480 }, { "epoch": 0.9447285122960799, "grad_norm": 0.435211802978247, "learning_rate": 6.336510755225447e-06, "loss": 0.4835, "mean_token_accuracy": 0.8558964654803276, "step": 485 }, { "epoch": 0.9544679814950086, "grad_norm": 0.4462726381437146, "learning_rate": 6.25432554092232e-06, "loss": 0.4898, "mean_token_accuracy": 0.8544702440500259, "step": 490 }, { "epoch": 0.9642074506939372, "grad_norm": 0.44503915866443544, "learning_rate": 6.171777051097037e-06, "loss": 0.4858, "mean_token_accuracy": 0.8554443955421448, "step": 495 }, { "epoch": 0.9739469198928659, "grad_norm": 0.47966098493432496, "learning_rate": 6.088889193295738e-06, "loss": 0.4929, "mean_token_accuracy": 0.8535514727234841, "step": 500 }, { "epoch": 0.9836863890917945, "grad_norm": 0.5437585793823457, "learning_rate": 6.005685973351708e-06, "loss": 0.4931, "mean_token_accuracy": 0.8531181156635285, "step": 505 }, { "epoch": 0.9934258582907232, "grad_norm": 0.4124694681433172, "learning_rate": 5.922191488432857e-06, "loss": 0.4917, "mean_token_accuracy": 0.8535135626792908, "step": 510 }, { "epoch": 1.0019478938397857, "grad_norm": 0.803029283965446, "learning_rate": 5.838429920062734e-06, "loss": 0.4727, "mean_token_accuracy": 0.8567549926894051, "step": 515 }, { "epoch": 1.0116873630387144, "grad_norm": 0.3927635667112885, "learning_rate": 5.754425527117118e-06, "loss": 0.4479, "mean_token_accuracy": 0.864441742002964, "step": 520 }, { "epoch": 1.021426832237643, "grad_norm": 0.45928387456979786, "learning_rate": 5.670202638798213e-06, "loss": 0.4598, "mean_token_accuracy": 0.8615871027112008, "step": 525 }, { "epoch": 1.0311663014365717, "grad_norm": 0.517389580126703, "learning_rate": 5.585785647588458e-06, "loss": 0.4572, "mean_token_accuracy": 0.8620010375976562, "step": 530 }, { "epoch": 1.0409057706355003, "grad_norm": 0.37886924917283415, "learning_rate": 5.501199002186024e-06, "loss": 0.455, "mean_token_accuracy": 0.8625968441367149, "step": 535 }, { "epoch": 1.050645239834429, "grad_norm": 0.39773153996958316, "learning_rate": 5.416467200424032e-06, "loss": 0.45, "mean_token_accuracy": 0.8637859463691712, "step": 540 }, { "epoch": 1.0603847090333576, "grad_norm": 0.40508395528042257, "learning_rate": 5.33161478217552e-06, "loss": 0.4516, "mean_token_accuracy": 0.8634377360343933, "step": 545 }, { "epoch": 1.0701241782322863, "grad_norm": 0.37429603209812456, "learning_rate": 5.246666322246267e-06, "loss": 0.4445, "mean_token_accuracy": 0.8651037693023682, "step": 550 }, { "epoch": 1.079863647431215, "grad_norm": 0.47125189912053295, "learning_rate": 5.1616464232574635e-06, "loss": 0.4626, "mean_token_accuracy": 0.8602706581354141, "step": 555 }, { "epoch": 1.0896031166301436, "grad_norm": 0.435383825468146, "learning_rate": 5.076579708520355e-06, "loss": 0.4497, "mean_token_accuracy": 0.8635805040597916, "step": 560 }, { "epoch": 1.0993425858290724, "grad_norm": 0.412421290706311, "learning_rate": 4.991490814904888e-06, "loss": 0.4378, "mean_token_accuracy": 0.8668369174003601, "step": 565 }, { "epoch": 1.109082055028001, "grad_norm": 0.41209358309681204, "learning_rate": 4.906404385704402e-06, "loss": 0.4525, "mean_token_accuracy": 0.8637306377291679, "step": 570 }, { "epoch": 1.1188215242269297, "grad_norm": 0.3821281290056491, "learning_rate": 4.82134506349851e-06, "loss": 0.4564, "mean_token_accuracy": 0.8625956058502198, "step": 575 }, { "epoch": 1.1285609934258582, "grad_norm": 0.34980242368851916, "learning_rate": 4.736337483016138e-06, "loss": 0.4513, "mean_token_accuracy": 0.8634121060371399, "step": 580 }, { "epoch": 1.138300462624787, "grad_norm": 0.39297932611221925, "learning_rate": 4.651406264000871e-06, "loss": 0.4512, "mean_token_accuracy": 0.8632175624370575, "step": 585 }, { "epoch": 1.1480399318237156, "grad_norm": 0.4317512663581136, "learning_rate": 4.5665760040806174e-06, "loss": 0.4558, "mean_token_accuracy": 0.8621700823307037, "step": 590 }, { "epoch": 1.1577794010226443, "grad_norm": 0.4166519413744608, "learning_rate": 4.481871271643698e-06, "loss": 0.4543, "mean_token_accuracy": 0.8628953084349632, "step": 595 }, { "epoch": 1.1675188702215729, "grad_norm": 0.3817664896689048, "learning_rate": 4.397316598723385e-06, "loss": 0.4599, "mean_token_accuracy": 0.8614401906728745, "step": 600 }, { "epoch": 1.1772583394205016, "grad_norm": 0.39223028227054235, "learning_rate": 4.312936473892984e-06, "loss": 0.4559, "mean_token_accuracy": 0.8621881052851676, "step": 605 }, { "epoch": 1.1869978086194302, "grad_norm": 0.39870192645434044, "learning_rate": 4.228755335173488e-06, "loss": 0.4554, "mean_token_accuracy": 0.8622724115848541, "step": 610 }, { "epoch": 1.196737277818359, "grad_norm": 0.3968827579618214, "learning_rate": 4.1447975629559e-06, "loss": 0.4496, "mean_token_accuracy": 0.8639340966939926, "step": 615 }, { "epoch": 1.2064767470172875, "grad_norm": 0.3525974840125754, "learning_rate": 4.061087472940204e-06, "loss": 0.4468, "mean_token_accuracy": 0.8643500834703446, "step": 620 }, { "epoch": 1.2162162162162162, "grad_norm": 0.34784808537613615, "learning_rate": 3.977649309093113e-06, "loss": 0.4463, "mean_token_accuracy": 0.8645710095763206, "step": 625 }, { "epoch": 1.225955685415145, "grad_norm": 0.3588527258157974, "learning_rate": 3.89450723662657e-06, "loss": 0.4517, "mean_token_accuracy": 0.863483439385891, "step": 630 }, { "epoch": 1.2356951546140735, "grad_norm": 0.495886348965792, "learning_rate": 3.8116853349990574e-06, "loss": 0.4507, "mean_token_accuracy": 0.8636451244354248, "step": 635 }, { "epoch": 1.245434623813002, "grad_norm": 0.3842625748309975, "learning_rate": 3.729207590941753e-06, "loss": 0.4377, "mean_token_accuracy": 0.8670691177248955, "step": 640 }, { "epoch": 1.2551740930119308, "grad_norm": 0.4426936471853252, "learning_rate": 3.647097891511536e-06, "loss": 0.4495, "mean_token_accuracy": 0.8640359625220299, "step": 645 }, { "epoch": 1.2649135622108596, "grad_norm": 0.3707914127374137, "learning_rate": 3.565380017172854e-06, "loss": 0.4397, "mean_token_accuracy": 0.8666136890649796, "step": 650 }, { "epoch": 1.2746530314097881, "grad_norm": 0.35267532874755536, "learning_rate": 3.4840776349104755e-06, "loss": 0.4539, "mean_token_accuracy": 0.8626947477459908, "step": 655 }, { "epoch": 1.284392500608717, "grad_norm": 0.3749743227529099, "learning_rate": 3.4032142913750956e-06, "loss": 0.4497, "mean_token_accuracy": 0.8637418314814568, "step": 660 }, { "epoch": 1.2941319698076454, "grad_norm": 0.37600006002277636, "learning_rate": 3.322813406063794e-06, "loss": 0.4559, "mean_token_accuracy": 0.8622224271297455, "step": 665 }, { "epoch": 1.3038714390065742, "grad_norm": 0.4005804398308692, "learning_rate": 3.242898264537331e-06, "loss": 0.4521, "mean_token_accuracy": 0.8632362619042396, "step": 670 }, { "epoch": 1.3136109082055027, "grad_norm": 0.38630202996272256, "learning_rate": 3.1634920116762175e-06, "loss": 0.4499, "mean_token_accuracy": 0.8635998621582985, "step": 675 }, { "epoch": 1.3233503774044315, "grad_norm": 0.36180685239531096, "learning_rate": 3.0846176449775363e-06, "loss": 0.4508, "mean_token_accuracy": 0.8636634424328804, "step": 680 }, { "epoch": 1.33308984660336, "grad_norm": 0.3357931590954893, "learning_rate": 3.0062980078944515e-06, "loss": 0.4379, "mean_token_accuracy": 0.8665053129196167, "step": 685 }, { "epoch": 1.3428293158022888, "grad_norm": 0.35662293115272325, "learning_rate": 2.9285557832203328e-06, "loss": 0.4458, "mean_token_accuracy": 0.8648849859833717, "step": 690 }, { "epoch": 1.3525687850012174, "grad_norm": 0.35254913645932934, "learning_rate": 2.851413486519388e-06, "loss": 0.4413, "mean_token_accuracy": 0.8654858738183975, "step": 695 }, { "epoch": 1.3623082542001461, "grad_norm": 0.3448432687225989, "learning_rate": 2.774893459605766e-06, "loss": 0.4431, "mean_token_accuracy": 0.86555365473032, "step": 700 }, { "epoch": 1.3720477233990747, "grad_norm": 0.35405617501926967, "learning_rate": 2.69901786407295e-06, "loss": 0.444, "mean_token_accuracy": 0.8651208564639091, "step": 705 }, { "epoch": 1.3817871925980034, "grad_norm": 0.36859590132072323, "learning_rate": 2.6238086748753587e-06, "loss": 0.456, "mean_token_accuracy": 0.8627100110054016, "step": 710 }, { "epoch": 1.3915266617969322, "grad_norm": 0.35246470987862855, "learning_rate": 2.5492876739639912e-06, "loss": 0.4533, "mean_token_accuracy": 0.8628792524337768, "step": 715 }, { "epoch": 1.4012661309958607, "grad_norm": 0.36351033252725057, "learning_rate": 2.475476443977996e-06, "loss": 0.4469, "mean_token_accuracy": 0.8642540082335473, "step": 720 }, { "epoch": 1.4110056001947893, "grad_norm": 0.35938233389923585, "learning_rate": 2.40239636199393e-06, "loss": 0.4451, "mean_token_accuracy": 0.8650069192051888, "step": 725 }, { "epoch": 1.420745069393718, "grad_norm": 0.3673326064574297, "learning_rate": 2.3300685933345656e-06, "loss": 0.4485, "mean_token_accuracy": 0.8642319470643998, "step": 730 }, { "epoch": 1.4304845385926468, "grad_norm": 0.37166399663475924, "learning_rate": 2.2585140854390432e-06, "loss": 0.4496, "mean_token_accuracy": 0.8641144469380379, "step": 735 }, { "epoch": 1.4402240077915753, "grad_norm": 0.34969080725493995, "learning_rate": 2.187753561796097e-06, "loss": 0.449, "mean_token_accuracy": 0.8638374775648117, "step": 740 }, { "epoch": 1.449963476990504, "grad_norm": 0.33346605175419364, "learning_rate": 2.117807515942163e-06, "loss": 0.4487, "mean_token_accuracy": 0.8639461770653725, "step": 745 }, { "epoch": 1.4597029461894326, "grad_norm": 0.3402282068550279, "learning_rate": 2.0486962055260744e-06, "loss": 0.4381, "mean_token_accuracy": 0.8668898791074753, "step": 750 }, { "epoch": 1.4694424153883614, "grad_norm": 0.3454615445905308, "learning_rate": 1.9804396464420798e-06, "loss": 0.4407, "mean_token_accuracy": 0.8662415385246277, "step": 755 }, { "epoch": 1.47918188458729, "grad_norm": 0.3815154168732918, "learning_rate": 1.9130576070328695e-06, "loss": 0.4502, "mean_token_accuracy": 0.863922019302845, "step": 760 }, { "epoch": 1.4889213537862187, "grad_norm": 0.35782414062256107, "learning_rate": 1.8465696023643115e-06, "loss": 0.4484, "mean_token_accuracy": 0.8640252217650414, "step": 765 }, { "epoch": 1.4986608229851472, "grad_norm": 0.34966179386580715, "learning_rate": 1.7809948885735295e-06, "loss": 0.4476, "mean_token_accuracy": 0.8640663206577301, "step": 770 }, { "epoch": 1.508400292184076, "grad_norm": 0.3283826370466805, "learning_rate": 1.7163524572919748e-06, "loss": 0.4535, "mean_token_accuracy": 0.8629004299640656, "step": 775 }, { "epoch": 1.5181397613830048, "grad_norm": 0.3309412101835561, "learning_rate": 1.6526610301451028e-06, "loss": 0.4374, "mean_token_accuracy": 0.8666589662432671, "step": 780 }, { "epoch": 1.5278792305819333, "grad_norm": 0.35820157894670385, "learning_rate": 1.5899390533302538e-06, "loss": 0.4387, "mean_token_accuracy": 0.8667290091514588, "step": 785 }, { "epoch": 1.5376186997808619, "grad_norm": 0.34472450626785944, "learning_rate": 1.5282046922742876e-06, "loss": 0.4502, "mean_token_accuracy": 0.8635128363966942, "step": 790 }, { "epoch": 1.5473581689797906, "grad_norm": 0.31690650065908227, "learning_rate": 1.4674758263725614e-06, "loss": 0.4461, "mean_token_accuracy": 0.8644041374325753, "step": 795 }, { "epoch": 1.5570976381787194, "grad_norm": 0.3261992242400567, "learning_rate": 1.4077700438107183e-06, "loss": 0.445, "mean_token_accuracy": 0.8651425749063492, "step": 800 }, { "epoch": 1.566837107377648, "grad_norm": 0.37132210271715693, "learning_rate": 1.3491046364708294e-06, "loss": 0.445, "mean_token_accuracy": 0.8648254871368408, "step": 805 }, { "epoch": 1.5765765765765765, "grad_norm": 0.34739884194127096, "learning_rate": 1.2914965949233572e-06, "loss": 0.4474, "mean_token_accuracy": 0.8643729150295257, "step": 810 }, { "epoch": 1.5863160457755052, "grad_norm": 0.3092847696386134, "learning_rate": 1.2349626035063705e-06, "loss": 0.4389, "mean_token_accuracy": 0.8666358023881913, "step": 815 }, { "epoch": 1.596055514974434, "grad_norm": 0.33368514787450476, "learning_rate": 1.1795190354934587e-06, "loss": 0.4635, "mean_token_accuracy": 0.8606240957975387, "step": 820 }, { "epoch": 1.6057949841733625, "grad_norm": 0.3355701607651583, "learning_rate": 1.1251819483517334e-06, "loss": 0.4469, "mean_token_accuracy": 0.8647612199187279, "step": 825 }, { "epoch": 1.615534453372291, "grad_norm": 0.3215469441235877, "learning_rate": 1.0719670790912928e-06, "loss": 0.4479, "mean_token_accuracy": 0.8641064539551735, "step": 830 }, { "epoch": 1.6252739225712198, "grad_norm": 0.3331920310980409, "learning_rate": 1.019889839707498e-06, "loss": 0.447, "mean_token_accuracy": 0.8645351231098175, "step": 835 }, { "epoch": 1.6350133917701486, "grad_norm": 0.3311460501397384, "learning_rate": 9.689653127173743e-07, "loss": 0.4548, "mean_token_accuracy": 0.8624306350946427, "step": 840 }, { "epoch": 1.6447528609690771, "grad_norm": 0.3387701231769685, "learning_rate": 9.192082467914465e-07, "loss": 0.444, "mean_token_accuracy": 0.8649628892540931, "step": 845 }, { "epoch": 1.654492330168006, "grad_norm": 0.33969198086970775, "learning_rate": 8.706330524822548e-07, "loss": 0.4413, "mean_token_accuracy": 0.865912164747715, "step": 850 }, { "epoch": 1.6642317993669344, "grad_norm": 0.32626435288941413, "learning_rate": 8.232537980507848e-07, "loss": 0.4454, "mean_token_accuracy": 0.8650734156370163, "step": 855 }, { "epoch": 1.6739712685658632, "grad_norm": 0.32534723430048623, "learning_rate": 7.770842053920585e-07, "loss": 0.4424, "mean_token_accuracy": 0.8653772249817848, "step": 860 }, { "epoch": 1.683710737764792, "grad_norm": 0.33973250353406415, "learning_rate": 7.321376460610136e-07, "loss": 0.4398, "mean_token_accuracy": 0.865781269967556, "step": 865 }, { "epoch": 1.6934502069637205, "grad_norm": 0.31477206750092435, "learning_rate": 6.884271373998608e-07, "loss": 0.4402, "mean_token_accuracy": 0.8660067468881607, "step": 870 }, { "epoch": 1.703189676162649, "grad_norm": 0.3537664329765484, "learning_rate": 6.459653387680248e-07, "loss": 0.4426, "mean_token_accuracy": 0.8656087100505829, "step": 875 }, { "epoch": 1.7129291453615778, "grad_norm": 0.32858889216418335, "learning_rate": 6.047645478757635e-07, "loss": 0.4367, "mean_token_accuracy": 0.8670719146728516, "step": 880 }, { "epoch": 1.7226686145605066, "grad_norm": 0.32864732265316704, "learning_rate": 5.648366972225222e-07, "loss": 0.4527, "mean_token_accuracy": 0.8630405649542808, "step": 885 }, { "epoch": 1.7324080837594351, "grad_norm": 0.3173505795175411, "learning_rate": 5.261933506410722e-07, "loss": 0.4401, "mean_token_accuracy": 0.8665132194757461, "step": 890 }, { "epoch": 1.7421475529583637, "grad_norm": 0.36040952773775503, "learning_rate": 4.888456999484098e-07, "loss": 0.4465, "mean_token_accuracy": 0.8646954327821732, "step": 895 }, { "epoch": 1.7518870221572924, "grad_norm": 0.31604383070212816, "learning_rate": 4.528045617044019e-07, "loss": 0.443, "mean_token_accuracy": 0.8652036920189857, "step": 900 }, { "epoch": 1.7616264913562212, "grad_norm": 0.31389323834734445, "learning_rate": 4.180803740791156e-07, "loss": 0.4426, "mean_token_accuracy": 0.8656812936067582, "step": 905 }, { "epoch": 1.7713659605551497, "grad_norm": 0.3097110381509705, "learning_rate": 3.846831938297324e-07, "loss": 0.4468, "mean_token_accuracy": 0.8643257409334183, "step": 910 }, { "epoch": 1.7811054297540783, "grad_norm": 0.31179540512244613, "learning_rate": 3.5262269338792623e-07, "loss": 0.4447, "mean_token_accuracy": 0.8651932507753373, "step": 915 }, { "epoch": 1.790844898953007, "grad_norm": 0.33502042463366694, "learning_rate": 3.219081580585548e-07, "loss": 0.4508, "mean_token_accuracy": 0.863429008424282, "step": 920 }, { "epoch": 1.8005843681519358, "grad_norm": 0.3164739894198197, "learning_rate": 2.9254848333046817e-07, "loss": 0.4528, "mean_token_accuracy": 0.8630515649914742, "step": 925 }, { "epoch": 1.8103238373508643, "grad_norm": 0.3030075688047627, "learning_rate": 2.645521723002037e-07, "loss": 0.4507, "mean_token_accuracy": 0.8635053560137749, "step": 930 }, { "epoch": 1.8200633065497929, "grad_norm": 0.3371646202292577, "learning_rate": 2.3792733320934348e-07, "loss": 0.4441, "mean_token_accuracy": 0.8654543533921242, "step": 935 }, { "epoch": 1.8298027757487216, "grad_norm": 0.3074575906934633, "learning_rate": 2.12681677096217e-07, "loss": 0.4351, "mean_token_accuracy": 0.8675669968128205, "step": 940 }, { "epoch": 1.8395422449476504, "grad_norm": 0.30631898093344184, "learning_rate": 1.888225155626433e-07, "loss": 0.444, "mean_token_accuracy": 0.8653656959533691, "step": 945 }, { "epoch": 1.8492817141465792, "grad_norm": 0.2958594503607962, "learning_rate": 1.6635675865635859e-07, "loss": 0.4568, "mean_token_accuracy": 0.8619549512863159, "step": 950 }, { "epoch": 1.8590211833455077, "grad_norm": 0.31258753396965294, "learning_rate": 1.4529091286973994e-07, "loss": 0.444, "mean_token_accuracy": 0.8656645834445953, "step": 955 }, { "epoch": 1.8687606525444362, "grad_norm": 0.30999125345039114, "learning_rate": 1.2563107925540774e-07, "loss": 0.4444, "mean_token_accuracy": 0.8654705569148063, "step": 960 }, { "epoch": 1.878500121743365, "grad_norm": 0.3160340098992061, "learning_rate": 1.0738295165924783e-07, "loss": 0.4459, "mean_token_accuracy": 0.8648449763655662, "step": 965 }, { "epoch": 1.8882395909422938, "grad_norm": 0.3187758679651324, "learning_rate": 9.055181507137245e-08, "loss": 0.444, "mean_token_accuracy": 0.8654528453946113, "step": 970 }, { "epoch": 1.8979790601412223, "grad_norm": 0.33617665839091787, "learning_rate": 7.514254409549005e-08, "loss": 0.4481, "mean_token_accuracy": 0.8641701668500901, "step": 975 }, { "epoch": 1.9077185293401508, "grad_norm": 0.31232672260705024, "learning_rate": 6.115960153712963e-08, "loss": 0.4414, "mean_token_accuracy": 0.8660656422376632, "step": 980 }, { "epoch": 1.9174579985390796, "grad_norm": 0.303119007372895, "learning_rate": 4.860703711113246e-08, "loss": 0.4362, "mean_token_accuracy": 0.8676238685846329, "step": 985 }, { "epoch": 1.9271974677380084, "grad_norm": 0.30341116951092534, "learning_rate": 3.748848626878132e-08, "loss": 0.4508, "mean_token_accuracy": 0.8636912703514099, "step": 990 }, { "epoch": 1.936936936936937, "grad_norm": 0.31515978754450974, "learning_rate": 2.7807169144906108e-08, "loss": 0.4528, "mean_token_accuracy": 0.8630853027105332, "step": 995 }, { "epoch": 1.9466764061358655, "grad_norm": 0.30748596132213357, "learning_rate": 1.9565889625275945e-08, "loss": 0.4558, "mean_token_accuracy": 0.8628044292330742, "step": 1000 }, { "epoch": 1.9564158753347942, "grad_norm": 0.30248992557073057, "learning_rate": 1.2767034534540978e-08, "loss": 0.4467, "mean_token_accuracy": 0.86439578384161, "step": 1005 }, { "epoch": 1.966155344533723, "grad_norm": 0.3256862744869616, "learning_rate": 7.412572944965335e-09, "loss": 0.4548, "mean_token_accuracy": 0.8625592529773712, "step": 1010 }, { "epoch": 1.9758948137326515, "grad_norm": 0.3147340514506444, "learning_rate": 3.5040556061483043e-09, "loss": 0.4422, "mean_token_accuracy": 0.8657747611403466, "step": 1015 }, { "epoch": 1.98563428293158, "grad_norm": 0.32764257560082577, "learning_rate": 1.0426144958985974e-09, "loss": 0.4419, "mean_token_accuracy": 0.8660721600055694, "step": 1020 }, { "epoch": 1.9953737521305088, "grad_norm": 0.3098108021812636, "learning_rate": 2.8962492393258546e-11, "loss": 0.4452, "mean_token_accuracy": 0.8650736406445503, "step": 1025 }, { "epoch": 1.9973216459702945, "mean_token_accuracy": 0.8681519404053688, "step": 1026, "total_flos": 1074983740637184.0, "train_loss": 0.49954891321022377, "train_runtime": 168401.8035, "train_samples_per_second": 0.78, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 1026, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1074983740637184.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }