diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,3495 +2,1180 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 6.0, + "epoch": 1.0, "eval_steps": 500, - "global_step": 1926, + "global_step": 640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.015600624024960999, - "grad_norm": 129.38328725317604, - "learning_rate": 4.1450777202072546e-07, - "loss": 2.516, - "mean_token_accuracy": 0.5559801399707794, - "num_tokens": 133460.0, + "epoch": 0.007816316560820713, + "grad_norm": 69.75188475573104, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.5559, + "mean_token_accuracy": 0.6205124616622925, + "num_tokens": 5924597.0, "step": 5 }, { - "epoch": 0.031201248049921998, - "grad_norm": 109.78138881563702, - "learning_rate": 9.326424870466322e-07, - "loss": 2.4202, - "mean_token_accuracy": 0.5638172149658203, - "num_tokens": 269026.0, + "epoch": 0.015632633121641426, + "grad_norm": 37.890360906113195, + "learning_rate": 4.5e-06, + "loss": 2.096, + "mean_token_accuracy": 0.6493684396147728, + "num_tokens": 11853358.0, "step": 10 }, { - "epoch": 0.046801872074883, - "grad_norm": 21.95808254821198, - "learning_rate": 1.4507772020725389e-06, - "loss": 2.0485, - "mean_token_accuracy": 0.6104433059692382, - "num_tokens": 401447.0, + "epoch": 0.02344894968246214, + "grad_norm": 37.89921398442958, + "learning_rate": 7e-06, + "loss": 1.2923, + "mean_token_accuracy": 0.699188905954361, + "num_tokens": 17774953.0, "step": 15 }, { - "epoch": 0.062402496099843996, - "grad_norm": 14.11423255160202, - "learning_rate": 1.968911917098446e-06, - "loss": 1.6362, - "mean_token_accuracy": 0.6654394745826722, - "num_tokens": 536684.0, + "epoch": 0.03126526624328285, + "grad_norm": 3.9611680059042, + "learning_rate": 9.5e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.736442020535469, + "num_tokens": 23743902.0, "step": 20 }, { - "epoch": 0.078003120124805, - "grad_norm": 9.622340375457814, - "learning_rate": 2.4870466321243523e-06, - "loss": 1.2735, - "mean_token_accuracy": 0.7199482321739197, - "num_tokens": 670646.0, + "epoch": 0.039081582804103565, + "grad_norm": 6.339716255577251, + "learning_rate": 9.999075719055307e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7516884453594684, + "num_tokens": 29670699.0, "step": 25 }, { - "epoch": 0.093603744149766, - "grad_norm": 9.306287977685484, - "learning_rate": 3.0051813471502592e-06, - "loss": 0.8738, - "mean_token_accuracy": 0.7971087396144867, - "num_tokens": 804318.0, + "epoch": 0.04689789936492428, + "grad_norm": 3.2460756840648544, + "learning_rate": 9.995321478440751e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7534175351262092, + "num_tokens": 35627815.0, "step": 30 }, { - "epoch": 0.10920436817472699, - "grad_norm": 1.3713338593715485, - "learning_rate": 3.5233160621761657e-06, - "loss": 0.6686, - "mean_token_accuracy": 0.8349042236804962, - "num_tokens": 938579.0, + "epoch": 0.05471421592574499, + "grad_norm": 3.4636834905707827, + "learning_rate": 9.988681918400355e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7533099494874478, + "num_tokens": 41558055.0, "step": 35 }, { - "epoch": 0.12480499219968799, - "grad_norm": 1.0749561589307342, - "learning_rate": 4.041450777202073e-06, - "loss": 0.5967, - "mean_token_accuracy": 0.8444766402244568, - "num_tokens": 1071445.0, + "epoch": 0.0625305324865657, + "grad_norm": 2.995605141331059, + "learning_rate": 9.9791613005318e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7614992260932922, + "num_tokens": 47493638.0, "step": 40 }, { - "epoch": 0.14040561622464898, - "grad_norm": 0.9496605192114075, - "learning_rate": 4.55958549222798e-06, - "loss": 0.589, - "mean_token_accuracy": 0.8443255186080932, - "num_tokens": 1208671.0, + "epoch": 0.07034684904738642, + "grad_norm": 3.1347703455639584, + "learning_rate": 9.966765735638018e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7702661901712418, + "num_tokens": 53415769.0, "step": 45 }, { - "epoch": 0.15600624024961, - "grad_norm": 0.9337552437064247, - "learning_rate": 5.077720207253887e-06, - "loss": 0.5467, - "mean_token_accuracy": 0.8530825853347779, - "num_tokens": 1340184.0, + "epoch": 0.07816316560820713, + "grad_norm": 4.194109438368802, + "learning_rate": 9.951503179804989e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7751710690557957, + "num_tokens": 59313984.0, "step": 50 }, { - "epoch": 0.17160686427457097, - "grad_norm": 0.8238179306235244, - "learning_rate": 5.5958549222797934e-06, - "loss": 0.5285, - "mean_token_accuracy": 0.8581411242485046, - "num_tokens": 1472408.0, + "epoch": 0.08597948216902784, + "grad_norm": 2.5694956316167405, + "learning_rate": 9.933383429295124e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7756987683475017, + "num_tokens": 65218898.0, "step": 55 }, { - "epoch": 0.187207488299532, - "grad_norm": 0.8310047027070511, - "learning_rate": 6.113989637305699e-06, - "loss": 0.5171, - "mean_token_accuracy": 0.8576966345310211, - "num_tokens": 1607538.0, + "epoch": 0.09379579872984856, + "grad_norm": 2.4022434258446457, + "learning_rate": 9.912418114259548e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7788987644016743, + "num_tokens": 71140560.0, "step": 60 }, { - "epoch": 0.20280811232449297, - "grad_norm": 0.9786821516730765, - "learning_rate": 6.632124352331607e-06, - "loss": 0.4964, - "mean_token_accuracy": 0.8631992697715759, - "num_tokens": 1742785.0, + "epoch": 0.10161211529066927, + "grad_norm": 2.0354431025023314, + "learning_rate": 9.888620691273284e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7819231644272804, + "num_tokens": 77078478.0, "step": 65 }, { - "epoch": 0.21840873634945399, - "grad_norm": 0.9411599854617563, - "learning_rate": 7.150259067357514e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8622171819210053, - "num_tokens": 1878332.0, + "epoch": 0.10942843185148998, + "grad_norm": 1.88497692389339, + "learning_rate": 9.862006434698169e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7847778849303723, + "num_tokens": 82996564.0, "step": 70 }, { - "epoch": 0.23400936037441497, - "grad_norm": 1.0340069668875889, - "learning_rate": 7.66839378238342e-06, - "loss": 0.4678, - "mean_token_accuracy": 0.8705899536609649, - "num_tokens": 2013744.0, + "epoch": 0.1172447484123107, + "grad_norm": 1.9200062767141564, + "learning_rate": 9.832592426879006e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7845040634274483, + "num_tokens": 88951976.0, "step": 75 }, { - "epoch": 0.24960998439937598, - "grad_norm": 0.9135002582214133, - "learning_rate": 8.186528497409328e-06, - "loss": 0.4413, - "mean_token_accuracy": 0.8774827301502228, - "num_tokens": 2147559.0, + "epoch": 0.1250610649731314, + "grad_norm": 2.210483530160085, + "learning_rate": 9.800397547179276e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.7839573793113231, + "num_tokens": 94895652.0, "step": 80 }, { - "epoch": 0.26521060842433697, - "grad_norm": 0.9713464125782284, - "learning_rate": 8.704663212435233e-06, - "loss": 0.4213, - "mean_token_accuracy": 0.8831651329994201, - "num_tokens": 2282653.0, + "epoch": 0.13287738153395212, + "grad_norm": 2.0791394687934845, + "learning_rate": 9.765442459863428e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7845308348536492, + "num_tokens": 100832079.0, "step": 85 }, { - "epoch": 0.28081123244929795, - "grad_norm": 1.258905573505298, - "learning_rate": 9.22279792746114e-06, - "loss": 0.3952, - "mean_token_accuracy": 0.8886785268783569, - "num_tokens": 2417343.0, + "epoch": 0.14069369809477283, + "grad_norm": 2.148171577637037, + "learning_rate": 9.72774960083353e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7805382929742336, + "num_tokens": 106787613.0, "step": 90 }, { - "epoch": 0.296411856474259, - "grad_norm": 1.0236538285858234, - "learning_rate": 9.740932642487048e-06, - "loss": 0.3721, - "mean_token_accuracy": 0.8954536736011505, - "num_tokens": 2552005.0, + "epoch": 0.14851001465559355, + "grad_norm": 2.0898108754881326, + "learning_rate": 9.687343163228806e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7865298599004745, + "num_tokens": 112717599.0, "step": 95 }, { - "epoch": 0.31201248049922, - "grad_norm": 1.0098528095352195, - "learning_rate": 1.0259067357512955e-05, - "loss": 0.3399, - "mean_token_accuracy": 0.9048069655895233, - "num_tokens": 2687951.0, + "epoch": 0.15632633121641426, + "grad_norm": 4.0822591130362955, + "learning_rate": 9.644249081897277e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.7872977338731288, + "num_tokens": 118650442.0, "step": 100 }, { - "epoch": 0.32761310452418096, - "grad_norm": 1.04186438377383, - "learning_rate": 1.0777202072538861e-05, - "loss": 0.3326, - "mean_token_accuracy": 0.905648124217987, - "num_tokens": 2820443.0, + "epoch": 0.16414264777723497, + "grad_norm": 2.441738444276975, + "learning_rate": 9.598495016749493e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7859964817762375, + "num_tokens": 124574766.0, "step": 105 }, { - "epoch": 0.34321372854914195, - "grad_norm": 1.032367777542592, - "learning_rate": 1.1295336787564768e-05, - "loss": 0.3141, - "mean_token_accuracy": 0.9102253496646882, - "num_tokens": 2955262.0, + "epoch": 0.1719589643380557, + "grad_norm": 2.075995258307634, + "learning_rate": 9.55011033500505e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.787811417132616, + "num_tokens": 130516200.0, "step": 110 }, { - "epoch": 0.358814352574103, - "grad_norm": 0.9772961653469765, - "learning_rate": 1.1813471502590674e-05, - "loss": 0.2923, - "mean_token_accuracy": 0.9158960580825806, - "num_tokens": 3088104.0, + "epoch": 0.1797752808988764, + "grad_norm": 2.609489028108713, + "learning_rate": 9.499126092343237e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7875787198543549, + "num_tokens": 136457829.0, "step": 115 }, { - "epoch": 0.374414976599064, - "grad_norm": 1.1625665709943187, - "learning_rate": 1.2331606217616581e-05, - "loss": 0.2951, - "mean_token_accuracy": 0.915445065498352, - "num_tokens": 3222890.0, + "epoch": 0.1875915974596971, + "grad_norm": 3.7440429066647734, + "learning_rate": 9.445575012969977e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7853954270482063, + "num_tokens": 142385131.0, "step": 120 }, { - "epoch": 0.39001560062402496, - "grad_norm": 0.9514106687699287, - "learning_rate": 1.2849740932642487e-05, - "loss": 0.2684, - "mean_token_accuracy": 0.9234537720680237, - "num_tokens": 3357508.0, + "epoch": 0.19540791402051783, + "grad_norm": 2.8751982634480044, + "learning_rate": 9.38949146861382e-06, + "loss": 0.663, + "mean_token_accuracy": 0.7872662946581841, + "num_tokens": 148289898.0, "step": 125 }, { - "epoch": 0.40561622464898595, - "grad_norm": 0.9436965806836382, - "learning_rate": 1.3367875647668396e-05, - "loss": 0.2649, - "mean_token_accuracy": 0.9228121876716614, - "num_tokens": 3492123.0, + "epoch": 0.20322423058133854, + "grad_norm": 2.925083855527294, + "learning_rate": 9.33091145646446e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.7910164006054401, + "num_tokens": 154232386.0, "step": 130 }, { - "epoch": 0.42121684867394693, - "grad_norm": 2.9221775609122385, - "learning_rate": 1.3886010362694302e-05, - "loss": 0.2524, - "mean_token_accuracy": 0.9274605810642242, - "num_tokens": 3626221.0, + "epoch": 0.21104054714215925, + "grad_norm": 3.73974121507644, + "learning_rate": 9.26987257606797e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7898711428046227, + "num_tokens": 160150772.0, "step": 135 }, { - "epoch": 0.43681747269890797, - "grad_norm": 0.9970697064309755, - "learning_rate": 1.4404145077720209e-05, - "loss": 0.2565, - "mean_token_accuracy": 0.9262707114219666, - "num_tokens": 3759517.0, + "epoch": 0.21885686370297996, + "grad_norm": 2.074910644810112, + "learning_rate": 9.206414005193539e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.7886676676571369, + "num_tokens": 166088160.0, "step": 140 }, { - "epoch": 0.45241809672386896, - "grad_norm": 1.0665217619628524, - "learning_rate": 1.4922279792746115e-05, - "loss": 0.2446, - "mean_token_accuracy": 0.9297859013080597, - "num_tokens": 3893880.0, + "epoch": 0.22667318026380068, + "grad_norm": 3.765010702267194, + "learning_rate": 9.140576474687263e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7880136586725712, + "num_tokens": 172032929.0, "step": 145 }, { - "epoch": 0.46801872074882994, - "grad_norm": 0.8670182837524361, - "learning_rate": 1.544041450777202e-05, - "loss": 0.2473, - "mean_token_accuracy": 0.9284217417240143, - "num_tokens": 4030353.0, + "epoch": 0.2344894968246214, + "grad_norm": 2.3920249014704944, + "learning_rate": 9.072402242329067e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.7899810753762722, + "num_tokens": 178005406.0, "step": 150 }, { - "epoch": 0.4836193447737909, - "grad_norm": 1.05147852072561, - "learning_rate": 1.595854922279793e-05, - "loss": 0.2405, - "mean_token_accuracy": 0.9299076437950134, - "num_tokens": 4165133.0, + "epoch": 0.2423058133854421, + "grad_norm": 4.535876880687944, + "learning_rate": 9.001935065709569e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.7929855234920978, + "num_tokens": 183927144.0, "step": 155 }, { - "epoch": 0.49921996879875197, - "grad_norm": 0.8517307290696935, - "learning_rate": 1.6476683937823835e-05, - "loss": 0.2338, - "mean_token_accuracy": 0.9323501110076904, - "num_tokens": 4301452.0, + "epoch": 0.2501221299462628, + "grad_norm": 3.44122188439969, + "learning_rate": 8.929220174144304e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7914013616740704, + "num_tokens": 189848723.0, "step": 160 }, { - "epoch": 0.514820592823713, - "grad_norm": 0.8672850209975566, - "learning_rate": 1.6994818652849744e-05, - "loss": 0.2308, - "mean_token_accuracy": 0.9331668317317963, - "num_tokens": 4434235.0, + "epoch": 0.25793844650708353, + "grad_norm": 2.2378659722902188, + "learning_rate": 8.85430423964332e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.792429718375206, + "num_tokens": 195777877.0, "step": 165 }, { - "epoch": 0.5304212168486739, - "grad_norm": 0.7613765456655692, - "learning_rate": 1.751295336787565e-05, - "loss": 0.2237, - "mean_token_accuracy": 0.9341790914535523, - "num_tokens": 4565517.0, + "epoch": 0.26575476306790424, + "grad_norm": 2.982598937419653, + "learning_rate": 8.777235346954753e-06, + "loss": 0.649, + "mean_token_accuracy": 0.790771734714508, + "num_tokens": 201713378.0, "step": 170 }, { - "epoch": 0.5460218408736349, - "grad_norm": 0.7810705098422559, - "learning_rate": 1.8031088082901555e-05, - "loss": 0.2, - "mean_token_accuracy": 0.9416050374507904, - "num_tokens": 4705082.0, + "epoch": 0.27357107962872496, + "grad_norm": 1.9062511611198192, + "learning_rate": 8.698062962701691e-06, + "loss": 0.652, + "mean_token_accuracy": 0.790651909261942, + "num_tokens": 207661581.0, "step": 175 }, { - "epoch": 0.5616224648985959, - "grad_norm": 0.9205061316516083, - "learning_rate": 1.854922279792746e-05, - "loss": 0.2088, - "mean_token_accuracy": 0.9381526112556458, - "num_tokens": 4837136.0, + "epoch": 0.28138739618954567, + "grad_norm": 2.7700238166088877, + "learning_rate": 8.616837903632026e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.7914554052054882, + "num_tokens": 213597723.0, "step": 180 }, { - "epoch": 0.5772230889235569, - "grad_norm": 1.058982637944881, - "learning_rate": 1.9067357512953367e-05, - "loss": 0.2043, - "mean_token_accuracy": 0.9402987122535705, - "num_tokens": 4971044.0, + "epoch": 0.2892037127503664, + "grad_norm": 4.298205320427554, + "learning_rate": 8.533612304001763e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.7872735880315304, + "num_tokens": 219543570.0, "step": 185 }, { - "epoch": 0.592823712948518, - "grad_norm": 0.824897710515413, - "learning_rate": 1.9585492227979276e-05, - "loss": 0.2064, - "mean_token_accuracy": 0.9386222898960114, - "num_tokens": 5106335.0, + "epoch": 0.2970200293111871, + "grad_norm": 2.62809039414661, + "learning_rate": 8.44843958211269e-06, + "loss": 0.646, + "mean_token_accuracy": 0.7901849329471589, + "num_tokens": 225490076.0, "step": 190 }, { - "epoch": 0.608424336973479, - "grad_norm": 0.7483263969132283, - "learning_rate": 1.998845931909983e-05, - "loss": 0.2045, - "mean_token_accuracy": 0.9389774143695832, - "num_tokens": 5242005.0, + "epoch": 0.3048363458720078, + "grad_norm": 2.540505165257454, + "learning_rate": 8.361374406025853e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7908910043537617, + "num_tokens": 231429929.0, "step": 195 }, { - "epoch": 0.62402496099844, - "grad_norm": 0.956493222774255, - "learning_rate": 1.9930755914598962e-05, - "loss": 0.1916, - "mean_token_accuracy": 0.9423897206783295, - "num_tokens": 5376154.0, + "epoch": 0.3126526624328285, + "grad_norm": 1.763996580236139, + "learning_rate": 8.272472658472906e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.7878620237112045, + "num_tokens": 237384555.0, "step": 200 }, { - "epoch": 0.6396255850234009, - "grad_norm": 0.7286666895021545, - "learning_rate": 1.9873052510098098e-05, - "loss": 0.2031, - "mean_token_accuracy": 0.9401489853858948, - "num_tokens": 5509423.0, + "epoch": 0.32046897899364923, + "grad_norm": 1.6233120878672673, + "learning_rate": 8.181791400987807e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.7936271652579308, + "num_tokens": 243313192.0, "step": 205 }, { - "epoch": 0.6552262090483619, - "grad_norm": 0.9135913203051874, - "learning_rate": 1.9815349105597233e-05, - "loss": 0.201, - "mean_token_accuracy": 0.9404536783695221, - "num_tokens": 5642113.0, + "epoch": 0.32828529555446995, + "grad_norm": 1.9405753464802942, + "learning_rate": 8.089388837281915e-06, + "loss": 0.6439, + "mean_token_accuracy": 0.7914594881236553, + "num_tokens": 249220870.0, "step": 210 }, { - "epoch": 0.6708268330733229, - "grad_norm": 0.7481872057084359, - "learning_rate": 1.9757645701096365e-05, - "loss": 0.1952, - "mean_token_accuracy": 0.9418298482894898, - "num_tokens": 5774341.0, + "epoch": 0.33610161211529066, + "grad_norm": 3.488125244229796, + "learning_rate": 7.995324275885961e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.7946518436074257, + "num_tokens": 255166697.0, "step": 215 }, { - "epoch": 0.6864274570982839, - "grad_norm": 0.7237687833871097, - "learning_rate": 1.96999422965955e-05, - "loss": 0.1966, - "mean_token_accuracy": 0.9417529046535492, - "num_tokens": 5908974.0, + "epoch": 0.3439179286761114, + "grad_norm": 8.607586639921637, + "learning_rate": 7.89965809208291e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.7916349656879902, + "num_tokens": 261089161.0, "step": 220 }, { - "epoch": 0.7020280811232449, - "grad_norm": 0.7969480392056931, - "learning_rate": 1.9642238892094636e-05, - "loss": 0.1907, - "mean_token_accuracy": 0.9438920319080353, - "num_tokens": 6045637.0, + "epoch": 0.3517342452369321, + "grad_norm": 1.6660014877602962, + "learning_rate": 7.802451689156122e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.79035182595253, + "num_tokens": 267021428.0, "step": 225 }, { - "epoch": 0.717628705148206, - "grad_norm": 0.7265197616696507, - "learning_rate": 1.958453548759377e-05, - "loss": 0.1978, - "mean_token_accuracy": 0.9413581132888794, - "num_tokens": 6177199.0, + "epoch": 0.3595505617977528, + "grad_norm": 1.6153459083711095, + "learning_rate": 7.70376745897768e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.7924111239612103, + "num_tokens": 272961221.0, "step": 230 }, { - "epoch": 0.733229329173167, - "grad_norm": 0.7170346798055343, - "learning_rate": 1.9526832083092904e-05, - "loss": 0.1867, - "mean_token_accuracy": 0.9436001777648926, - "num_tokens": 6309061.0, + "epoch": 0.3673668783585735, + "grad_norm": 1.7376528456449458, + "learning_rate": 7.6036687419622215e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7931422784924507, + "num_tokens": 278866458.0, "step": 235 }, { - "epoch": 0.748829953198128, - "grad_norm": 0.7850135506356014, - "learning_rate": 1.9469128678592036e-05, - "loss": 0.1861, - "mean_token_accuracy": 0.9450059533119202, - "num_tokens": 6445167.0, + "epoch": 0.3751831949193942, + "grad_norm": 1.7509363427645832, + "learning_rate": 7.5022197864119175e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.7915025249123573, + "num_tokens": 284785499.0, "step": 240 }, { - "epoch": 0.7644305772230889, - "grad_norm": 0.6636614795993062, - "learning_rate": 1.9411425274091175e-05, - "loss": 0.1834, - "mean_token_accuracy": 0.9457758605480194, - "num_tokens": 6581753.0, + "epoch": 0.38299951148021494, + "grad_norm": 2.863849300726178, + "learning_rate": 7.399485707278744e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.7907331958413124, + "num_tokens": 290732280.0, "step": 245 }, { - "epoch": 0.7800312012480499, - "grad_norm": 0.6771924573777538, - "learning_rate": 1.9353721869590307e-05, - "loss": 0.1809, - "mean_token_accuracy": 0.9465215265750885, - "num_tokens": 6719052.0, + "epoch": 0.39081582804103565, + "grad_norm": 1.4846518170955887, + "learning_rate": 7.295532444370485e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.7914663501083851, + "num_tokens": 296654941.0, "step": 250 }, { - "epoch": 0.7956318252730109, - "grad_norm": 0.7411173161974047, - "learning_rate": 1.9296018465089442e-05, - "loss": 0.1777, - "mean_token_accuracy": 0.9468727946281433, - "num_tokens": 6853523.0, + "epoch": 0.39863214460185636, + "grad_norm": 1.7383678766587618, + "learning_rate": 7.190426720027306e-06, + "loss": 0.644, + "mean_token_accuracy": 0.7916645854711533, + "num_tokens": 302605292.0, "step": 255 }, { - "epoch": 0.8112324492979719, - "grad_norm": 0.650242673061882, - "learning_rate": 1.9238315060588578e-05, - "loss": 0.1781, - "mean_token_accuracy": 0.9471937596797944, - "num_tokens": 6988476.0, + "epoch": 0.4064484611626771, + "grad_norm": 2.2690859031654087, + "learning_rate": 7.084235996296068e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.792822826653719, + "num_tokens": 308552365.0, "step": 260 }, { - "epoch": 0.8268330733229329, - "grad_norm": 0.5819540826094594, - "learning_rate": 1.918061165608771e-05, - "loss": 0.1654, - "mean_token_accuracy": 0.9496288418769836, - "num_tokens": 7125030.0, + "epoch": 0.4142647777234978, + "grad_norm": 1.6536907812645472, + "learning_rate": 6.977028431629839e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.7921351306140423, + "num_tokens": 314489877.0, "step": 265 }, { - "epoch": 0.8424336973478939, - "grad_norm": 0.6403317597999031, - "learning_rate": 1.9122908251586845e-05, - "loss": 0.1666, - "mean_token_accuracy": 0.9501708805561065, - "num_tokens": 7257681.0, + "epoch": 0.4220810942843185, + "grad_norm": 1.6417211396491032, + "learning_rate": 6.86887283714044e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.7937514387071133, + "num_tokens": 320414279.0, "step": 270 }, { - "epoch": 0.858034321372855, - "grad_norm": 0.6418778871794507, - "learning_rate": 1.906520484708598e-05, - "loss": 0.1767, - "mean_token_accuracy": 0.9471412360668182, - "num_tokens": 7387859.0, + "epoch": 0.4298974108451392, + "grad_norm": 2.1351664741224963, + "learning_rate": 6.7598386324320745e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.7947640925645828, + "num_tokens": 326349818.0, "step": 275 }, { - "epoch": 0.8736349453978159, - "grad_norm": 0.622540720290129, - "learning_rate": 1.9007501442585113e-05, - "loss": 0.1694, - "mean_token_accuracy": 0.9499199390411377, - "num_tokens": 7525714.0, + "epoch": 0.43771372740595993, + "grad_norm": 1.4007468087271036, + "learning_rate": 6.649995801044391e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.7926677256822586, + "num_tokens": 332276019.0, "step": 280 }, { - "epoch": 0.8892355694227769, - "grad_norm": 0.583668566012697, - "learning_rate": 1.894979803808425e-05, - "loss": 0.1666, - "mean_token_accuracy": 0.9499054729938508, - "num_tokens": 7662117.0, + "epoch": 0.44553004396678064, + "grad_norm": 1.6121431265331962, + "learning_rate": 6.539414845533596e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.7921099595725536, + "num_tokens": 338209339.0, "step": 285 }, { - "epoch": 0.9048361934477379, - "grad_norm": 0.5710724995564919, - "learning_rate": 1.889209463358338e-05, - "loss": 0.1589, - "mean_token_accuracy": 0.9527071595191956, - "num_tokens": 7798972.0, + "epoch": 0.45334636052760136, + "grad_norm": 2.031155969828718, + "learning_rate": 6.428166742220423e-06, + "loss": 0.625, + "mean_token_accuracy": 0.7955845050513745, + "num_tokens": 344137484.0, "step": 290 }, { - "epoch": 0.9204368174726989, - "grad_norm": 0.5746752867779363, - "learning_rate": 1.883439122908252e-05, - "loss": 0.1686, - "mean_token_accuracy": 0.9504578649997711, - "num_tokens": 7935314.0, + "epoch": 0.46116267708842207, + "grad_norm": 1.4450498950991713, + "learning_rate": 6.316322895634029e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7931911982595921, + "num_tokens": 350078203.0, "step": 295 }, { - "epoch": 0.9360374414976599, - "grad_norm": 0.603982996580727, - "learning_rate": 1.877668782458165e-05, - "loss": 0.1629, - "mean_token_accuracy": 0.9501371085643768, - "num_tokens": 8066117.0, + "epoch": 0.4689789936492428, + "grad_norm": 1.9913252644164074, + "learning_rate": 6.20395509268104e-06, + "loss": 0.6214, + "mean_token_accuracy": 0.7970752798020839, + "num_tokens": 356025763.0, "step": 300 }, { - "epoch": 0.9516380655226209, - "grad_norm": 0.5922487470312926, - "learning_rate": 1.8718984420080787e-05, - "loss": 0.1648, - "mean_token_accuracy": 0.9501347541809082, - "num_tokens": 8201824.0, + "epoch": 0.4767953102100635, + "grad_norm": 2.8315822034325837, + "learning_rate": 6.0911354565691594e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.796255373954773, + "num_tokens": 361992798.0, "step": 305 }, { - "epoch": 0.9672386895475819, - "grad_norm": 0.5388743066315098, - "learning_rate": 1.8661281015579923e-05, - "loss": 0.1645, - "mean_token_accuracy": 0.9501584231853485, - "num_tokens": 8337103.0, + "epoch": 0.4846116267708842, + "grad_norm": 1.8374439736436459, + "learning_rate": 5.977936400514943e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.7953431971371174, + "num_tokens": 367913461.0, "step": 310 }, { - "epoch": 0.982839313572543, - "grad_norm": 0.6436638863952264, - "learning_rate": 1.8603577611079055e-05, - "loss": 0.1701, - "mean_token_accuracy": 0.9488860845565796, - "num_tokens": 8469896.0, + "epoch": 0.4924279433317049, + "grad_norm": 2.3033335141428, + "learning_rate": 5.864430581265406e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.7944584995508194, + "num_tokens": 373852019.0, "step": 315 }, { - "epoch": 0.9984399375975039, - "grad_norm": 0.6288476584333543, - "learning_rate": 1.854587420657819e-05, - "loss": 0.1632, - "mean_token_accuracy": 0.9505932807922364, - "num_tokens": 8606247.0, + "epoch": 0.5002442598925256, + "grad_norm": 1.6907176529892718, + "learning_rate": 5.750690852463339e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.7937369205057621, + "num_tokens": 379764522.0, "step": 320 }, { - "epoch": 1.0124804992199687, - "grad_norm": 0.5165837525539804, - "learning_rate": 1.8488170802077322e-05, - "loss": 0.1253, - "mean_token_accuracy": 0.956979387336307, - "num_tokens": 8725614.0, + "epoch": 0.5080605764533463, + "grad_norm": 1.818966581747677, + "learning_rate": 5.636790217886243e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.7948664158582688, + "num_tokens": 385692482.0, "step": 325 }, { - "epoch": 1.0280811232449298, - "grad_norm": 0.5586235874833988, - "learning_rate": 1.8430467397576458e-05, - "loss": 0.1385, - "mean_token_accuracy": 0.95757777094841, - "num_tokens": 8860633.0, + "epoch": 0.5158768930141671, + "grad_norm": 1.6206125873755466, + "learning_rate": 5.522801784588895e-06, + "loss": 0.631, + "mean_token_accuracy": 0.7939370617270469, + "num_tokens": 391635856.0, "step": 330 }, { - "epoch": 1.0436817472698907, - "grad_norm": 0.5360739832407388, - "learning_rate": 1.8372763993075593e-05, - "loss": 0.139, - "mean_token_accuracy": 0.9575653314590454, - "num_tokens": 8993751.0, + "epoch": 0.5236932095749878, + "grad_norm": 3.3420400728114945, + "learning_rate": 5.408798715979626e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.7946567349135876, + "num_tokens": 397545573.0, "step": 335 }, { - "epoch": 1.0592823712948518, - "grad_norm": 0.586330833103125, - "learning_rate": 1.8315060588574725e-05, - "loss": 0.132, - "mean_token_accuracy": 0.959426885843277, - "num_tokens": 9129958.0, + "epoch": 0.5315095261358085, + "grad_norm": 4.102411753363632, + "learning_rate": 5.294854184860437e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.7956276901066304, + "num_tokens": 403475982.0, "step": 340 }, { - "epoch": 1.074882995319813, - "grad_norm": 0.5465937446033756, - "learning_rate": 1.8257357184073864e-05, - "loss": 0.1347, - "mean_token_accuracy": 0.9579224228858948, - "num_tokens": 9261996.0, + "epoch": 0.5393258426966292, + "grad_norm": 3.1767409652573853, + "learning_rate": 5.1810413264610724e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.7952337145805359, + "num_tokens": 409418002.0, "step": 345 }, { - "epoch": 1.0904836193447738, - "grad_norm": 0.6247134416034062, - "learning_rate": 1.8199653779572996e-05, - "loss": 0.1342, - "mean_token_accuracy": 0.9582812249660492, - "num_tokens": 9397095.0, + "epoch": 0.5471421592574499, + "grad_norm": 3.166395119010415, + "learning_rate": 5.067433191497221e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.7954030476510525, + "num_tokens": 415344226.0, "step": 350 }, { - "epoch": 1.1060842433697349, - "grad_norm": 0.6041369695098568, - "learning_rate": 1.8141950375072132e-05, - "loss": 0.1393, - "mean_token_accuracy": 0.9570248365402222, - "num_tokens": 9529942.0, + "epoch": 0.5549584758182706, + "grad_norm": 1.526943049343455, + "learning_rate": 4.954102699282953e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7941608227789402, + "num_tokens": 421277466.0, "step": 355 }, { - "epoch": 1.1216848673946958, - "grad_norm": 0.6320621020426849, - "learning_rate": 1.8084246970571264e-05, - "loss": 0.1372, - "mean_token_accuracy": 0.9566760540008545, - "num_tokens": 9661196.0, + "epoch": 0.5627747923790913, + "grad_norm": 1.9099044626436186, + "learning_rate": 4.841122590927511e-06, + "loss": 0.618, + "mean_token_accuracy": 0.7975563704967499, + "num_tokens": 427241607.0, "step": 360 }, { - "epoch": 1.1372854914196568, - "grad_norm": 0.55603630239386, - "learning_rate": 1.80265435660704e-05, - "loss": 0.135, - "mean_token_accuracy": 0.9582180440425873, - "num_tokens": 9795761.0, + "epoch": 0.570591108939912, + "grad_norm": 2.5623011943198137, + "learning_rate": 4.7285653826464605e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.7963444076478481, + "num_tokens": 433157588.0, "step": 365 }, { - "epoch": 1.1528861154446177, - "grad_norm": 0.5409365481066034, - "learning_rate": 1.7968840161569535e-05, - "loss": 0.1348, - "mean_token_accuracy": 0.9582259178161621, - "num_tokens": 9927945.0, + "epoch": 0.5784074255007328, + "grad_norm": 1.6826723465337794, + "learning_rate": 4.616503319217202e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.7979116909205913, + "num_tokens": 439093218.0, "step": 370 }, { - "epoch": 1.1684867394695788, - "grad_norm": 0.5119811661411944, - "learning_rate": 1.7911136757068667e-05, - "loss": 0.1348, - "mean_token_accuracy": 0.9586300194263458, - "num_tokens": 10063699.0, + "epoch": 0.5862237420615535, + "grad_norm": 2.4233422176625905, + "learning_rate": 4.5050083276087155e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7943588711321354, + "num_tokens": 445010423.0, "step": 375 }, { - "epoch": 1.1840873634945397, - "grad_norm": 0.5525476773773188, - "learning_rate": 1.7853433352567802e-05, - "loss": 0.1326, - "mean_token_accuracy": 0.9586183845996856, - "num_tokens": 10197789.0, + "epoch": 0.5940400586223742, + "grad_norm": 1.8195780340101564, + "learning_rate": 4.394151970815259e-06, + "loss": 0.613, + "mean_token_accuracy": 0.799777788668871, + "num_tokens": 450918292.0, "step": 380 }, { - "epoch": 1.1996879875195008, - "grad_norm": 0.5483837555676431, - "learning_rate": 1.7795729948066938e-05, - "loss": 0.1351, - "mean_token_accuracy": 0.9586268842220307, - "num_tokens": 10331682.0, + "epoch": 0.6018563751831949, + "grad_norm": 1.8194352528770072, + "learning_rate": 4.284005401923723e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.7965258292853832, + "num_tokens": 456832151.0, "step": 385 }, { - "epoch": 1.2152886115444619, - "grad_norm": 0.5433839857021544, - "learning_rate": 1.773802654356607e-05, - "loss": 0.1285, - "mean_token_accuracy": 0.9602697193622589, - "num_tokens": 10468253.0, + "epoch": 0.6096726917440156, + "grad_norm": 1.7307705633009496, + "learning_rate": 4.174639318444044e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.7983451545238495, + "num_tokens": 462764585.0, "step": 390 }, { - "epoch": 1.2308892355694228, - "grad_norm": 0.5051500128900668, - "learning_rate": 1.7680323139065206e-05, - "loss": 0.1281, - "mean_token_accuracy": 0.9597992599010468, - "num_tokens": 10605460.0, + "epoch": 0.6174890083048363, + "grad_norm": 1.5492850018471778, + "learning_rate": 4.066123916932069e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.7965681925415993, + "num_tokens": 468701502.0, "step": 395 }, { - "epoch": 1.2464898595943839, - "grad_norm": 0.5987143217070542, - "learning_rate": 1.762261973456434e-05, - "loss": 0.1347, - "mean_token_accuracy": 0.9584230363368988, - "num_tokens": 10740021.0, + "epoch": 0.625305324865657, + "grad_norm": 1.5785740475386758, + "learning_rate": 3.95852884793392e-06, + "loss": 0.639, + "mean_token_accuracy": 0.7930570214986801, + "num_tokens": 474663601.0, "step": 400 }, { - "epoch": 1.2620904836193447, - "grad_norm": 0.5700082158465958, - "learning_rate": 1.7564916330063477e-05, - "loss": 0.1278, - "mean_token_accuracy": 0.9604202508926392, - "num_tokens": 10876584.0, + "epoch": 0.6331216414264778, + "grad_norm": 1.7178497128631158, + "learning_rate": 3.851923171280848e-06, + "loss": 0.631, + "mean_token_accuracy": 0.7956325292587281, + "num_tokens": 480597092.0, "step": 405 }, { - "epoch": 1.2776911076443058, - "grad_norm": 0.5881496450863309, - "learning_rate": 1.750721292556261e-05, - "loss": 0.1235, - "mean_token_accuracy": 0.9618001461029053, - "num_tokens": 11014040.0, + "epoch": 0.6409379579872985, + "grad_norm": 2.2284125586269634, + "learning_rate": 3.7463753117632086e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.7979160696268082, + "num_tokens": 486517715.0, "step": 410 }, { - "epoch": 1.2932917316692667, - "grad_norm": 0.5581301413309461, - "learning_rate": 1.7449509521061744e-05, - "loss": 0.126, - "mean_token_accuracy": 0.9612515866756439, - "num_tokens": 11147226.0, + "epoch": 0.6487542745481192, + "grad_norm": 1.6485020103488872, + "learning_rate": 3.6419530152120585e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.7989446625113488, + "num_tokens": 492472305.0, "step": 415 }, { - "epoch": 1.3088923556942278, - "grad_norm": 0.5414259556709807, - "learning_rate": 1.739180611656088e-05, - "loss": 0.1236, - "mean_token_accuracy": 0.9617384076118469, - "num_tokens": 11281741.0, + "epoch": 0.6565705911089399, + "grad_norm": 1.6583017170770122, + "learning_rate": 3.5387233050165305e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.7981764920055866, + "num_tokens": 498385685.0, "step": 420 }, { - "epoch": 1.3244929797191887, - "grad_norm": 0.5836584507491059, - "learning_rate": 1.7334102712060012e-05, - "loss": 0.1258, - "mean_token_accuracy": 0.9612742185592651, - "num_tokens": 11415198.0, + "epoch": 0.6643869076697606, + "grad_norm": 1.7340707542937976, + "learning_rate": 3.436752439104914e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.7975495472550392, + "num_tokens": 504307250.0, "step": 425 }, { - "epoch": 1.3400936037441498, - "grad_norm": 0.5465194574173875, - "learning_rate": 1.7276399307559147e-05, - "loss": 0.1276, - "mean_token_accuracy": 0.9597448110580444, - "num_tokens": 11547705.0, + "epoch": 0.6722032242305813, + "grad_norm": 1.7732002119177575, + "learning_rate": 3.336105867417036e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.7990594677627086, + "num_tokens": 510245141.0, "step": 430 }, { - "epoch": 1.3556942277691109, - "grad_norm": 0.5683225699257467, - "learning_rate": 1.7218695903058283e-05, - "loss": 0.1268, - "mean_token_accuracy": 0.9613869369029999, - "num_tokens": 11685203.0, + "epoch": 0.680019540791402, + "grad_norm": 1.7788709835674736, + "learning_rate": 3.236848189895271e-06, + "loss": 0.6221, + "mean_token_accuracy": 0.7987750940024853, + "num_tokens": 516171739.0, "step": 435 }, { - "epoch": 1.3712948517940717, - "grad_norm": 0.5752321684685067, - "learning_rate": 1.7160992498557415e-05, - "loss": 0.1292, - "mean_token_accuracy": 0.9604803204536438, - "num_tokens": 11823197.0, + "epoch": 0.6878358573522227, + "grad_norm": 1.8951328728941093, + "learning_rate": 3.1390431150210858e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.7972325548529625, + "num_tokens": 522101344.0, "step": 440 }, { - "epoch": 1.3868954758190328, - "grad_norm": 0.5388447370973155, - "learning_rate": 1.710328909405655e-05, - "loss": 0.128, - "mean_token_accuracy": 0.9602981925010681, - "num_tokens": 11956118.0, + "epoch": 0.6956521739130435, + "grad_norm": 2.265125941545771, + "learning_rate": 3.0427534189238056e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.797095137834549, + "num_tokens": 528042612.0, "step": 445 }, { - "epoch": 1.4024960998439937, - "grad_norm": 0.5444320848079807, - "learning_rate": 1.7045585689555686e-05, - "loss": 0.1222, - "mean_token_accuracy": 0.962395453453064, - "num_tokens": 12090445.0, + "epoch": 0.7034684904738642, + "grad_norm": 1.895267259607391, + "learning_rate": 2.9480409050877836e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.7996291488409042, + "num_tokens": 533972183.0, "step": 450 }, { - "epoch": 1.4180967238689548, - "grad_norm": 0.554566606109682, - "learning_rate": 1.698788228505482e-05, - "loss": 0.1234, - "mean_token_accuracy": 0.9616039395332336, - "num_tokens": 12217939.0, + "epoch": 0.7112848070346849, + "grad_norm": 2.1073222786880508, + "learning_rate": 2.854966364683872e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8017579860985279, + "num_tokens": 539882836.0, "step": 455 }, { - "epoch": 1.4336973478939157, - "grad_norm": 0.6070727464807655, - "learning_rate": 1.6930178880553953e-05, - "loss": 0.1284, - "mean_token_accuracy": 0.9608745992183685, - "num_tokens": 12350482.0, + "epoch": 0.7191011235955056, + "grad_norm": 1.586483401475392, + "learning_rate": 2.7635895375506516e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.79697345495224, + "num_tokens": 545834579.0, "step": 460 }, { - "epoch": 1.4492979719188768, - "grad_norm": 0.5523782519510332, - "learning_rate": 1.687247547605309e-05, - "loss": 0.1167, - "mean_token_accuracy": 0.9636275410652161, - "num_tokens": 12485977.0, + "epoch": 0.7269174401563263, + "grad_norm": 1.678175813746661, + "learning_rate": 2.6739690738504428e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.7964953184127808, + "num_tokens": 551762612.0, "step": 465 }, { - "epoch": 1.4648985959438376, - "grad_norm": 0.577639640008196, - "learning_rate": 1.6814772071552224e-05, - "loss": 0.1304, - "mean_token_accuracy": 0.9609861671924591, - "num_tokens": 12617413.0, + "epoch": 0.734733756717147, + "grad_norm": 4.728635128027955, + "learning_rate": 2.5861624964247402e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.7991872586309909, + "num_tokens": 557733732.0, "step": 470 }, { - "epoch": 1.4804992199687987, - "grad_norm": 0.5602199179222196, - "learning_rate": 1.6757068667051356e-05, - "loss": 0.1216, - "mean_token_accuracy": 0.9624531030654907, - "num_tokens": 12750714.0, + "epoch": 0.7425500732779677, + "grad_norm": 3.1433754367730327, + "learning_rate": 2.5002261638732066e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.796341958642006, + "num_tokens": 563678120.0, "step": 475 }, { - "epoch": 1.4960998439937598, - "grad_norm": 0.4967916021381074, - "learning_rate": 1.6699365262550492e-05, - "loss": 0.1187, - "mean_token_accuracy": 0.9640235006809235, - "num_tokens": 12887826.0, + "epoch": 0.7503663898387885, + "grad_norm": 1.5793999144249435, + "learning_rate": 2.416215234379941e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.7986149400472641, + "num_tokens": 569641651.0, "step": 480 }, { - "epoch": 1.5117004680187207, - "grad_norm": 0.5435358963770943, - "learning_rate": 1.6641661858049627e-05, - "loss": 0.119, - "mean_token_accuracy": 0.9637284338474273, - "num_tokens": 13023445.0, + "epoch": 0.7581827063996092, + "grad_norm": 1.5637633654884855, + "learning_rate": 2.3341836303102336e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.7989203184843063, + "num_tokens": 575596629.0, "step": 485 }, { - "epoch": 1.5273010920436816, - "grad_norm": 0.6202411461407195, - "learning_rate": 1.658395845354876e-05, - "loss": 0.1239, - "mean_token_accuracy": 0.9621273696422576, - "num_tokens": 13156136.0, + "epoch": 0.7659990229604299, + "grad_norm": 1.7578654658273658, + "learning_rate": 2.2541840036005227e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.8000254578888416, + "num_tokens": 581506425.0, "step": 490 }, { - "epoch": 1.5429017160686427, - "grad_norm": 0.5860990715234197, - "learning_rate": 1.6526255049047895e-05, - "loss": 0.114, - "mean_token_accuracy": 0.9654937386512756, - "num_tokens": 13295904.0, + "epoch": 0.7738153395212506, + "grad_norm": 1.6111848358937764, + "learning_rate": 2.1762677019637836e-06, + "loss": 0.615, + "mean_token_accuracy": 0.7983125224709511, + "num_tokens": 587441928.0, "step": 495 }, { - "epoch": 1.5585023400936038, - "grad_norm": 0.5650400891346256, - "learning_rate": 1.646855164454703e-05, - "loss": 0.1156, - "mean_token_accuracy": 0.9648711144924164, - "num_tokens": 13430554.0, + "epoch": 0.7816316560820713, + "grad_norm": 2.729056351863656, + "learning_rate": 2.100484735932027e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.7974658064544201, + "num_tokens": 593387525.0, "step": 500 }, { - "epoch": 1.5741029641185649, - "grad_norm": 0.5781630886222578, - "learning_rate": 1.6410848240046166e-05, - "loss": 0.1197, - "mean_token_accuracy": 0.9637046754360199, - "num_tokens": 13565119.0, + "epoch": 0.789447972642892, + "grad_norm": 2.0026132736871256, + "learning_rate": 2.0268837467570714e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.7999734558165074, + "num_tokens": 599357871.0, "step": 505 }, { - "epoch": 1.5897035881435257, - "grad_norm": 0.5432256680693549, - "learning_rate": 1.6353144835545298e-05, - "loss": 0.1234, - "mean_token_accuracy": 0.9622001588344574, - "num_tokens": 13699623.0, + "epoch": 0.7972642892037127, + "grad_norm": 3.202086545883362, + "learning_rate": 1.955511975190185e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.7985524848103523, + "num_tokens": 605266801.0, "step": 510 }, { - "epoch": 1.6053042121684866, - "grad_norm": 0.6409787925018047, - "learning_rate": 1.6295441431044434e-05, - "loss": 0.1244, - "mean_token_accuracy": 0.9616802036762238, - "num_tokens": 13829628.0, + "epoch": 0.8050806057645334, + "grad_norm": 2.5845078387849876, + "learning_rate": 1.8864152311606342e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.7977212890982628, + "num_tokens": 611200207.0, "step": 515 }, { - "epoch": 1.6209048361934477, - "grad_norm": 0.6383227582961417, - "learning_rate": 1.623773802654357e-05, - "loss": 0.1238, - "mean_token_accuracy": 0.9625266194343567, - "num_tokens": 13961224.0, + "epoch": 0.8128969223253542, + "grad_norm": 1.3459195379746025, + "learning_rate": 1.8196378643726092e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7991946995258331, + "num_tokens": 617152907.0, "step": 520 }, { - "epoch": 1.6365054602184088, - "grad_norm": 0.6314161515841656, - "learning_rate": 1.61800346220427e-05, - "loss": 0.1121, - "mean_token_accuracy": 0.9656075298786163, - "num_tokens": 14098377.0, + "epoch": 0.8207132388861749, + "grad_norm": 1.6903121443946976, + "learning_rate": 1.7552227358393933e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.7996014229953289, + "num_tokens": 623085161.0, "step": 525 }, { - "epoch": 1.6521060842433697, - "grad_norm": 0.614045523172391, - "learning_rate": 1.6122331217541837e-05, - "loss": 0.1196, - "mean_token_accuracy": 0.963841724395752, - "num_tokens": 14235157.0, + "epoch": 0.8285295554469956, + "grad_norm": 5.000507187078701, + "learning_rate": 1.6932111903730453e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.7996949210762978, + "num_tokens": 629027147.0, "step": 530 }, { - "epoch": 1.6677067082683308, - "grad_norm": 0.5872146814971247, - "learning_rate": 1.6064627813040972e-05, - "loss": 0.1133, - "mean_token_accuracy": 0.9656576633453369, - "num_tokens": 14369585.0, + "epoch": 0.8363458720078163, + "grad_norm": 2.026922156270765, + "learning_rate": 1.6336430300472606e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.800903269648552, + "num_tokens": 634987448.0, "step": 535 }, { - "epoch": 1.6833073322932917, - "grad_norm": 0.5744613415474724, - "learning_rate": 1.6006924408540104e-05, - "loss": 0.1073, - "mean_token_accuracy": 0.9668791234493256, - "num_tokens": 14506057.0, + "epoch": 0.844162188568637, + "grad_norm": 4.687498745163573, + "learning_rate": 1.576556488650428e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.7995632983744144, + "num_tokens": 640920908.0, "step": 540 }, { - "epoch": 1.6989079563182528, - "grad_norm": 0.5973743143104617, - "learning_rate": 1.594922100403924e-05, - "loss": 0.1083, - "mean_token_accuracy": 0.9668962955474854, - "num_tokens": 14642411.0, + "epoch": 0.8519785051294577, + "grad_norm": 1.7305252046419992, + "learning_rate": 1.5219882071452967e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8014576397836208, + "num_tokens": 646854354.0, "step": 545 }, { - "epoch": 1.7145085803432139, - "grad_norm": 0.6176806575167721, - "learning_rate": 1.5891517599538372e-05, - "loss": 0.1071, - "mean_token_accuracy": 0.9674273788928985, - "num_tokens": 14774203.0, + "epoch": 0.8597948216902784, + "grad_norm": 1.5220809585740145, + "learning_rate": 1.4699732101510026e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.7980151705443859, + "num_tokens": 652785154.0, "step": 550 }, { - "epoch": 1.7301092043681747, - "grad_norm": 0.5649249307146187, - "learning_rate": 1.583381419503751e-05, - "loss": 0.1099, - "mean_token_accuracy": 0.9660561561584473, - "num_tokens": 14909209.0, + "epoch": 0.8676111382510991, + "grad_norm": 2.3863927864336896, + "learning_rate": 1.4205448834625275e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.7987048149108886, + "num_tokens": 658699371.0, "step": 555 }, { - "epoch": 1.7457098283931356, - "grad_norm": 0.5450399044206087, - "learning_rate": 1.5776110790536643e-05, - "loss": 0.1077, - "mean_token_accuracy": 0.9665981948375701, - "num_tokens": 15041381.0, + "epoch": 0.8754274548119199, + "grad_norm": 4.490834509368819, + "learning_rate": 1.37373495262205e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.7986162424087524, + "num_tokens": 664648725.0, "step": 560 }, { - "epoch": 1.7613104524180967, - "grad_norm": 0.550655637358523, - "learning_rate": 1.5718407386035778e-05, - "loss": 0.104, - "mean_token_accuracy": 0.9683968663215637, - "num_tokens": 15178472.0, + "epoch": 0.8832437713727406, + "grad_norm": 1.8330616045846946, + "learning_rate": 1.3295734625559315e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.798362709581852, + "num_tokens": 670559073.0, "step": 565 }, { - "epoch": 1.7769110764430578, - "grad_norm": 0.5468003600764055, - "learning_rate": 1.5660703981534914e-05, - "loss": 0.105, - "mean_token_accuracy": 0.9676320672035217, - "num_tokens": 15314309.0, + "epoch": 0.8910600879335613, + "grad_norm": 1.8866171760203552, + "learning_rate": 1.2880887582903884e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.7984154649078846, + "num_tokens": 676506420.0, "step": 570 }, { - "epoch": 1.7925117004680189, - "grad_norm": 0.662660268609156, - "learning_rate": 1.5603000577034046e-05, - "loss": 0.1081, - "mean_token_accuracy": 0.9670586466789246, - "num_tokens": 15447427.0, + "epoch": 0.898876404494382, + "grad_norm": 1.7452455628625234, + "learning_rate": 1.2493074667582584e-06, + "loss": 0.621, + "mean_token_accuracy": 0.7975537806749344, + "num_tokens": 682448693.0, "step": 575 }, { - "epoch": 1.8081123244929798, - "grad_norm": 0.5514391464827504, - "learning_rate": 1.554529717253318e-05, - "loss": 0.1095, - "mean_token_accuracy": 0.9667521238327026, - "num_tokens": 15585445.0, + "epoch": 0.9066927210552027, + "grad_norm": 2.030867984009538, + "learning_rate": 1.213254479708519e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.7988204933702946, + "num_tokens": 688390022.0, "step": 580 }, { - "epoch": 1.8237129485179406, - "grad_norm": 0.5981613351314938, - "learning_rate": 1.5487593768032313e-05, - "loss": 0.1038, - "mean_token_accuracy": 0.968332713842392, - "num_tokens": 15719904.0, + "epoch": 0.9145090376160234, + "grad_norm": 1.550865659075602, + "learning_rate": 1.179952937729534e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.7983230344951153, + "num_tokens": 694331611.0, "step": 585 }, { - "epoch": 1.8393135725429017, - "grad_norm": 0.5481985576397634, - "learning_rate": 1.542989036353145e-05, - "loss": 0.1068, - "mean_token_accuracy": 0.9675829410552979, - "num_tokens": 15854787.0, + "epoch": 0.9223253541768441, + "grad_norm": 1.571410028230591, + "learning_rate": 1.149424215396281e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.7990704528987408, + "num_tokens": 700261331.0, "step": 590 }, { - "epoch": 1.8549141965678628, - "grad_norm": 0.5826788610080513, - "learning_rate": 1.5372186959030584e-05, - "loss": 0.1027, - "mean_token_accuracy": 0.9690861344337464, - "num_tokens": 15991840.0, + "epoch": 0.9301416707376648, + "grad_norm": 2.163421002394794, + "learning_rate": 1.1216879075510877e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7987876988947391, + "num_tokens": 706193693.0, "step": 595 }, { - "epoch": 1.8705148205928237, - "grad_norm": 0.6031982698144975, - "learning_rate": 1.5314483554529717e-05, - "loss": 0.1048, - "mean_token_accuracy": 0.9682729780673981, - "num_tokens": 16126187.0, + "epoch": 0.9379579872984856, + "grad_norm": 3.7294851958982975, + "learning_rate": 1.0967618167267032e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8031502008438111, + "num_tokens": 712120509.0, "step": 600 }, { - "epoch": 1.8861154446177846, - "grad_norm": 0.644822605733718, - "learning_rate": 1.5256780150028854e-05, - "loss": 0.1034, - "mean_token_accuracy": 0.9689009010791778, - "num_tokens": 16260005.0, + "epoch": 0.9457743038593063, + "grad_norm": 1.937019840071183, + "learning_rate": 1.0746619417197436e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8023772671818733, + "num_tokens": 718044202.0, "step": 605 }, { - "epoch": 1.9017160686427457, - "grad_norm": 0.6292923852496343, - "learning_rate": 1.5199076745527988e-05, - "loss": 0.1027, - "mean_token_accuracy": 0.9685427904129028, - "num_tokens": 16392826.0, + "epoch": 0.953590620420127, + "grad_norm": 1.822048015248325, + "learning_rate": 1.0554024673218808e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8010022938251495, + "num_tokens": 723993614.0, "step": 610 }, { - "epoch": 1.9173166926677068, - "grad_norm": 0.5905364025571268, - "learning_rate": 1.5141373341027121e-05, - "loss": 0.1023, - "mean_token_accuracy": 0.969213730096817, - "num_tokens": 16530546.0, + "epoch": 0.9614069369809477, + "grad_norm": 1.6712441013840658, + "learning_rate": 1.0389957552153385e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8014584824442863, + "num_tokens": 729959466.0, "step": 615 }, { - "epoch": 1.9329173166926679, - "grad_norm": 0.6356617111003939, - "learning_rate": 1.5083669936526257e-05, - "loss": 0.1016, - "mean_token_accuracy": 0.96930211186409, - "num_tokens": 16664985.0, + "epoch": 0.9692232535417684, + "grad_norm": 3.2548443190376073, + "learning_rate": 1.0254523360385555e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.7999479919672012, + "num_tokens": 735885753.0, "step": 620 }, { - "epoch": 1.9485179407176287, - "grad_norm": 0.6071607901539858, - "learning_rate": 1.502596653202539e-05, - "loss": 0.1021, - "mean_token_accuracy": 0.9688980400562286, - "num_tokens": 16796878.0, + "epoch": 0.9770395701025891, + "grad_norm": 7.139737923902445, + "learning_rate": 1.0147809026271017e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.7991539172828197, + "num_tokens": 741813482.0, "step": 625 }, { - "epoch": 1.9641185647425896, - "grad_norm": 0.5064988276006431, - "learning_rate": 1.4968263127524524e-05, - "loss": 0.0957, - "mean_token_accuracy": 0.9704132974147797, - "num_tokens": 16931288.0, + "epoch": 0.9848558866634098, + "grad_norm": 1.7874591260763464, + "learning_rate": 1.0069883044341846e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.7999210134148598, + "num_tokens": 747741960.0, "step": 630 }, { - "epoch": 1.9797191887675507, - "grad_norm": 0.5537435426659689, - "learning_rate": 1.4910559723023658e-05, - "loss": 0.1087, - "mean_token_accuracy": 0.9669051706790924, - "num_tokens": 17064859.0, + "epoch": 0.9926722032242306, + "grad_norm": 3.2850538471833945, + "learning_rate": 1.0020795431343349e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.8015001997351646, + "num_tokens": 753638158.0, "step": 635 }, { - "epoch": 1.9953198127925118, - "grad_norm": 0.5997048037627525, - "learning_rate": 1.4852856318522795e-05, - "loss": 0.0947, - "mean_token_accuracy": 0.9707705557346344, - "num_tokens": 17199056.0, + "epoch": 1.0, + "grad_norm": 2.3599969643790777, + "learning_rate": 1.0000577694130827e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8004830511411031, + "num_tokens": 759191155.0, "step": 640 }, { - "epoch": 2.0093603744149764, - "grad_norm": 0.45765040060824713, - "learning_rate": 1.479515291402193e-05, - "loss": 0.0677, - "mean_token_accuracy": 0.9769907527499728, - "num_tokens": 17320735.0, - "step": 645 - }, - { - "epoch": 2.0249609984399375, - "grad_norm": 0.6379753427562452, - "learning_rate": 1.4737449509521063e-05, - "loss": 0.0697, - "mean_token_accuracy": 0.9782340943813324, - "num_tokens": 17456135.0, - "step": 650 - }, - { - "epoch": 2.0405616224648986, - "grad_norm": 0.46594436704934616, - "learning_rate": 1.4679746105020198e-05, - "loss": 0.0674, - "mean_token_accuracy": 0.9790053486824035, - "num_tokens": 17591486.0, - "step": 655 - }, - { - "epoch": 2.0561622464898597, - "grad_norm": 0.6276578458248949, - "learning_rate": 1.4622042700519332e-05, - "loss": 0.0686, - "mean_token_accuracy": 0.9787347793579102, - "num_tokens": 17726558.0, - "step": 660 - }, - { - "epoch": 2.0717628705148208, - "grad_norm": 0.5062901549101052, - "learning_rate": 1.4564339296018466e-05, - "loss": 0.0663, - "mean_token_accuracy": 0.979675030708313, - "num_tokens": 17858362.0, - "step": 665 - }, - { - "epoch": 2.0873634945397814, - "grad_norm": 0.5848008860177072, - "learning_rate": 1.45066358915176e-05, - "loss": 0.0676, - "mean_token_accuracy": 0.9788794219493866, - "num_tokens": 17992249.0, - "step": 670 - }, - { - "epoch": 2.1029641185647425, - "grad_norm": 0.5586217868546769, - "learning_rate": 1.4448932487016735e-05, - "loss": 0.066, - "mean_token_accuracy": 0.979490089416504, - "num_tokens": 18126932.0, - "step": 675 - }, - { - "epoch": 2.1185647425897036, - "grad_norm": 0.5456969706238991, - "learning_rate": 1.4391229082515869e-05, - "loss": 0.068, - "mean_token_accuracy": 0.9786436498165131, - "num_tokens": 18261674.0, - "step": 680 - }, - { - "epoch": 2.1341653666146647, - "grad_norm": 0.547513915986846, - "learning_rate": 1.4333525678015003e-05, - "loss": 0.0654, - "mean_token_accuracy": 0.9795780777931213, - "num_tokens": 18395303.0, - "step": 685 - }, - { - "epoch": 2.149765990639626, - "grad_norm": 0.5280062621107973, - "learning_rate": 1.427582227351414e-05, - "loss": 0.0646, - "mean_token_accuracy": 0.9799421310424805, - "num_tokens": 18531153.0, - "step": 690 - }, - { - "epoch": 2.1653666146645865, - "grad_norm": 0.4754449961678256, - "learning_rate": 1.4218118869013274e-05, - "loss": 0.0659, - "mean_token_accuracy": 0.9795533120632172, - "num_tokens": 18672214.0, - "step": 695 - }, - { - "epoch": 2.1809672386895476, - "grad_norm": 0.5115660900975986, - "learning_rate": 1.4160415464512408e-05, - "loss": 0.0679, - "mean_token_accuracy": 0.9789233446121216, - "num_tokens": 18805938.0, - "step": 700 - }, - { - "epoch": 2.1965678627145087, - "grad_norm": 0.5751725861583461, - "learning_rate": 1.4102712060011541e-05, - "loss": 0.0648, - "mean_token_accuracy": 0.9792892873287201, - "num_tokens": 18937866.0, - "step": 705 - }, - { - "epoch": 2.2121684867394698, - "grad_norm": 0.5046032343771815, - "learning_rate": 1.4045008655510677e-05, - "loss": 0.064, - "mean_token_accuracy": 0.9802761554718018, - "num_tokens": 19073783.0, - "step": 710 - }, - { - "epoch": 2.2277691107644304, - "grad_norm": 0.4771833967652389, - "learning_rate": 1.398730525100981e-05, - "loss": 0.062, - "mean_token_accuracy": 0.9805822253227234, - "num_tokens": 19207935.0, - "step": 715 - }, - { - "epoch": 2.2433697347893915, - "grad_norm": 0.6115193129572258, - "learning_rate": 1.3929601846508945e-05, - "loss": 0.0649, - "mean_token_accuracy": 0.9801377832889557, - "num_tokens": 19340335.0, - "step": 720 - }, - { - "epoch": 2.2589703588143526, - "grad_norm": 0.6617445283747385, - "learning_rate": 1.387189844200808e-05, - "loss": 0.0654, - "mean_token_accuracy": 0.9797018051147461, - "num_tokens": 19471321.0, - "step": 725 - }, - { - "epoch": 2.2745709828393137, - "grad_norm": 0.5272477660409615, - "learning_rate": 1.3814195037507214e-05, - "loss": 0.0667, - "mean_token_accuracy": 0.979661637544632, - "num_tokens": 19602047.0, - "step": 730 - }, - { - "epoch": 2.2901716068642743, - "grad_norm": 0.6200363163248618, - "learning_rate": 1.3756491633006348e-05, - "loss": 0.066, - "mean_token_accuracy": 0.9794325530529022, - "num_tokens": 19735871.0, - "step": 735 - }, - { - "epoch": 2.3057722308892354, - "grad_norm": 0.5017649119361518, - "learning_rate": 1.3698788228505481e-05, - "loss": 0.0657, - "mean_token_accuracy": 0.9798206746578216, - "num_tokens": 19869603.0, - "step": 740 - }, - { - "epoch": 2.3213728549141965, - "grad_norm": 0.5600351935619347, - "learning_rate": 1.3641084824004619e-05, - "loss": 0.0637, - "mean_token_accuracy": 0.9808857142925262, - "num_tokens": 20006195.0, - "step": 745 - }, - { - "epoch": 2.3369734789391576, - "grad_norm": 0.625047128391445, - "learning_rate": 1.3583381419503752e-05, - "loss": 0.0607, - "mean_token_accuracy": 0.9811928570270538, - "num_tokens": 20143037.0, - "step": 750 - }, - { - "epoch": 2.3525741029641187, - "grad_norm": 0.5412878260525056, - "learning_rate": 1.3525678015002886e-05, - "loss": 0.0625, - "mean_token_accuracy": 0.9809665322303772, - "num_tokens": 20277564.0, - "step": 755 - }, - { - "epoch": 2.3681747269890794, - "grad_norm": 0.5719463156340375, - "learning_rate": 1.3467974610502022e-05, - "loss": 0.0656, - "mean_token_accuracy": 0.9800610601902008, - "num_tokens": 20413306.0, - "step": 760 - }, - { - "epoch": 2.3837753510140405, - "grad_norm": 0.5934985891542635, - "learning_rate": 1.3410271206001155e-05, - "loss": 0.0643, - "mean_token_accuracy": 0.9803776860237121, - "num_tokens": 20550646.0, - "step": 765 - }, - { - "epoch": 2.3993759750390016, - "grad_norm": 0.623605985698589, - "learning_rate": 1.335256780150029e-05, - "loss": 0.0638, - "mean_token_accuracy": 0.980189961194992, - "num_tokens": 20683074.0, - "step": 770 - }, - { - "epoch": 2.4149765990639627, - "grad_norm": 0.5634033196138294, - "learning_rate": 1.3294864396999425e-05, - "loss": 0.0607, - "mean_token_accuracy": 0.9815069198608398, - "num_tokens": 20820858.0, - "step": 775 - }, - { - "epoch": 2.4305772230889238, - "grad_norm": 0.5067524068480144, - "learning_rate": 1.3237160992498559e-05, - "loss": 0.0635, - "mean_token_accuracy": 0.9803751349449158, - "num_tokens": 20953472.0, - "step": 780 - }, - { - "epoch": 2.4461778471138844, - "grad_norm": 0.5357765170954253, - "learning_rate": 1.3179457587997692e-05, - "loss": 0.0647, - "mean_token_accuracy": 0.9799603164196015, - "num_tokens": 21084458.0, - "step": 785 - }, - { - "epoch": 2.4617784711388455, - "grad_norm": 0.5784103573729823, - "learning_rate": 1.3121754183496826e-05, - "loss": 0.0612, - "mean_token_accuracy": 0.981369924545288, - "num_tokens": 21217206.0, - "step": 790 - }, - { - "epoch": 2.4773790951638066, - "grad_norm": 0.6539595621485318, - "learning_rate": 1.3064050778995963e-05, - "loss": 0.0619, - "mean_token_accuracy": 0.9811006963253022, - "num_tokens": 21351848.0, - "step": 795 - }, - { - "epoch": 2.4929797191887677, - "grad_norm": 0.47881209756399007, - "learning_rate": 1.3006347374495097e-05, - "loss": 0.0601, - "mean_token_accuracy": 0.9817553579807281, - "num_tokens": 21487331.0, - "step": 800 - }, - { - "epoch": 2.508580343213729, - "grad_norm": 0.5246365795702795, - "learning_rate": 1.2948643969994231e-05, - "loss": 0.0606, - "mean_token_accuracy": 0.9815096974372863, - "num_tokens": 21621528.0, - "step": 805 - }, - { - "epoch": 2.5241809672386895, - "grad_norm": 0.5184181249911959, - "learning_rate": 1.2890940565493366e-05, - "loss": 0.0597, - "mean_token_accuracy": 0.9815921604633331, - "num_tokens": 21759899.0, - "step": 810 - }, - { - "epoch": 2.5397815912636506, - "grad_norm": 0.4546161425335325, - "learning_rate": 1.28332371609925e-05, - "loss": 0.0586, - "mean_token_accuracy": 0.9819862425327301, - "num_tokens": 21895472.0, - "step": 815 - }, - { - "epoch": 2.5553822152886116, - "grad_norm": 0.5215846680122781, - "learning_rate": 1.2775533756491634e-05, - "loss": 0.0584, - "mean_token_accuracy": 0.9823523163795471, - "num_tokens": 22028234.0, - "step": 820 - }, - { - "epoch": 2.5709828393135723, - "grad_norm": 0.5953777316721176, - "learning_rate": 1.2717830351990768e-05, - "loss": 0.0577, - "mean_token_accuracy": 0.9820267677307128, - "num_tokens": 22162546.0, - "step": 825 - }, - { - "epoch": 2.5865834633385334, - "grad_norm": 0.5025426378809432, - "learning_rate": 1.2660126947489903e-05, - "loss": 0.0585, - "mean_token_accuracy": 0.9818842828273773, - "num_tokens": 22295641.0, - "step": 830 - }, - { - "epoch": 2.6021840873634945, - "grad_norm": 0.5039953498707387, - "learning_rate": 1.2602423542989037e-05, - "loss": 0.0598, - "mean_token_accuracy": 0.9823204934597015, - "num_tokens": 22428467.0, - "step": 835 - }, - { - "epoch": 2.6177847113884556, - "grad_norm": 0.5504893854668644, - "learning_rate": 1.2544720138488171e-05, - "loss": 0.0584, - "mean_token_accuracy": 0.9822833776473999, - "num_tokens": 22565528.0, - "step": 840 - }, - { - "epoch": 2.6333853354134167, - "grad_norm": 0.5661795657841351, - "learning_rate": 1.2487016733987306e-05, - "loss": 0.058, - "mean_token_accuracy": 0.9823375225067139, - "num_tokens": 22698845.0, - "step": 845 - }, - { - "epoch": 2.6489859594383773, - "grad_norm": 0.5613797362503292, - "learning_rate": 1.2429313329486442e-05, - "loss": 0.0584, - "mean_token_accuracy": 0.9821543335914612, - "num_tokens": 22833219.0, - "step": 850 - }, - { - "epoch": 2.6645865834633384, - "grad_norm": 0.5774656256827283, - "learning_rate": 1.2371609924985576e-05, - "loss": 0.0561, - "mean_token_accuracy": 0.9826458513736724, - "num_tokens": 22964040.0, - "step": 855 - }, - { - "epoch": 2.6801872074882995, - "grad_norm": 0.482343196208055, - "learning_rate": 1.231390652048471e-05, - "loss": 0.0543, - "mean_token_accuracy": 0.9835286378860474, - "num_tokens": 23101855.0, - "step": 860 - }, - { - "epoch": 2.6957878315132606, - "grad_norm": 0.5526473196570735, - "learning_rate": 1.2256203115983845e-05, - "loss": 0.058, - "mean_token_accuracy": 0.9824776828289032, - "num_tokens": 23238097.0, - "step": 865 - }, - { - "epoch": 2.7113884555382217, - "grad_norm": 0.5840397892589643, - "learning_rate": 1.2198499711482979e-05, - "loss": 0.0557, - "mean_token_accuracy": 0.9830466687679291, - "num_tokens": 23372611.0, - "step": 870 - }, - { - "epoch": 2.7269890795631824, - "grad_norm": 0.46214768163185344, - "learning_rate": 1.2140796306982112e-05, - "loss": 0.0548, - "mean_token_accuracy": 0.9835707128047944, - "num_tokens": 23508539.0, - "step": 875 - }, - { - "epoch": 2.7425897035881435, - "grad_norm": 0.5827338732441717, - "learning_rate": 1.2083092902481248e-05, - "loss": 0.0553, - "mean_token_accuracy": 0.9831783056259156, - "num_tokens": 23644764.0, - "step": 880 - }, - { - "epoch": 2.7581903276131046, - "grad_norm": 0.5208245768516603, - "learning_rate": 1.2025389497980382e-05, - "loss": 0.0532, - "mean_token_accuracy": 0.9844220519065857, - "num_tokens": 23782159.0, - "step": 885 - }, - { - "epoch": 2.7737909516380657, - "grad_norm": 0.4956082222868665, - "learning_rate": 1.1967686093479516e-05, - "loss": 0.0542, - "mean_token_accuracy": 0.9836461126804352, - "num_tokens": 23912494.0, - "step": 890 - }, - { - "epoch": 2.7893915756630268, - "grad_norm": 0.557876314767124, - "learning_rate": 1.190998268897865e-05, - "loss": 0.0557, - "mean_token_accuracy": 0.9829520642757416, - "num_tokens": 24043095.0, - "step": 895 - }, - { - "epoch": 2.8049921996879874, - "grad_norm": 0.4530892797848299, - "learning_rate": 1.1852279284477785e-05, - "loss": 0.0551, - "mean_token_accuracy": 0.9832406878471375, - "num_tokens": 24173768.0, - "step": 900 - }, - { - "epoch": 2.8205928237129485, - "grad_norm": 0.5120835399081769, - "learning_rate": 1.1794575879976919e-05, - "loss": 0.054, - "mean_token_accuracy": 0.9839985728263855, - "num_tokens": 24311082.0, - "step": 905 - }, - { - "epoch": 2.8361934477379096, - "grad_norm": 0.5699193393236468, - "learning_rate": 1.1736872475476052e-05, - "loss": 0.054, - "mean_token_accuracy": 0.9838815748691558, - "num_tokens": 24446211.0, - "step": 910 - }, - { - "epoch": 2.8517940717628703, - "grad_norm": 0.5124623073225754, - "learning_rate": 1.167916907097519e-05, - "loss": 0.0512, - "mean_token_accuracy": 0.9842207610607148, - "num_tokens": 24580329.0, - "step": 915 - }, - { - "epoch": 2.8673946957878313, - "grad_norm": 0.5621277019606149, - "learning_rate": 1.1621465666474323e-05, - "loss": 0.054, - "mean_token_accuracy": 0.9836201250553132, - "num_tokens": 24711881.0, - "step": 920 - }, - { - "epoch": 2.8829953198127924, - "grad_norm": 0.5477814912496453, - "learning_rate": 1.1563762261973457e-05, - "loss": 0.0495, - "mean_token_accuracy": 0.9848583757877349, - "num_tokens": 24847049.0, - "step": 925 - }, - { - "epoch": 2.8985959438377535, - "grad_norm": 0.47455570945887876, - "learning_rate": 1.1506058857472593e-05, - "loss": 0.0508, - "mean_token_accuracy": 0.9842885196208954, - "num_tokens": 24980229.0, - "step": 930 - }, - { - "epoch": 2.9141965678627146, - "grad_norm": 0.4687139530424816, - "learning_rate": 1.1448355452971726e-05, - "loss": 0.0524, - "mean_token_accuracy": 0.9845821619033813, - "num_tokens": 25114675.0, - "step": 935 - }, - { - "epoch": 2.9297971918876753, - "grad_norm": 0.5660748619715202, - "learning_rate": 1.139065204847086e-05, - "loss": 0.0482, - "mean_token_accuracy": 0.9855000972747803, - "num_tokens": 25252634.0, - "step": 940 - }, - { - "epoch": 2.9453978159126364, - "grad_norm": 0.4854573299925896, - "learning_rate": 1.1332948643969994e-05, - "loss": 0.052, - "mean_token_accuracy": 0.9846445024013519, - "num_tokens": 25388533.0, - "step": 945 - }, - { - "epoch": 2.9609984399375975, - "grad_norm": 0.5047346397846462, - "learning_rate": 1.127524523946913e-05, - "loss": 0.0524, - "mean_token_accuracy": 0.9843859314918518, - "num_tokens": 25523721.0, - "step": 950 - }, - { - "epoch": 2.9765990639625586, - "grad_norm": 0.5165845784231596, - "learning_rate": 1.1217541834968263e-05, - "loss": 0.0494, - "mean_token_accuracy": 0.9851727843284607, - "num_tokens": 25655853.0, - "step": 955 - }, - { - "epoch": 2.9921996879875197, - "grad_norm": 0.49095039440654903, - "learning_rate": 1.1159838430467397e-05, - "loss": 0.0518, - "mean_token_accuracy": 0.9850000500679016, - "num_tokens": 25792877.0, - "step": 960 - }, - { - "epoch": 3.0062402496099843, - "grad_norm": 0.3490677066256782, - "learning_rate": 1.1102135025966534e-05, - "loss": 0.0362, - "mean_token_accuracy": 0.9879513912730746, - "num_tokens": 25915060.0, - "step": 965 - }, - { - "epoch": 3.0218408736349454, - "grad_norm": 0.9913523718070499, - "learning_rate": 1.1044431621465668e-05, - "loss": 0.0362, - "mean_token_accuracy": 0.9894399225711823, - "num_tokens": 26050591.0, - "step": 970 - }, - { - "epoch": 3.0374414976599065, - "grad_norm": 0.5071611911954856, - "learning_rate": 1.0986728216964802e-05, - "loss": 0.0364, - "mean_token_accuracy": 0.9891938328742981, - "num_tokens": 26184429.0, - "step": 975 - }, - { - "epoch": 3.0530421216848675, - "grad_norm": 0.44176530408204834, - "learning_rate": 1.0929024812463936e-05, - "loss": 0.0358, - "mean_token_accuracy": 0.9892865777015686, - "num_tokens": 26317217.0, - "step": 980 - }, - { - "epoch": 3.068642745709828, - "grad_norm": 0.4474135118791752, - "learning_rate": 1.0871321407963071e-05, - "loss": 0.0359, - "mean_token_accuracy": 0.9892495989799499, - "num_tokens": 26448145.0, - "step": 985 - }, - { - "epoch": 3.0842433697347893, - "grad_norm": 0.4469912153610548, - "learning_rate": 1.0813618003462205e-05, - "loss": 0.0355, - "mean_token_accuracy": 0.9895004034042358, - "num_tokens": 26583779.0, - "step": 990 - }, - { - "epoch": 3.0998439937597504, - "grad_norm": 0.3796984229372117, - "learning_rate": 1.0755914598961339e-05, - "loss": 0.0354, - "mean_token_accuracy": 0.9895397126674652, - "num_tokens": 26717224.0, - "step": 995 - }, - { - "epoch": 3.1154446177847115, - "grad_norm": 0.48554097245214073, - "learning_rate": 1.0698211194460474e-05, - "loss": 0.0352, - "mean_token_accuracy": 0.9896060526371002, - "num_tokens": 26851803.0, - "step": 1000 - }, - { - "epoch": 3.1310452418096726, - "grad_norm": 0.4204735666395639, - "learning_rate": 1.0640507789959608e-05, - "loss": 0.0349, - "mean_token_accuracy": 0.9891953945159913, - "num_tokens": 26984272.0, - "step": 1005 - }, - { - "epoch": 3.1466458658346332, - "grad_norm": 0.43388479747872655, - "learning_rate": 1.0582804385458742e-05, - "loss": 0.0349, - "mean_token_accuracy": 0.9893825590610504, - "num_tokens": 27118280.0, - "step": 1010 - }, - { - "epoch": 3.1622464898595943, - "grad_norm": 0.5126608366935945, - "learning_rate": 1.0525100980957876e-05, - "loss": 0.0363, - "mean_token_accuracy": 0.9891520202159881, - "num_tokens": 27249856.0, - "step": 1015 - }, - { - "epoch": 3.1778471138845554, - "grad_norm": 0.4022500904809516, - "learning_rate": 1.0467397576457013e-05, - "loss": 0.0343, - "mean_token_accuracy": 0.9896589994430542, - "num_tokens": 27383606.0, - "step": 1020 - }, - { - "epoch": 3.1934477379095165, - "grad_norm": 0.36763857285465273, - "learning_rate": 1.0409694171956147e-05, - "loss": 0.0356, - "mean_token_accuracy": 0.9893875420093536, - "num_tokens": 27514324.0, - "step": 1025 - }, - { - "epoch": 3.209048361934477, - "grad_norm": 0.3761184120835254, - "learning_rate": 1.035199076745528e-05, - "loss": 0.0329, - "mean_token_accuracy": 0.9900339007377624, - "num_tokens": 27651167.0, - "step": 1030 - }, - { - "epoch": 3.2246489859594383, - "grad_norm": 0.4170183760865521, - "learning_rate": 1.0294287362954416e-05, - "loss": 0.0339, - "mean_token_accuracy": 0.9897565901279449, - "num_tokens": 27787041.0, - "step": 1035 - }, - { - "epoch": 3.2402496099843994, - "grad_norm": 0.3432109707003376, - "learning_rate": 1.023658395845355e-05, - "loss": 0.0332, - "mean_token_accuracy": 0.9902929067611694, - "num_tokens": 27922540.0, - "step": 1040 - }, - { - "epoch": 3.2558502340093605, - "grad_norm": 0.34743178714773904, - "learning_rate": 1.0178880553952684e-05, - "loss": 0.0338, - "mean_token_accuracy": 0.9896102547645569, - "num_tokens": 28054161.0, - "step": 1045 - }, - { - "epoch": 3.2714508580343216, - "grad_norm": 0.46566002269683127, - "learning_rate": 1.0121177149451817e-05, - "loss": 0.0349, - "mean_token_accuracy": 0.9895286440849305, - "num_tokens": 28188801.0, - "step": 1050 - }, - { - "epoch": 3.287051482059282, - "grad_norm": 0.3383493517355259, - "learning_rate": 1.0063473744950953e-05, - "loss": 0.0331, - "mean_token_accuracy": 0.9901463866233826, - "num_tokens": 28323390.0, - "step": 1055 - }, - { - "epoch": 3.3026521060842433, - "grad_norm": 0.3890941341577385, - "learning_rate": 1.0005770340450087e-05, - "loss": 0.0338, - "mean_token_accuracy": 0.9900007307529449, - "num_tokens": 28461086.0, - "step": 1060 - }, - { - "epoch": 3.3182527301092044, - "grad_norm": 0.8722465278924197, - "learning_rate": 9.948066935949222e-06, - "loss": 0.0344, - "mean_token_accuracy": 0.9897770345211029, - "num_tokens": 28595624.0, - "step": 1065 - }, - { - "epoch": 3.3338533541341655, - "grad_norm": 0.3068180815070705, - "learning_rate": 9.890363531448356e-06, - "loss": 0.0341, - "mean_token_accuracy": 0.9896463751792908, - "num_tokens": 28729146.0, - "step": 1070 - }, - { - "epoch": 3.3494539781591266, - "grad_norm": 0.405693055582265, - "learning_rate": 9.832660126947491e-06, - "loss": 0.0345, - "mean_token_accuracy": 0.9897280871868134, - "num_tokens": 28863209.0, - "step": 1075 - }, - { - "epoch": 3.3650546021840873, - "grad_norm": 0.3651986120751572, - "learning_rate": 9.774956722446625e-06, - "loss": 0.0337, - "mean_token_accuracy": 0.9899032473564148, - "num_tokens": 28997560.0, - "step": 1080 - }, - { - "epoch": 3.3806552262090483, - "grad_norm": 0.4655638046551388, - "learning_rate": 9.717253317945759e-06, - "loss": 0.0336, - "mean_token_accuracy": 0.9898359596729278, - "num_tokens": 29132738.0, - "step": 1085 - }, - { - "epoch": 3.3962558502340094, - "grad_norm": 0.390597341957638, - "learning_rate": 9.659549913444894e-06, - "loss": 0.0342, - "mean_token_accuracy": 0.9898122906684875, - "num_tokens": 29266954.0, - "step": 1090 - }, - { - "epoch": 3.4118564742589705, - "grad_norm": 0.48861692751948993, - "learning_rate": 9.601846508944028e-06, - "loss": 0.0338, - "mean_token_accuracy": 0.9897624969482421, - "num_tokens": 29398154.0, - "step": 1095 - }, - { - "epoch": 3.427457098283931, - "grad_norm": 0.3811068399441777, - "learning_rate": 9.544143104443164e-06, - "loss": 0.0327, - "mean_token_accuracy": 0.9900210738182068, - "num_tokens": 29531832.0, - "step": 1100 - }, - { - "epoch": 3.4430577223088923, - "grad_norm": 0.36698461790623293, - "learning_rate": 9.486439699942298e-06, - "loss": 0.0335, - "mean_token_accuracy": 0.9899003326892852, - "num_tokens": 29666458.0, - "step": 1105 - }, - { - "epoch": 3.4586583463338534, - "grad_norm": 0.3725030509215407, - "learning_rate": 9.428736295441431e-06, - "loss": 0.0337, - "mean_token_accuracy": 0.9900450468063354, - "num_tokens": 29802111.0, - "step": 1110 - }, - { - "epoch": 3.4742589703588145, - "grad_norm": 0.4175587860110882, - "learning_rate": 9.371032890940567e-06, - "loss": 0.0333, - "mean_token_accuracy": 0.9899795711040497, - "num_tokens": 29938336.0, - "step": 1115 - }, - { - "epoch": 3.489859594383775, - "grad_norm": 0.9192949340801146, - "learning_rate": 9.3133294864397e-06, - "loss": 0.0332, - "mean_token_accuracy": 0.9900984108448029, - "num_tokens": 30070284.0, - "step": 1120 - }, - { - "epoch": 3.5054602184087362, - "grad_norm": 0.4382238438396383, - "learning_rate": 9.255626081938836e-06, - "loss": 0.0334, - "mean_token_accuracy": 0.9900668442249299, - "num_tokens": 30201902.0, - "step": 1125 - }, - { - "epoch": 3.5210608424336973, - "grad_norm": 0.4415595784740572, - "learning_rate": 9.19792267743797e-06, - "loss": 0.0345, - "mean_token_accuracy": 0.9897962033748626, - "num_tokens": 30335421.0, - "step": 1130 - }, - { - "epoch": 3.5366614664586584, - "grad_norm": 0.4103246672411594, - "learning_rate": 9.140219272937104e-06, - "loss": 0.0323, - "mean_token_accuracy": 0.9902926802635192, - "num_tokens": 30469531.0, - "step": 1135 - }, - { - "epoch": 3.5522620904836195, - "grad_norm": 0.3613436635984974, - "learning_rate": 9.082515868436237e-06, - "loss": 0.0312, - "mean_token_accuracy": 0.9908117353916168, - "num_tokens": 30608253.0, - "step": 1140 - }, - { - "epoch": 3.56786271450858, - "grad_norm": 0.6129262337079979, - "learning_rate": 9.024812463935373e-06, - "loss": 0.0316, - "mean_token_accuracy": 0.990767502784729, - "num_tokens": 30744344.0, - "step": 1145 - }, - { - "epoch": 3.5834633385335413, - "grad_norm": 0.3718642406716124, - "learning_rate": 8.967109059434508e-06, - "loss": 0.0332, - "mean_token_accuracy": 0.9901099085807801, - "num_tokens": 30882716.0, - "step": 1150 - }, - { - "epoch": 3.5990639625585024, - "grad_norm": 0.3888395053037814, - "learning_rate": 8.909405654933642e-06, - "loss": 0.0328, - "mean_token_accuracy": 0.9903294146060944, - "num_tokens": 31019880.0, - "step": 1155 - }, - { - "epoch": 3.6146645865834635, - "grad_norm": 0.3270250852564694, - "learning_rate": 8.851702250432776e-06, - "loss": 0.0321, - "mean_token_accuracy": 0.9901081144809722, - "num_tokens": 31150261.0, - "step": 1160 - }, - { - "epoch": 3.6302652106084246, - "grad_norm": 0.3651680490380126, - "learning_rate": 8.79399884593191e-06, - "loss": 0.033, - "mean_token_accuracy": 0.9901183784008026, - "num_tokens": 31285179.0, - "step": 1165 - }, - { - "epoch": 3.645865834633385, - "grad_norm": 0.3835485037750142, - "learning_rate": 8.736295441431045e-06, - "loss": 0.034, - "mean_token_accuracy": 0.9899890661239624, - "num_tokens": 31420811.0, - "step": 1170 - }, - { - "epoch": 3.6614664586583463, - "grad_norm": 0.3907588292879908, - "learning_rate": 8.678592036930179e-06, - "loss": 0.0322, - "mean_token_accuracy": 0.9904151439666748, - "num_tokens": 31557161.0, - "step": 1175 - }, - { - "epoch": 3.6770670826833074, - "grad_norm": 0.3507936183329917, - "learning_rate": 8.620888632429315e-06, - "loss": 0.0335, - "mean_token_accuracy": 0.9902589380741119, - "num_tokens": 31687531.0, - "step": 1180 - }, - { - "epoch": 3.6926677067082685, - "grad_norm": 0.36813793522661714, - "learning_rate": 8.563185227928448e-06, - "loss": 0.0312, - "mean_token_accuracy": 0.9907920718193054, - "num_tokens": 31823866.0, - "step": 1185 - }, - { - "epoch": 3.7082683307332296, - "grad_norm": 0.3774995579093435, - "learning_rate": 8.505481823427582e-06, - "loss": 0.0311, - "mean_token_accuracy": 0.990852290391922, - "num_tokens": 31959926.0, - "step": 1190 - }, - { - "epoch": 3.7238689547581902, - "grad_norm": 0.37260395664452817, - "learning_rate": 8.447778418926718e-06, - "loss": 0.0322, - "mean_token_accuracy": 0.9901103973388672, - "num_tokens": 32095024.0, - "step": 1195 - }, - { - "epoch": 3.7394695787831513, - "grad_norm": 0.4010288846696042, - "learning_rate": 8.390075014425851e-06, - "loss": 0.0315, - "mean_token_accuracy": 0.9906571626663208, - "num_tokens": 32231044.0, - "step": 1200 - }, - { - "epoch": 3.7550702028081124, - "grad_norm": 1.096838169906387, - "learning_rate": 8.332371609924987e-06, - "loss": 0.0307, - "mean_token_accuracy": 0.9910696685314179, - "num_tokens": 32368150.0, - "step": 1205 - }, - { - "epoch": 3.770670826833073, - "grad_norm": 0.37562066364740404, - "learning_rate": 8.27466820542412e-06, - "loss": 0.0313, - "mean_token_accuracy": 0.9904199957847595, - "num_tokens": 32500610.0, - "step": 1210 - }, - { - "epoch": 3.786271450858034, - "grad_norm": 0.3830733284662822, - "learning_rate": 8.216964800923255e-06, - "loss": 0.0321, - "mean_token_accuracy": 0.9904833734035492, - "num_tokens": 32635303.0, - "step": 1215 - }, - { - "epoch": 3.8018720748829953, - "grad_norm": 0.3091988680492327, - "learning_rate": 8.15926139642239e-06, - "loss": 0.0312, - "mean_token_accuracy": 0.9907013535499573, - "num_tokens": 32771586.0, - "step": 1220 - }, - { - "epoch": 3.8174726989079564, - "grad_norm": 0.3312248899119601, - "learning_rate": 8.101557991921524e-06, - "loss": 0.0297, - "mean_token_accuracy": 0.9907315850257874, - "num_tokens": 32908003.0, - "step": 1225 - }, - { - "epoch": 3.8330733229329175, - "grad_norm": 0.348853963998793, - "learning_rate": 8.04385458742066e-06, - "loss": 0.0309, - "mean_token_accuracy": 0.9906924843788147, - "num_tokens": 33042994.0, - "step": 1230 - }, - { - "epoch": 3.848673946957878, - "grad_norm": 0.4198481215223838, - "learning_rate": 7.986151182919793e-06, - "loss": 0.0322, - "mean_token_accuracy": 0.9902994751930236, - "num_tokens": 33175265.0, - "step": 1235 - }, - { - "epoch": 3.864274570982839, - "grad_norm": 0.39686447444012835, - "learning_rate": 7.928447778418927e-06, - "loss": 0.0315, - "mean_token_accuracy": 0.9903347849845886, - "num_tokens": 33308175.0, - "step": 1240 - }, - { - "epoch": 3.8798751950078003, - "grad_norm": 0.3428618913867458, - "learning_rate": 7.870744373918062e-06, - "loss": 0.0305, - "mean_token_accuracy": 0.9907864332199097, - "num_tokens": 33444560.0, - "step": 1245 - }, - { - "epoch": 3.8954758190327614, - "grad_norm": 0.41036568455175115, - "learning_rate": 7.813040969417196e-06, - "loss": 0.0312, - "mean_token_accuracy": 0.9906459391117096, - "num_tokens": 33577614.0, - "step": 1250 - }, - { - "epoch": 3.9110764430577225, - "grad_norm": 0.37505675085074575, - "learning_rate": 7.755337564916332e-06, - "loss": 0.0307, - "mean_token_accuracy": 0.9909157037734986, - "num_tokens": 33711210.0, - "step": 1255 - }, - { - "epoch": 3.926677067082683, - "grad_norm": 0.3021326967543721, - "learning_rate": 7.697634160415465e-06, - "loss": 0.0298, - "mean_token_accuracy": 0.9911834299564362, - "num_tokens": 33843044.0, - "step": 1260 - }, - { - "epoch": 3.9422776911076443, - "grad_norm": 0.2580343755150667, - "learning_rate": 7.6399307559146e-06, - "loss": 0.0301, - "mean_token_accuracy": 0.991005277633667, - "num_tokens": 33976958.0, - "step": 1265 - }, - { - "epoch": 3.9578783151326054, - "grad_norm": 0.32406819404307124, - "learning_rate": 7.582227351413735e-06, - "loss": 0.03, - "mean_token_accuracy": 0.9909539341926574, - "num_tokens": 34112537.0, - "step": 1270 - }, - { - "epoch": 3.9734789391575664, - "grad_norm": 0.4072364555030912, - "learning_rate": 7.5245239469128685e-06, - "loss": 0.0299, - "mean_token_accuracy": 0.9912158846855164, - "num_tokens": 34247507.0, - "step": 1275 - }, - { - "epoch": 3.9890795631825275, - "grad_norm": 0.34177409995444047, - "learning_rate": 7.466820542412003e-06, - "loss": 0.0294, - "mean_token_accuracy": 0.9913082957267761, - "num_tokens": 34383614.0, - "step": 1280 - }, - { - "epoch": 4.003120124804992, - "grad_norm": 0.2087928113819105, - "learning_rate": 7.409117137911137e-06, - "loss": 0.0258, - "mean_token_accuracy": 0.9916192359394498, - "num_tokens": 34505394.0, - "step": 1285 - }, - { - "epoch": 4.018720748829953, - "grad_norm": 0.22015089497171852, - "learning_rate": 7.351413733410272e-06, - "loss": 0.0261, - "mean_token_accuracy": 0.9923950254917144, - "num_tokens": 34639693.0, - "step": 1290 - }, - { - "epoch": 4.034321372854914, - "grad_norm": 0.2460585690565452, - "learning_rate": 7.293710328909405e-06, - "loss": 0.0267, - "mean_token_accuracy": 0.9919693648815155, - "num_tokens": 34770892.0, - "step": 1295 - }, - { - "epoch": 4.049921996879875, - "grad_norm": 0.2822290499571362, - "learning_rate": 7.236006924408541e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9924468576908112, - "num_tokens": 34908168.0, - "step": 1300 - }, - { - "epoch": 4.0655226209048365, - "grad_norm": 0.25669589512793606, - "learning_rate": 7.1783035199076755e-06, - "loss": 0.0255, - "mean_token_accuracy": 0.9925323188304901, - "num_tokens": 35044647.0, - "step": 1305 - }, - { - "epoch": 4.081123244929797, - "grad_norm": 0.30119454787041633, - "learning_rate": 7.120600115406809e-06, - "loss": 0.0264, - "mean_token_accuracy": 0.9920835256576538, - "num_tokens": 35177022.0, - "step": 1310 - }, - { - "epoch": 4.096723868954758, - "grad_norm": 0.2836926501079029, - "learning_rate": 7.062896710905944e-06, - "loss": 0.0261, - "mean_token_accuracy": 0.9923124372959137, - "num_tokens": 35312214.0, - "step": 1315 - }, - { - "epoch": 4.112324492979719, - "grad_norm": 0.2796025997070368, - "learning_rate": 7.005193306405078e-06, - "loss": 0.0261, - "mean_token_accuracy": 0.9922410726547242, - "num_tokens": 35445108.0, - "step": 1320 - }, - { - "epoch": 4.12792511700468, - "grad_norm": 0.23863636949043232, - "learning_rate": 6.947489901904213e-06, - "loss": 0.0259, - "mean_token_accuracy": 0.9924528360366821, - "num_tokens": 35578641.0, - "step": 1325 - }, - { - "epoch": 4.1435257410296416, - "grad_norm": 0.2319631078574882, - "learning_rate": 6.889786497403347e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9922460615634918, - "num_tokens": 35712950.0, - "step": 1330 - }, - { - "epoch": 4.159126365054602, - "grad_norm": 0.22755436684672453, - "learning_rate": 6.832083092902482e-06, - "loss": 0.0261, - "mean_token_accuracy": 0.992229801416397, - "num_tokens": 35844696.0, - "step": 1335 - }, - { - "epoch": 4.174726989079563, - "grad_norm": 0.27015899956441786, - "learning_rate": 6.774379688401616e-06, - "loss": 0.0256, - "mean_token_accuracy": 0.992426085472107, - "num_tokens": 35981751.0, - "step": 1340 - }, - { - "epoch": 4.190327613104524, - "grad_norm": 0.1982276144930778, - "learning_rate": 6.71667628390075e-06, - "loss": 0.0258, - "mean_token_accuracy": 0.9924410998821258, - "num_tokens": 36115684.0, - "step": 1345 - }, - { - "epoch": 4.205928237129485, - "grad_norm": 0.25665434029238543, - "learning_rate": 6.658972879399886e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.992529320716858, - "num_tokens": 36251146.0, - "step": 1350 - }, - { - "epoch": 4.221528861154447, - "grad_norm": 0.20742945875597904, - "learning_rate": 6.601269474899019e-06, - "loss": 0.0255, - "mean_token_accuracy": 0.9925081074237824, - "num_tokens": 36387342.0, - "step": 1355 - }, - { - "epoch": 4.237129485179407, - "grad_norm": 0.20830237441174027, - "learning_rate": 6.543566070398154e-06, - "loss": 0.0256, - "mean_token_accuracy": 0.99239182472229, - "num_tokens": 36522199.0, - "step": 1360 - }, - { - "epoch": 4.252730109204368, - "grad_norm": 0.18329148966652772, - "learning_rate": 6.485862665897289e-06, - "loss": 0.0252, - "mean_token_accuracy": 0.9924617230892181, - "num_tokens": 36659905.0, - "step": 1365 - }, - { - "epoch": 4.268330733229329, - "grad_norm": 0.2210417236492713, - "learning_rate": 6.4281592613964225e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9924541592597962, - "num_tokens": 36794596.0, - "step": 1370 - }, - { - "epoch": 4.28393135725429, - "grad_norm": 0.2450525112386853, - "learning_rate": 6.370455856895558e-06, - "loss": 0.026, - "mean_token_accuracy": 0.9922269523143769, - "num_tokens": 36927331.0, - "step": 1375 - }, - { - "epoch": 4.299531981279252, - "grad_norm": 0.2389982156469671, - "learning_rate": 6.312752452394692e-06, - "loss": 0.0255, - "mean_token_accuracy": 0.9925234019756317, - "num_tokens": 37064209.0, - "step": 1380 - }, - { - "epoch": 4.315132605304212, - "grad_norm": 0.22305148751441065, - "learning_rate": 6.255049047893826e-06, - "loss": 0.0253, - "mean_token_accuracy": 0.9923750519752502, - "num_tokens": 37198863.0, - "step": 1385 - }, - { - "epoch": 4.330733229329173, - "grad_norm": 0.23036166204016767, - "learning_rate": 6.19734564339296e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9924868226051331, - "num_tokens": 37333313.0, - "step": 1390 - }, - { - "epoch": 4.3463338533541345, - "grad_norm": 0.21120793280034894, - "learning_rate": 6.139642238892095e-06, - "loss": 0.0256, - "mean_token_accuracy": 0.9923638105392456, - "num_tokens": 37469908.0, - "step": 1395 - }, - { - "epoch": 4.361934477379095, - "grad_norm": 0.2543427837048502, - "learning_rate": 6.08193883439123e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9921638011932373, - "num_tokens": 37605045.0, - "step": 1400 - }, - { - "epoch": 4.377535101404056, - "grad_norm": 0.2521878430596593, - "learning_rate": 6.024235429890364e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9922496914863587, - "num_tokens": 37738642.0, - "step": 1405 - }, - { - "epoch": 4.393135725429017, - "grad_norm": 0.29703714329235237, - "learning_rate": 5.966532025389499e-06, - "loss": 0.0256, - "mean_token_accuracy": 0.9924009621143342, - "num_tokens": 37873937.0, - "step": 1410 - }, - { - "epoch": 4.408736349453978, - "grad_norm": 0.25917181515112186, - "learning_rate": 5.9088286208886326e-06, - "loss": 0.0251, - "mean_token_accuracy": 0.9925038278102875, - "num_tokens": 38010192.0, - "step": 1415 - }, - { - "epoch": 4.4243369734789395, - "grad_norm": 0.2729195248404703, - "learning_rate": 5.851125216387767e-06, - "loss": 0.0254, - "mean_token_accuracy": 0.9924300014972687, - "num_tokens": 38144250.0, - "step": 1420 - }, - { - "epoch": 4.4399375975039, - "grad_norm": 0.19314426189940467, - "learning_rate": 5.793421811886903e-06, - "loss": 0.0254, - "mean_token_accuracy": 0.9922723174095154, - "num_tokens": 38276938.0, - "step": 1425 - }, - { - "epoch": 4.455538221528861, - "grad_norm": 0.26163600842534557, - "learning_rate": 5.7357184073860365e-06, - "loss": 0.0254, - "mean_token_accuracy": 0.9924465179443359, - "num_tokens": 38413184.0, - "step": 1430 - }, - { - "epoch": 4.471138845553822, - "grad_norm": 0.2594397682913733, - "learning_rate": 5.678015002885171e-06, - "loss": 0.0257, - "mean_token_accuracy": 0.9922716200351716, - "num_tokens": 38545079.0, - "step": 1435 - }, - { - "epoch": 4.486739469578783, - "grad_norm": 0.21072322905834717, - "learning_rate": 5.620311598384305e-06, - "loss": 0.0258, - "mean_token_accuracy": 0.9923346698284149, - "num_tokens": 38677670.0, - "step": 1440 - }, - { - "epoch": 4.502340093603744, - "grad_norm": 0.2456711813003879, - "learning_rate": 5.5626081938834396e-06, - "loss": 0.0258, - "mean_token_accuracy": 0.9918740570545197, - "num_tokens": 38810368.0, - "step": 1445 - }, - { - "epoch": 4.517940717628705, - "grad_norm": 0.2724680423947134, - "learning_rate": 5.504904789382573e-06, - "loss": 0.0252, - "mean_token_accuracy": 0.9924174904823303, - "num_tokens": 38944806.0, - "step": 1450 - }, - { - "epoch": 4.533541341653666, - "grad_norm": 0.21008096218444924, - "learning_rate": 5.447201384881709e-06, - "loss": 0.0244, - "mean_token_accuracy": 0.992695277929306, - "num_tokens": 39082214.0, - "step": 1455 - }, - { - "epoch": 4.549141965678627, - "grad_norm": 0.22916295891537064, - "learning_rate": 5.3894979803808435e-06, - "loss": 0.0247, - "mean_token_accuracy": 0.9923828959465026, - "num_tokens": 39219193.0, - "step": 1460 - }, - { - "epoch": 4.564742589703588, - "grad_norm": 0.2331878172390347, - "learning_rate": 5.331794575879977e-06, - "loss": 0.025, - "mean_token_accuracy": 0.9924946069717407, - "num_tokens": 39356196.0, - "step": 1465 - }, - { - "epoch": 4.580343213728549, - "grad_norm": 0.251639715266913, - "learning_rate": 5.274091171379112e-06, - "loss": 0.0259, - "mean_token_accuracy": 0.9923617303371429, - "num_tokens": 39488052.0, - "step": 1470 - }, - { - "epoch": 4.59594383775351, - "grad_norm": 0.23083713655395227, - "learning_rate": 5.216387766878246e-06, - "loss": 0.0253, - "mean_token_accuracy": 0.9925037145614624, - "num_tokens": 39622300.0, - "step": 1475 - }, - { - "epoch": 4.611544461778471, - "grad_norm": 0.19731666531160486, - "learning_rate": 5.158684362377381e-06, - "loss": 0.0247, - "mean_token_accuracy": 0.992599630355835, - "num_tokens": 39761796.0, - "step": 1480 - }, - { - "epoch": 4.627145085803432, - "grad_norm": 0.2673103549266464, - "learning_rate": 5.100980957876515e-06, - "loss": 0.0256, - "mean_token_accuracy": 0.9922535717487335, - "num_tokens": 39895523.0, - "step": 1485 - }, - { - "epoch": 4.642745709828393, - "grad_norm": 0.19942831397711988, - "learning_rate": 5.04327755337565e-06, - "loss": 0.0246, - "mean_token_accuracy": 0.9923189103603363, - "num_tokens": 40032480.0, - "step": 1490 - }, - { - "epoch": 4.658346333853354, - "grad_norm": 0.2390543299543101, - "learning_rate": 4.985574148874784e-06, - "loss": 0.0259, - "mean_token_accuracy": 0.992202627658844, - "num_tokens": 40164238.0, - "step": 1495 - }, - { - "epoch": 4.673946957878315, - "grad_norm": 0.2043619266630321, - "learning_rate": 4.927870744373918e-06, - "loss": 0.0251, - "mean_token_accuracy": 0.9922064006328583, - "num_tokens": 40298050.0, - "step": 1500 - }, - { - "epoch": 4.689547581903276, - "grad_norm": 0.19900313130075178, - "learning_rate": 4.870167339873053e-06, - "loss": 0.0251, - "mean_token_accuracy": 0.9926720380783081, - "num_tokens": 40432899.0, - "step": 1505 - }, - { - "epoch": 4.7051482059282375, - "grad_norm": 0.23864486350990322, - "learning_rate": 4.812463935372187e-06, - "loss": 0.025, - "mean_token_accuracy": 0.9923921763896942, - "num_tokens": 40567752.0, - "step": 1510 - }, - { - "epoch": 4.720748829953198, - "grad_norm": 0.25871353222186716, - "learning_rate": 4.754760530871322e-06, - "loss": 0.0251, - "mean_token_accuracy": 0.9923587918281556, - "num_tokens": 40701848.0, - "step": 1515 - }, - { - "epoch": 4.736349453978159, - "grad_norm": 0.25605007304500094, - "learning_rate": 4.697057126370456e-06, - "loss": 0.0254, - "mean_token_accuracy": 0.9922918856143952, - "num_tokens": 40836084.0, - "step": 1520 - }, - { - "epoch": 4.75195007800312, - "grad_norm": 0.20973968723381536, - "learning_rate": 4.6393537218695904e-06, - "loss": 0.0259, - "mean_token_accuracy": 0.9920334756374359, - "num_tokens": 40967976.0, - "step": 1525 - }, - { - "epoch": 4.767550702028081, - "grad_norm": 0.2721131562347456, - "learning_rate": 4.581650317368725e-06, - "loss": 0.0249, - "mean_token_accuracy": 0.9922464430332184, - "num_tokens": 41101418.0, - "step": 1530 - }, - { - "epoch": 4.7831513260530425, - "grad_norm": 0.19496771982733335, - "learning_rate": 4.52394691286786e-06, - "loss": 0.0251, - "mean_token_accuracy": 0.9924694299697876, - "num_tokens": 41234419.0, - "step": 1535 - }, - { - "epoch": 4.798751950078003, - "grad_norm": 0.1856919453879232, - "learning_rate": 4.466243508366994e-06, - "loss": 0.0245, - "mean_token_accuracy": 0.992746913433075, - "num_tokens": 41371203.0, - "step": 1540 - }, - { - "epoch": 4.814352574102964, - "grad_norm": 0.27502501789045736, - "learning_rate": 4.408540103866128e-06, - "loss": 0.0254, - "mean_token_accuracy": 0.9923159599304199, - "num_tokens": 41505318.0, - "step": 1545 - }, - { - "epoch": 4.829953198127925, - "grad_norm": 0.24734756683413492, - "learning_rate": 4.350836699365263e-06, - "loss": 0.025, - "mean_token_accuracy": 0.9924296736717224, - "num_tokens": 41640529.0, - "step": 1550 - }, - { - "epoch": 4.845553822152886, - "grad_norm": 0.2747226907102041, - "learning_rate": 4.2931332948643974e-06, - "loss": 0.0258, - "mean_token_accuracy": 0.9920921742916107, - "num_tokens": 41772590.0, - "step": 1555 - }, - { - "epoch": 4.8611544461778475, - "grad_norm": 0.1987124720913511, - "learning_rate": 4.235429890363531e-06, - "loss": 0.0246, - "mean_token_accuracy": 0.9926553666591644, - "num_tokens": 41907842.0, - "step": 1560 - }, - { - "epoch": 4.876755070202808, - "grad_norm": 0.2007681676713943, - "learning_rate": 4.177726485862667e-06, - "loss": 0.025, - "mean_token_accuracy": 0.9924448490142822, - "num_tokens": 42041393.0, - "step": 1565 - }, - { - "epoch": 4.892355694227769, - "grad_norm": 0.22496355495648218, - "learning_rate": 4.1200230813618005e-06, - "loss": 0.0249, - "mean_token_accuracy": 0.9926417946815491, - "num_tokens": 42175006.0, - "step": 1570 - }, - { - "epoch": 4.90795631825273, - "grad_norm": 0.20488185006184043, - "learning_rate": 4.062319676860935e-06, - "loss": 0.0247, - "mean_token_accuracy": 0.9925686061382294, - "num_tokens": 42310997.0, - "step": 1575 - }, - { - "epoch": 4.923556942277691, - "grad_norm": 0.20176175269557198, - "learning_rate": 4.00461627236007e-06, - "loss": 0.0246, - "mean_token_accuracy": 0.9925564587116241, - "num_tokens": 42445041.0, - "step": 1580 - }, - { - "epoch": 4.939157566302653, - "grad_norm": 0.27843063254870387, - "learning_rate": 3.946912867859204e-06, - "loss": 0.0255, - "mean_token_accuracy": 0.9923796772956848, - "num_tokens": 42577620.0, - "step": 1585 - }, - { - "epoch": 4.954758190327613, - "grad_norm": 0.1849806436583689, - "learning_rate": 3.889209463358338e-06, - "loss": 0.0252, - "mean_token_accuracy": 0.9921863794326782, - "num_tokens": 42711042.0, - "step": 1590 - }, - { - "epoch": 4.970358814352574, - "grad_norm": 0.2260995129591358, - "learning_rate": 3.831506058857473e-06, - "loss": 0.025, - "mean_token_accuracy": 0.9923262357711792, - "num_tokens": 42844345.0, - "step": 1595 - }, - { - "epoch": 4.985959438377535, - "grad_norm": 0.23583146631665872, - "learning_rate": 3.7738026543566075e-06, - "loss": 0.0252, - "mean_token_accuracy": 0.9923614859580994, - "num_tokens": 42975604.0, - "step": 1600 - }, - { - "epoch": 5.0, - "grad_norm": 0.21933550919111908, - "learning_rate": 3.7160992498557417e-06, - "loss": 0.0221, - "mean_token_accuracy": 0.9925718638632033, - "num_tokens": 43097746.0, - "step": 1605 - }, - { - "epoch": 5.015600624024961, - "grad_norm": 0.1525090326151516, - "learning_rate": 3.6583958453548764e-06, - "loss": 0.0238, - "mean_token_accuracy": 0.9927153348922729, - "num_tokens": 43232905.0, - "step": 1610 - }, - { - "epoch": 5.031201248049922, - "grad_norm": 0.20467619947473584, - "learning_rate": 3.6006924408540106e-06, - "loss": 0.0244, - "mean_token_accuracy": 0.9926062643527984, - "num_tokens": 43366038.0, - "step": 1615 - }, - { - "epoch": 5.046801872074883, - "grad_norm": 0.1360921218261351, - "learning_rate": 3.542989036353145e-06, - "loss": 0.0242, - "mean_token_accuracy": 0.9924696981906891, - "num_tokens": 43499140.0, - "step": 1620 - }, - { - "epoch": 5.062402496099844, - "grad_norm": 0.15296937067760497, - "learning_rate": 3.4852856318522794e-06, - "loss": 0.0242, - "mean_token_accuracy": 0.9926361858844757, - "num_tokens": 43631085.0, - "step": 1625 - }, - { - "epoch": 5.078003120124805, - "grad_norm": 0.1310938697562029, - "learning_rate": 3.427582227351414e-06, - "loss": 0.0238, - "mean_token_accuracy": 0.992695951461792, - "num_tokens": 43766631.0, - "step": 1630 - }, - { - "epoch": 5.093603744149766, - "grad_norm": 0.12923692759517427, - "learning_rate": 3.3698788228505487e-06, - "loss": 0.0241, - "mean_token_accuracy": 0.9928207635879517, - "num_tokens": 43899731.0, - "step": 1635 - }, - { - "epoch": 5.109204368174727, - "grad_norm": 0.19286616296038664, - "learning_rate": 3.312175418349683e-06, - "loss": 0.0235, - "mean_token_accuracy": 0.9928472936153412, - "num_tokens": 44035950.0, - "step": 1640 - }, - { - "epoch": 5.124804992199688, - "grad_norm": 0.13601247264012203, - "learning_rate": 3.254472013848817e-06, - "loss": 0.0241, - "mean_token_accuracy": 0.9925517261028289, - "num_tokens": 44169592.0, - "step": 1645 - }, - { - "epoch": 5.140405616224649, - "grad_norm": 0.12224126619272041, - "learning_rate": 3.196768609347952e-06, - "loss": 0.0239, - "mean_token_accuracy": 0.9928341090679169, - "num_tokens": 44303806.0, - "step": 1650 - }, - { - "epoch": 5.15600624024961, - "grad_norm": 0.1562685200560613, - "learning_rate": 3.139065204847086e-06, - "loss": 0.0235, - "mean_token_accuracy": 0.9929479837417603, - "num_tokens": 44441856.0, - "step": 1655 - }, - { - "epoch": 5.171606864274571, - "grad_norm": 0.1769130623237416, - "learning_rate": 3.081361800346221e-06, - "loss": 0.0243, - "mean_token_accuracy": 0.9923935830593109, - "num_tokens": 44574190.0, - "step": 1660 - }, - { - "epoch": 5.187207488299532, - "grad_norm": 0.13712712227031879, - "learning_rate": 3.0236583958453553e-06, - "loss": 0.0239, - "mean_token_accuracy": 0.9927611231803894, - "num_tokens": 44709889.0, - "step": 1665 - }, - { - "epoch": 5.202808112324493, - "grad_norm": 0.1270063308272785, - "learning_rate": 2.9659549913444895e-06, - "loss": 0.0241, - "mean_token_accuracy": 0.992726308107376, - "num_tokens": 44842799.0, - "step": 1670 - }, - { - "epoch": 5.218408736349454, - "grad_norm": 0.2437413697601591, - "learning_rate": 2.908251586843624e-06, - "loss": 0.0243, - "mean_token_accuracy": 0.9925644099712372, - "num_tokens": 44976216.0, - "step": 1675 - }, - { - "epoch": 5.234009360374415, - "grad_norm": 0.12921566402941945, - "learning_rate": 2.8505481823427584e-06, - "loss": 0.0244, - "mean_token_accuracy": 0.9923673212528229, - "num_tokens": 45108784.0, - "step": 1680 - }, - { - "epoch": 5.249609984399376, - "grad_norm": 0.13105411576678402, - "learning_rate": 2.7928447778418926e-06, - "loss": 0.0242, - "mean_token_accuracy": 0.9926429033279419, - "num_tokens": 45241655.0, - "step": 1685 - }, - { - "epoch": 5.265210608424337, - "grad_norm": 0.12608799588120717, - "learning_rate": 2.7351413733410277e-06, - "loss": 0.024, - "mean_token_accuracy": 0.9926675736904145, - "num_tokens": 45375891.0, - "step": 1690 - }, - { - "epoch": 5.280811232449298, - "grad_norm": 0.12064418233877248, - "learning_rate": 2.677437968840162e-06, - "loss": 0.0243, - "mean_token_accuracy": 0.992779678106308, - "num_tokens": 45507823.0, - "step": 1695 - }, - { - "epoch": 5.296411856474259, - "grad_norm": 0.17004625510690358, - "learning_rate": 2.619734564339296e-06, - "loss": 0.0245, - "mean_token_accuracy": 0.9925733089447022, - "num_tokens": 45639010.0, - "step": 1700 - }, - { - "epoch": 5.31201248049922, - "grad_norm": 0.13527413962954762, - "learning_rate": 2.5620311598384307e-06, - "loss": 0.0238, - "mean_token_accuracy": 0.9928280174732208, - "num_tokens": 45773707.0, - "step": 1705 - }, - { - "epoch": 5.327613104524181, - "grad_norm": 0.15724887606296575, - "learning_rate": 2.504327755337565e-06, - "loss": 0.0238, - "mean_token_accuracy": 0.992822802066803, - "num_tokens": 45909418.0, - "step": 1710 - }, - { - "epoch": 5.343213728549142, - "grad_norm": 0.12042487098433259, - "learning_rate": 2.4466243508366996e-06, - "loss": 0.0235, - "mean_token_accuracy": 0.9930182099342346, - "num_tokens": 46046532.0, - "step": 1715 - }, - { - "epoch": 5.358814352574103, - "grad_norm": 0.12420171711829726, - "learning_rate": 2.388920946335834e-06, - "loss": 0.0242, - "mean_token_accuracy": 0.9925850927829742, - "num_tokens": 46180373.0, - "step": 1720 - }, - { - "epoch": 5.374414976599064, - "grad_norm": 0.13681414490380384, - "learning_rate": 2.3312175418349685e-06, - "loss": 0.0231, - "mean_token_accuracy": 0.9930324614048004, - "num_tokens": 46318628.0, - "step": 1725 - }, - { - "epoch": 5.390015600624025, - "grad_norm": 0.16276711166895205, - "learning_rate": 2.273514137334103e-06, - "loss": 0.0245, - "mean_token_accuracy": 0.9926711559295655, - "num_tokens": 46449238.0, - "step": 1730 - }, - { - "epoch": 5.405616224648986, - "grad_norm": 0.15664237651413515, - "learning_rate": 2.2158107328332373e-06, - "loss": 0.024, - "mean_token_accuracy": 0.9928957402706147, - "num_tokens": 46583827.0, - "step": 1735 - }, - { - "epoch": 5.4212168486739465, - "grad_norm": 0.12142597261174552, - "learning_rate": 2.1581073283323715e-06, - "loss": 0.0245, - "mean_token_accuracy": 0.9924871027469635, - "num_tokens": 46715746.0, - "step": 1740 - }, - { - "epoch": 5.436817472698908, - "grad_norm": 0.15964824128276517, - "learning_rate": 2.100403923831506e-06, - "loss": 0.0236, - "mean_token_accuracy": 0.9927206993103027, - "num_tokens": 46852344.0, - "step": 1745 - }, - { - "epoch": 5.452418096723869, - "grad_norm": 0.15611306327636612, - "learning_rate": 2.042700519330641e-06, - "loss": 0.0244, - "mean_token_accuracy": 0.9925235331058502, - "num_tokens": 46983899.0, - "step": 1750 - }, - { - "epoch": 5.46801872074883, - "grad_norm": 0.14718402845371603, - "learning_rate": 1.984997114829775e-06, - "loss": 0.0233, - "mean_token_accuracy": 0.9928801476955413, - "num_tokens": 47121965.0, - "step": 1755 - }, - { - "epoch": 5.483619344773791, - "grad_norm": 0.17532809899956678, - "learning_rate": 1.9272937103289097e-06, - "loss": 0.0235, - "mean_token_accuracy": 0.9928483843803406, - "num_tokens": 47259573.0, - "step": 1760 - }, - { - "epoch": 5.4992199687987515, - "grad_norm": 0.15620539955228097, - "learning_rate": 1.869590305828044e-06, - "loss": 0.0239, - "mean_token_accuracy": 0.9927187144756318, - "num_tokens": 47393619.0, - "step": 1765 - }, - { - "epoch": 5.514820592823713, - "grad_norm": 0.15452994664407602, - "learning_rate": 1.8118869013271783e-06, - "loss": 0.0233, - "mean_token_accuracy": 0.9930829882621766, - "num_tokens": 47531932.0, - "step": 1770 - }, - { - "epoch": 5.530421216848674, - "grad_norm": 0.1609193936605019, - "learning_rate": 1.754183496826313e-06, - "loss": 0.0241, - "mean_token_accuracy": 0.9925650417804718, - "num_tokens": 47664935.0, - "step": 1775 - }, - { - "epoch": 5.546021840873635, - "grad_norm": 0.18069485411822495, - "learning_rate": 1.6964800923254474e-06, - "loss": 0.0237, - "mean_token_accuracy": 0.9928048968315124, - "num_tokens": 47801019.0, - "step": 1780 - }, - { - "epoch": 5.561622464898596, - "grad_norm": 0.14569556410470508, - "learning_rate": 1.6387766878245816e-06, - "loss": 0.0241, - "mean_token_accuracy": 0.9925419509410858, - "num_tokens": 47933966.0, - "step": 1785 - }, - { - "epoch": 5.577223088923557, - "grad_norm": 0.13725463666507065, - "learning_rate": 1.5810732833237163e-06, - "loss": 0.0237, - "mean_token_accuracy": 0.9926219820976258, - "num_tokens": 48069691.0, - "step": 1790 - }, - { - "epoch": 5.592823712948518, - "grad_norm": 0.15563435263990227, - "learning_rate": 1.5233698788228507e-06, - "loss": 0.0249, - "mean_token_accuracy": 0.99224454164505, - "num_tokens": 48198004.0, - "step": 1795 - }, - { - "epoch": 5.608424336973479, - "grad_norm": 0.19150391237914954, - "learning_rate": 1.4656664743219851e-06, - "loss": 0.0233, - "mean_token_accuracy": 0.9930429100990296, - "num_tokens": 48336169.0, - "step": 1800 - }, - { - "epoch": 5.62402496099844, - "grad_norm": 0.13266766596594196, - "learning_rate": 1.4079630698211198e-06, - "loss": 0.0239, - "mean_token_accuracy": 0.9926740467548371, - "num_tokens": 48471104.0, - "step": 1805 - }, - { - "epoch": 5.639625585023401, - "grad_norm": 0.1788362488762517, - "learning_rate": 1.350259665320254e-06, - "loss": 0.0243, - "mean_token_accuracy": 0.99261354804039, - "num_tokens": 48603882.0, - "step": 1810 - }, - { - "epoch": 5.655226209048362, - "grad_norm": 0.14065816169187795, - "learning_rate": 1.2925562608193884e-06, - "loss": 0.0247, - "mean_token_accuracy": 0.9923071384429931, - "num_tokens": 48733494.0, - "step": 1815 - }, - { - "epoch": 5.670826833073323, - "grad_norm": 0.17040381288127784, - "learning_rate": 1.2348528563185228e-06, - "loss": 0.0237, - "mean_token_accuracy": 0.9929697871208191, - "num_tokens": 48869301.0, - "step": 1820 - }, - { - "epoch": 5.686427457098284, - "grad_norm": 0.16017704059144489, - "learning_rate": 1.1771494518176575e-06, - "loss": 0.0238, - "mean_token_accuracy": 0.9927242577075959, - "num_tokens": 49003656.0, - "step": 1825 - }, - { - "epoch": 5.702028081123245, - "grad_norm": 0.11848368940519473, - "learning_rate": 1.1194460473167917e-06, - "loss": 0.0237, - "mean_token_accuracy": 0.9926847636699676, - "num_tokens": 49139812.0, - "step": 1830 - }, - { - "epoch": 5.717628705148206, - "grad_norm": 0.1517735016635918, - "learning_rate": 1.0617426428159263e-06, - "loss": 0.0237, - "mean_token_accuracy": 0.9928111970424652, - "num_tokens": 49273201.0, - "step": 1835 - }, - { - "epoch": 5.733229329173167, - "grad_norm": 0.18940074866811016, - "learning_rate": 1.0040392383150608e-06, - "loss": 0.0235, - "mean_token_accuracy": 0.9927483320236206, - "num_tokens": 49408994.0, - "step": 1840 - }, - { - "epoch": 5.748829953198128, - "grad_norm": 0.1516633870907396, - "learning_rate": 9.463358338141951e-07, - "loss": 0.0238, - "mean_token_accuracy": 0.9929431319236756, - "num_tokens": 49545733.0, - "step": 1845 - }, - { - "epoch": 5.764430577223089, - "grad_norm": 0.12054317524665162, - "learning_rate": 8.886324293133296e-07, - "loss": 0.0238, - "mean_token_accuracy": 0.992936760187149, - "num_tokens": 49680062.0, - "step": 1850 - }, - { - "epoch": 5.78003120124805, - "grad_norm": 0.1370174323773597, - "learning_rate": 8.30929024812464e-07, - "loss": 0.0238, - "mean_token_accuracy": 0.9928847312927246, - "num_tokens": 49814813.0, - "step": 1855 - }, - { - "epoch": 5.795631825273011, - "grad_norm": 0.14145943543200018, - "learning_rate": 7.732256203115985e-07, - "loss": 0.0235, - "mean_token_accuracy": 0.9928096294403076, - "num_tokens": 49951082.0, - "step": 1860 - }, - { - "epoch": 5.811232449297972, - "grad_norm": 0.26191584460732914, - "learning_rate": 7.155222158107329e-07, - "loss": 0.0237, - "mean_token_accuracy": 0.992737352848053, - "num_tokens": 50086639.0, - "step": 1865 - }, - { - "epoch": 5.826833073322933, - "grad_norm": 0.1321209476672431, - "learning_rate": 6.578188113098672e-07, - "loss": 0.0237, - "mean_token_accuracy": 0.9928857266902924, - "num_tokens": 50222788.0, - "step": 1870 - }, - { - "epoch": 5.842433697347894, - "grad_norm": 0.10316470824727872, - "learning_rate": 6.001154068090018e-07, - "loss": 0.0239, - "mean_token_accuracy": 0.9926657855510712, - "num_tokens": 50355976.0, - "step": 1875 - }, - { - "epoch": 5.858034321372855, - "grad_norm": 0.13714009102884916, - "learning_rate": 5.424120023081362e-07, - "loss": 0.0237, - "mean_token_accuracy": 0.992606920003891, - "num_tokens": 50491743.0, - "step": 1880 - }, - { - "epoch": 5.873634945397816, - "grad_norm": 0.16210992403980792, - "learning_rate": 4.847085978072707e-07, - "loss": 0.024, - "mean_token_accuracy": 0.9926069259643555, - "num_tokens": 50624599.0, - "step": 1885 - }, - { - "epoch": 5.889235569422777, - "grad_norm": 0.14443601980669765, - "learning_rate": 4.270051933064051e-07, - "loss": 0.0241, - "mean_token_accuracy": 0.99270578622818, - "num_tokens": 50758742.0, - "step": 1890 - }, - { - "epoch": 5.904836193447738, - "grad_norm": 0.1127113758113436, - "learning_rate": 3.6930178880553954e-07, - "loss": 0.024, - "mean_token_accuracy": 0.9925020098686218, - "num_tokens": 50891503.0, - "step": 1895 - }, - { - "epoch": 5.920436817472699, - "grad_norm": 0.13570504066067704, - "learning_rate": 3.11598384304674e-07, - "loss": 0.0244, - "mean_token_accuracy": 0.9926927983760834, - "num_tokens": 51022815.0, - "step": 1900 - }, - { - "epoch": 5.9360374414976596, - "grad_norm": 0.14715744807730147, - "learning_rate": 2.5389497980380845e-07, - "loss": 0.0232, - "mean_token_accuracy": 0.993007630109787, - "num_tokens": 51161194.0, - "step": 1905 - }, - { - "epoch": 5.951638065522621, - "grad_norm": 0.13374492992277562, - "learning_rate": 1.9619157530294288e-07, - "loss": 0.0235, - "mean_token_accuracy": 0.9928073644638061, - "num_tokens": 51297114.0, - "step": 1910 - }, - { - "epoch": 5.967238689547582, - "grad_norm": 0.14188936039069744, - "learning_rate": 1.3848817080207733e-07, - "loss": 0.0233, - "mean_token_accuracy": 0.9929638624191284, - "num_tokens": 51434451.0, - "step": 1915 - }, - { - "epoch": 5.982839313572543, - "grad_norm": 0.1212138838070634, - "learning_rate": 8.078476630121177e-08, - "loss": 0.0235, - "mean_token_accuracy": 0.9928582549095154, - "num_tokens": 51570117.0, - "step": 1920 - }, - { - "epoch": 5.998439937597504, - "grad_norm": 0.12760759914906278, - "learning_rate": 2.308136180034622e-08, - "loss": 0.024, - "mean_token_accuracy": 0.9928393125534057, - "num_tokens": 51703514.0, - "step": 1925 - }, - { - "epoch": 6.0, - "mean_token_accuracy": 0.9926674365997314, - "num_tokens": 51716911.0, - "step": 1926, - "total_flos": 309043048284160.0, - "train_loss": 0.11322491052526659, - "train_runtime": 3926.7142, - "train_samples_per_second": 31.316, - "train_steps_per_second": 0.49 + "epoch": 1.0, + "step": 640, + "total_flos": 1.1033250327691264e+16, + "train_loss": 0.6782059136778116, + "train_runtime": 78870.76, + "train_samples_per_second": 4.152, + "train_steps_per_second": 0.008 } ], "logging_steps": 5, - "max_steps": 1926, + "max_steps": 640, "num_input_tokens_seen": 0, - "num_train_epochs": 6, - "save_steps": 500, + "num_train_epochs": 1, + "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { @@ -3503,7 +1188,7 @@ "attributes": {} } }, - "total_flos": 309043048284160.0, + "total_flos": 1.1033250327691264e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null