{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007816316560820713, "grad_norm": 69.75188475573104, "learning_rate": 2.0000000000000003e-06, "loss": 2.5559, "mean_token_accuracy": 0.6205124616622925, "num_tokens": 5924597.0, "step": 5 }, { "epoch": 0.015632633121641426, "grad_norm": 37.890360906113195, "learning_rate": 4.5e-06, "loss": 2.096, "mean_token_accuracy": 0.6493684396147728, "num_tokens": 11853358.0, "step": 10 }, { "epoch": 0.02344894968246214, "grad_norm": 37.89921398442958, "learning_rate": 7e-06, "loss": 1.2923, "mean_token_accuracy": 0.699188905954361, "num_tokens": 17774953.0, "step": 15 }, { "epoch": 0.03126526624328285, "grad_norm": 3.9611680059042, "learning_rate": 9.5e-06, "loss": 1.0185, "mean_token_accuracy": 0.736442020535469, "num_tokens": 23743902.0, "step": 20 }, { "epoch": 0.039081582804103565, "grad_norm": 6.339716255577251, "learning_rate": 9.999075719055307e-06, "loss": 0.8959, "mean_token_accuracy": 0.7516884453594684, "num_tokens": 29670699.0, "step": 25 }, { "epoch": 0.04689789936492428, "grad_norm": 3.2460756840648544, "learning_rate": 9.995321478440751e-06, "loss": 0.8664, "mean_token_accuracy": 0.7534175351262092, "num_tokens": 35627815.0, "step": 30 }, { "epoch": 0.05471421592574499, "grad_norm": 3.4636834905707827, "learning_rate": 9.988681918400355e-06, "loss": 0.85, "mean_token_accuracy": 0.7533099494874478, "num_tokens": 41558055.0, "step": 35 }, { "epoch": 0.0625305324865657, "grad_norm": 2.995605141331059, "learning_rate": 9.9791613005318e-06, "loss": 0.8151, "mean_token_accuracy": 0.7614992260932922, "num_tokens": 47493638.0, "step": 40 }, { "epoch": 0.07034684904738642, "grad_norm": 3.1347703455639584, "learning_rate": 9.966765735638018e-06, "loss": 0.7855, "mean_token_accuracy": 0.7702661901712418, "num_tokens": 53415769.0, "step": 45 }, { "epoch": 0.07816316560820713, "grad_norm": 4.194109438368802, "learning_rate": 9.951503179804989e-06, "loss": 0.763, "mean_token_accuracy": 0.7751710690557957, "num_tokens": 59313984.0, "step": 50 }, { "epoch": 0.08597948216902784, "grad_norm": 2.5694956316167405, "learning_rate": 9.933383429295124e-06, "loss": 0.7461, "mean_token_accuracy": 0.7756987683475017, "num_tokens": 65218898.0, "step": 55 }, { "epoch": 0.09379579872984856, "grad_norm": 2.4022434258446457, "learning_rate": 9.912418114259548e-06, "loss": 0.7305, "mean_token_accuracy": 0.7788987644016743, "num_tokens": 71140560.0, "step": 60 }, { "epoch": 0.10161211529066927, "grad_norm": 2.0354431025023314, "learning_rate": 9.888620691273284e-06, "loss": 0.7078, "mean_token_accuracy": 0.7819231644272804, "num_tokens": 77078478.0, "step": 65 }, { "epoch": 0.10942843185148998, "grad_norm": 1.88497692389339, "learning_rate": 9.862006434698169e-06, "loss": 0.6963, "mean_token_accuracy": 0.7847778849303723, "num_tokens": 82996564.0, "step": 70 }, { "epoch": 0.1172447484123107, "grad_norm": 1.9200062767141564, "learning_rate": 9.832592426879006e-06, "loss": 0.688, "mean_token_accuracy": 0.7845040634274483, "num_tokens": 88951976.0, "step": 75 }, { "epoch": 0.1250610649731314, "grad_norm": 2.210483530160085, "learning_rate": 9.800397547179276e-06, "loss": 0.6829, "mean_token_accuracy": 0.7839573793113231, "num_tokens": 94895652.0, "step": 80 }, { "epoch": 0.13287738153395212, "grad_norm": 2.0791394687934845, "learning_rate": 9.765442459863428e-06, "loss": 0.682, "mean_token_accuracy": 0.7845308348536492, "num_tokens": 100832079.0, "step": 85 }, { "epoch": 0.14069369809477283, "grad_norm": 2.148171577637037, "learning_rate": 9.72774960083353e-06, "loss": 0.6906, "mean_token_accuracy": 0.7805382929742336, "num_tokens": 106787613.0, "step": 90 }, { "epoch": 0.14851001465559355, "grad_norm": 2.0898108754881326, "learning_rate": 9.687343163228806e-06, "loss": 0.6684, "mean_token_accuracy": 0.7865298599004745, "num_tokens": 112717599.0, "step": 95 }, { "epoch": 0.15632633121641426, "grad_norm": 4.0822591130362955, "learning_rate": 9.644249081897277e-06, "loss": 0.6648, "mean_token_accuracy": 0.7872977338731288, "num_tokens": 118650442.0, "step": 100 }, { "epoch": 0.16414264777723497, "grad_norm": 2.441738444276975, "learning_rate": 9.598495016749493e-06, "loss": 0.6689, "mean_token_accuracy": 0.7859964817762375, "num_tokens": 124574766.0, "step": 105 }, { "epoch": 0.1719589643380557, "grad_norm": 2.075995258307634, "learning_rate": 9.55011033500505e-06, "loss": 0.6605, "mean_token_accuracy": 0.787811417132616, "num_tokens": 130516200.0, "step": 110 }, { "epoch": 0.1797752808988764, "grad_norm": 2.609489028108713, "learning_rate": 9.499126092343237e-06, "loss": 0.661, "mean_token_accuracy": 0.7875787198543549, "num_tokens": 136457829.0, "step": 115 }, { "epoch": 0.1875915974596971, "grad_norm": 3.7440429066647734, "learning_rate": 9.445575012969977e-06, "loss": 0.6709, "mean_token_accuracy": 0.7853954270482063, "num_tokens": 142385131.0, "step": 120 }, { "epoch": 0.19540791402051783, "grad_norm": 2.8751982634480044, "learning_rate": 9.38949146861382e-06, "loss": 0.663, "mean_token_accuracy": 0.7872662946581841, "num_tokens": 148289898.0, "step": 125 }, { "epoch": 0.20322423058133854, "grad_norm": 2.925083855527294, "learning_rate": 9.33091145646446e-06, "loss": 0.6486, "mean_token_accuracy": 0.7910164006054401, "num_tokens": 154232386.0, "step": 130 }, { "epoch": 0.21104054714215925, "grad_norm": 3.73974121507644, "learning_rate": 9.26987257606797e-06, "loss": 0.653, "mean_token_accuracy": 0.7898711428046227, "num_tokens": 160150772.0, "step": 135 }, { "epoch": 0.21885686370297996, "grad_norm": 2.074910644810112, "learning_rate": 9.206414005193539e-06, "loss": 0.6564, "mean_token_accuracy": 0.7886676676571369, "num_tokens": 166088160.0, "step": 140 }, { "epoch": 0.22667318026380068, "grad_norm": 3.765010702267194, "learning_rate": 9.140576474687263e-06, "loss": 0.665, "mean_token_accuracy": 0.7880136586725712, "num_tokens": 172032929.0, "step": 145 }, { "epoch": 0.2344894968246214, "grad_norm": 2.3920249014704944, "learning_rate": 9.072402242329067e-06, "loss": 0.6503, "mean_token_accuracy": 0.7899810753762722, "num_tokens": 178005406.0, "step": 150 }, { "epoch": 0.2423058133854421, "grad_norm": 4.535876880687944, "learning_rate": 9.001935065709569e-06, "loss": 0.6427, "mean_token_accuracy": 0.7929855234920978, "num_tokens": 183927144.0, "step": 155 }, { "epoch": 0.2501221299462628, "grad_norm": 3.44122188439969, "learning_rate": 8.929220174144304e-06, "loss": 0.6489, "mean_token_accuracy": 0.7914013616740704, "num_tokens": 189848723.0, "step": 160 }, { "epoch": 0.25793844650708353, "grad_norm": 2.2378659722902188, "learning_rate": 8.85430423964332e-06, "loss": 0.6386, "mean_token_accuracy": 0.792429718375206, "num_tokens": 195777877.0, "step": 165 }, { "epoch": 0.26575476306790424, "grad_norm": 2.982598937419653, "learning_rate": 8.777235346954753e-06, "loss": 0.649, "mean_token_accuracy": 0.790771734714508, "num_tokens": 201713378.0, "step": 170 }, { "epoch": 0.27357107962872496, "grad_norm": 1.9062511611198192, "learning_rate": 8.698062962701691e-06, "loss": 0.652, "mean_token_accuracy": 0.790651909261942, "num_tokens": 207661581.0, "step": 175 }, { "epoch": 0.28138739618954567, "grad_norm": 2.7700238166088877, "learning_rate": 8.616837903632026e-06, "loss": 0.6438, "mean_token_accuracy": 0.7914554052054882, "num_tokens": 213597723.0, "step": 180 }, { "epoch": 0.2892037127503664, "grad_norm": 4.298205320427554, "learning_rate": 8.533612304001763e-06, "loss": 0.6569, "mean_token_accuracy": 0.7872735880315304, "num_tokens": 219543570.0, "step": 185 }, { "epoch": 0.2970200293111871, "grad_norm": 2.62809039414661, "learning_rate": 8.44843958211269e-06, "loss": 0.646, "mean_token_accuracy": 0.7901849329471589, "num_tokens": 225490076.0, "step": 190 }, { "epoch": 0.3048363458720078, "grad_norm": 2.540505165257454, "learning_rate": 8.361374406025853e-06, "loss": 0.6452, "mean_token_accuracy": 0.7908910043537617, "num_tokens": 231429929.0, "step": 195 }, { "epoch": 0.3126526624328285, "grad_norm": 1.763996580236139, "learning_rate": 8.272472658472906e-06, "loss": 0.6529, "mean_token_accuracy": 0.7878620237112045, "num_tokens": 237384555.0, "step": 200 }, { "epoch": 0.32046897899364923, "grad_norm": 1.6233120878672673, "learning_rate": 8.181791400987807e-06, "loss": 0.6343, "mean_token_accuracy": 0.7936271652579308, "num_tokens": 243313192.0, "step": 205 }, { "epoch": 0.32828529555446995, "grad_norm": 1.9405753464802942, "learning_rate": 8.089388837281915e-06, "loss": 0.6439, "mean_token_accuracy": 0.7914594881236553, "num_tokens": 249220870.0, "step": 210 }, { "epoch": 0.33610161211529066, "grad_norm": 3.488125244229796, "learning_rate": 7.995324275885961e-06, "loss": 0.6351, "mean_token_accuracy": 0.7946518436074257, "num_tokens": 255166697.0, "step": 215 }, { "epoch": 0.3439179286761114, "grad_norm": 8.607586639921637, "learning_rate": 7.89965809208291e-06, "loss": 0.6426, "mean_token_accuracy": 0.7916349656879902, "num_tokens": 261089161.0, "step": 220 }, { "epoch": 0.3517342452369321, "grad_norm": 1.6660014877602962, "learning_rate": 7.802451689156122e-06, "loss": 0.6481, "mean_token_accuracy": 0.79035182595253, "num_tokens": 267021428.0, "step": 225 }, { "epoch": 0.3595505617977528, "grad_norm": 1.6153459083711095, "learning_rate": 7.70376745897768e-06, "loss": 0.6414, "mean_token_accuracy": 0.7924111239612103, "num_tokens": 272961221.0, "step": 230 }, { "epoch": 0.3673668783585735, "grad_norm": 1.7376528456449458, "learning_rate": 7.6036687419622215e-06, "loss": 0.6359, "mean_token_accuracy": 0.7931422784924507, "num_tokens": 278866458.0, "step": 235 }, { "epoch": 0.3751831949193942, "grad_norm": 1.7509363427645832, "learning_rate": 7.5022197864119175e-06, "loss": 0.6455, "mean_token_accuracy": 0.7915025249123573, "num_tokens": 284785499.0, "step": 240 }, { "epoch": 0.38299951148021494, "grad_norm": 2.863849300726178, "learning_rate": 7.399485707278744e-06, "loss": 0.6478, "mean_token_accuracy": 0.7907331958413124, "num_tokens": 290732280.0, "step": 245 }, { "epoch": 0.39081582804103565, "grad_norm": 1.4846518170955887, "learning_rate": 7.295532444370485e-06, "loss": 0.6451, "mean_token_accuracy": 0.7914663501083851, "num_tokens": 296654941.0, "step": 250 }, { "epoch": 0.39863214460185636, "grad_norm": 1.7383678766587618, "learning_rate": 7.190426720027306e-06, "loss": 0.644, "mean_token_accuracy": 0.7916645854711533, "num_tokens": 302605292.0, "step": 255 }, { "epoch": 0.4064484611626771, "grad_norm": 2.2690859031654087, "learning_rate": 7.084235996296068e-06, "loss": 0.6409, "mean_token_accuracy": 0.792822826653719, "num_tokens": 308552365.0, "step": 260 }, { "epoch": 0.4142647777234978, "grad_norm": 1.6536907812645472, "learning_rate": 6.977028431629839e-06, "loss": 0.6418, "mean_token_accuracy": 0.7921351306140423, "num_tokens": 314489877.0, "step": 265 }, { "epoch": 0.4220810942843185, "grad_norm": 1.6417211396491032, "learning_rate": 6.86887283714044e-06, "loss": 0.6376, "mean_token_accuracy": 0.7937514387071133, "num_tokens": 320414279.0, "step": 270 }, { "epoch": 0.4298974108451392, "grad_norm": 2.1351664741224963, "learning_rate": 6.7598386324320745e-06, "loss": 0.6298, "mean_token_accuracy": 0.7947640925645828, "num_tokens": 326349818.0, "step": 275 }, { "epoch": 0.43771372740595993, "grad_norm": 1.4007468087271036, "learning_rate": 6.649995801044391e-06, "loss": 0.6414, "mean_token_accuracy": 0.7926677256822586, "num_tokens": 332276019.0, "step": 280 }, { "epoch": 0.44553004396678064, "grad_norm": 1.6121431265331962, "learning_rate": 6.539414845533596e-06, "loss": 0.6393, "mean_token_accuracy": 0.7921099595725536, "num_tokens": 338209339.0, "step": 285 }, { "epoch": 0.45334636052760136, "grad_norm": 2.031155969828718, "learning_rate": 6.428166742220423e-06, "loss": 0.625, "mean_token_accuracy": 0.7955845050513745, "num_tokens": 344137484.0, "step": 290 }, { "epoch": 0.46116267708842207, "grad_norm": 1.4450498950991713, "learning_rate": 6.316322895634029e-06, "loss": 0.6374, "mean_token_accuracy": 0.7931911982595921, "num_tokens": 350078203.0, "step": 295 }, { "epoch": 0.4689789936492428, "grad_norm": 1.9913252644164074, "learning_rate": 6.20395509268104e-06, "loss": 0.6214, "mean_token_accuracy": 0.7970752798020839, "num_tokens": 356025763.0, "step": 300 }, { "epoch": 0.4767953102100635, "grad_norm": 2.8315822034325837, "learning_rate": 6.0911354565691594e-06, "loss": 0.6304, "mean_token_accuracy": 0.796255373954773, "num_tokens": 361992798.0, "step": 305 }, { "epoch": 0.4846116267708842, "grad_norm": 1.8374439736436459, "learning_rate": 5.977936400514943e-06, "loss": 0.6307, "mean_token_accuracy": 0.7953431971371174, "num_tokens": 367913461.0, "step": 310 }, { "epoch": 0.4924279433317049, "grad_norm": 2.3033335141428, "learning_rate": 5.864430581265406e-06, "loss": 0.6356, "mean_token_accuracy": 0.7944584995508194, "num_tokens": 373852019.0, "step": 315 }, { "epoch": 0.5002442598925256, "grad_norm": 1.6907176529892718, "learning_rate": 5.750690852463339e-06, "loss": 0.6347, "mean_token_accuracy": 0.7937369205057621, "num_tokens": 379764522.0, "step": 320 }, { "epoch": 0.5080605764533463, "grad_norm": 1.818966581747677, "learning_rate": 5.636790217886243e-06, "loss": 0.6253, "mean_token_accuracy": 0.7948664158582688, "num_tokens": 385692482.0, "step": 325 }, { "epoch": 0.5158768930141671, "grad_norm": 1.6206125873755466, "learning_rate": 5.522801784588895e-06, "loss": 0.631, "mean_token_accuracy": 0.7939370617270469, "num_tokens": 391635856.0, "step": 330 }, { "epoch": 0.5236932095749878, "grad_norm": 3.3420400728114945, "learning_rate": 5.408798715979626e-06, "loss": 0.6341, "mean_token_accuracy": 0.7946567349135876, "num_tokens": 397545573.0, "step": 335 }, { "epoch": 0.5315095261358085, "grad_norm": 4.102411753363632, "learning_rate": 5.294854184860437e-06, "loss": 0.6268, "mean_token_accuracy": 0.7956276901066304, "num_tokens": 403475982.0, "step": 340 }, { "epoch": 0.5393258426966292, "grad_norm": 3.1767409652573853, "learning_rate": 5.1810413264610724e-06, "loss": 0.6276, "mean_token_accuracy": 0.7952337145805359, "num_tokens": 409418002.0, "step": 345 }, { "epoch": 0.5471421592574499, "grad_norm": 3.166395119010415, "learning_rate": 5.067433191497221e-06, "loss": 0.6322, "mean_token_accuracy": 0.7954030476510525, "num_tokens": 415344226.0, "step": 350 }, { "epoch": 0.5549584758182706, "grad_norm": 1.526943049343455, "learning_rate": 4.954102699282953e-06, "loss": 0.6359, "mean_token_accuracy": 0.7941608227789402, "num_tokens": 421277466.0, "step": 355 }, { "epoch": 0.5627747923790913, "grad_norm": 1.9099044626436186, "learning_rate": 4.841122590927511e-06, "loss": 0.618, "mean_token_accuracy": 0.7975563704967499, "num_tokens": 427241607.0, "step": 360 }, { "epoch": 0.570591108939912, "grad_norm": 2.5623011943198137, "learning_rate": 4.7285653826464605e-06, "loss": 0.6272, "mean_token_accuracy": 0.7963444076478481, "num_tokens": 433157588.0, "step": 365 }, { "epoch": 0.5784074255007328, "grad_norm": 1.6826723465337794, "learning_rate": 4.616503319217202e-06, "loss": 0.6205, "mean_token_accuracy": 0.7979116909205913, "num_tokens": 439093218.0, "step": 370 }, { "epoch": 0.5862237420615535, "grad_norm": 2.4233422176625905, "learning_rate": 4.5050083276087155e-06, "loss": 0.6371, "mean_token_accuracy": 0.7943588711321354, "num_tokens": 445010423.0, "step": 375 }, { "epoch": 0.5940400586223742, "grad_norm": 1.8195780340101564, "learning_rate": 4.394151970815259e-06, "loss": 0.613, "mean_token_accuracy": 0.799777788668871, "num_tokens": 450918292.0, "step": 380 }, { "epoch": 0.6018563751831949, "grad_norm": 1.8194352528770072, "learning_rate": 4.284005401923723e-06, "loss": 0.6225, "mean_token_accuracy": 0.7965258292853832, "num_tokens": 456832151.0, "step": 385 }, { "epoch": 0.6096726917440156, "grad_norm": 1.7307705633009496, "learning_rate": 4.174639318444044e-06, "loss": 0.6191, "mean_token_accuracy": 0.7983451545238495, "num_tokens": 462764585.0, "step": 390 }, { "epoch": 0.6174890083048363, "grad_norm": 1.5492850018471778, "learning_rate": 4.066123916932069e-06, "loss": 0.6232, "mean_token_accuracy": 0.7965681925415993, "num_tokens": 468701502.0, "step": 395 }, { "epoch": 0.625305324865657, "grad_norm": 1.5785740475386758, "learning_rate": 3.95852884793392e-06, "loss": 0.639, "mean_token_accuracy": 0.7930570214986801, "num_tokens": 474663601.0, "step": 400 }, { "epoch": 0.6331216414264778, "grad_norm": 1.7178497128631158, "learning_rate": 3.851923171280848e-06, "loss": 0.631, "mean_token_accuracy": 0.7956325292587281, "num_tokens": 480597092.0, "step": 405 }, { "epoch": 0.6409379579872985, "grad_norm": 2.2284125586269634, "learning_rate": 3.7463753117632086e-06, "loss": 0.6194, "mean_token_accuracy": 0.7979160696268082, "num_tokens": 486517715.0, "step": 410 }, { "epoch": 0.6487542745481192, "grad_norm": 1.6485020103488872, "learning_rate": 3.6419530152120585e-06, "loss": 0.6155, "mean_token_accuracy": 0.7989446625113488, "num_tokens": 492472305.0, "step": 415 }, { "epoch": 0.6565705911089399, "grad_norm": 1.6583017170770122, "learning_rate": 3.5387233050165305e-06, "loss": 0.6154, "mean_token_accuracy": 0.7981764920055866, "num_tokens": 498385685.0, "step": 420 }, { "epoch": 0.6643869076697606, "grad_norm": 1.7340707542937976, "learning_rate": 3.436752439104914e-06, "loss": 0.6232, "mean_token_accuracy": 0.7975495472550392, "num_tokens": 504307250.0, "step": 425 }, { "epoch": 0.6722032242305813, "grad_norm": 1.7732002119177575, "learning_rate": 3.336105867417036e-06, "loss": 0.6136, "mean_token_accuracy": 0.7990594677627086, "num_tokens": 510245141.0, "step": 430 }, { "epoch": 0.680019540791402, "grad_norm": 1.7788709835674736, "learning_rate": 3.236848189895271e-06, "loss": 0.6221, "mean_token_accuracy": 0.7987750940024853, "num_tokens": 516171739.0, "step": 435 }, { "epoch": 0.6878358573522227, "grad_norm": 1.8951328728941093, "learning_rate": 3.1390431150210858e-06, "loss": 0.6216, "mean_token_accuracy": 0.7972325548529625, "num_tokens": 522101344.0, "step": 440 }, { "epoch": 0.6956521739130435, "grad_norm": 2.265125941545771, "learning_rate": 3.0427534189238056e-06, "loss": 0.6272, "mean_token_accuracy": 0.797095137834549, "num_tokens": 528042612.0, "step": 445 }, { "epoch": 0.7034684904738642, "grad_norm": 1.895267259607391, "learning_rate": 2.9480409050877836e-06, "loss": 0.6146, "mean_token_accuracy": 0.7996291488409042, "num_tokens": 533972183.0, "step": 450 }, { "epoch": 0.7112848070346849, "grad_norm": 2.1073222786880508, "learning_rate": 2.854966364683872e-06, "loss": 0.6066, "mean_token_accuracy": 0.8017579860985279, "num_tokens": 539882836.0, "step": 455 }, { "epoch": 0.7191011235955056, "grad_norm": 1.586483401475392, "learning_rate": 2.7635895375506516e-06, "loss": 0.6218, "mean_token_accuracy": 0.79697345495224, "num_tokens": 545834579.0, "step": 460 }, { "epoch": 0.7269174401563263, "grad_norm": 1.678175813746661, "learning_rate": 2.6739690738504428e-06, "loss": 0.6218, "mean_token_accuracy": 0.7964953184127808, "num_tokens": 551762612.0, "step": 465 }, { "epoch": 0.734733756717147, "grad_norm": 4.728635128027955, "learning_rate": 2.5861624964247402e-06, "loss": 0.6129, "mean_token_accuracy": 0.7991872586309909, "num_tokens": 557733732.0, "step": 470 }, { "epoch": 0.7425500732779677, "grad_norm": 3.1433754367730327, "learning_rate": 2.5002261638732066e-06, "loss": 0.6259, "mean_token_accuracy": 0.796341958642006, "num_tokens": 563678120.0, "step": 475 }, { "epoch": 0.7503663898387885, "grad_norm": 1.5793999144249435, "learning_rate": 2.416215234379941e-06, "loss": 0.6171, "mean_token_accuracy": 0.7986149400472641, "num_tokens": 569641651.0, "step": 480 }, { "epoch": 0.7581827063996092, "grad_norm": 1.5637633654884855, "learning_rate": 2.3341836303102336e-06, "loss": 0.6168, "mean_token_accuracy": 0.7989203184843063, "num_tokens": 575596629.0, "step": 485 }, { "epoch": 0.7659990229604299, "grad_norm": 1.7578654658273658, "learning_rate": 2.2541840036005227e-06, "loss": 0.6152, "mean_token_accuracy": 0.8000254578888416, "num_tokens": 581506425.0, "step": 490 }, { "epoch": 0.7738153395212506, "grad_norm": 1.6111848358937764, "learning_rate": 2.1762677019637836e-06, "loss": 0.615, "mean_token_accuracy": 0.7983125224709511, "num_tokens": 587441928.0, "step": 495 }, { "epoch": 0.7816316560820713, "grad_norm": 2.729056351863656, "learning_rate": 2.100484735932027e-06, "loss": 0.6219, "mean_token_accuracy": 0.7974658064544201, "num_tokens": 593387525.0, "step": 500 }, { "epoch": 0.789447972642892, "grad_norm": 2.0026132736871256, "learning_rate": 2.0268837467570714e-06, "loss": 0.6113, "mean_token_accuracy": 0.7999734558165074, "num_tokens": 599357871.0, "step": 505 }, { "epoch": 0.7972642892037127, "grad_norm": 3.202086545883362, "learning_rate": 1.955511975190185e-06, "loss": 0.6167, "mean_token_accuracy": 0.7985524848103523, "num_tokens": 605266801.0, "step": 510 }, { "epoch": 0.8050806057645334, "grad_norm": 2.5845078387849876, "learning_rate": 1.8864152311606342e-06, "loss": 0.6184, "mean_token_accuracy": 0.7977212890982628, "num_tokens": 611200207.0, "step": 515 }, { "epoch": 0.8128969223253542, "grad_norm": 1.3459195379746025, "learning_rate": 1.8196378643726092e-06, "loss": 0.6141, "mean_token_accuracy": 0.7991946995258331, "num_tokens": 617152907.0, "step": 520 }, { "epoch": 0.8207132388861749, "grad_norm": 1.6903121443946976, "learning_rate": 1.7552227358393933e-06, "loss": 0.6166, "mean_token_accuracy": 0.7996014229953289, "num_tokens": 623085161.0, "step": 525 }, { "epoch": 0.8285295554469956, "grad_norm": 5.000507187078701, "learning_rate": 1.6932111903730453e-06, "loss": 0.6106, "mean_token_accuracy": 0.7996949210762978, "num_tokens": 629027147.0, "step": 530 }, { "epoch": 0.8363458720078163, "grad_norm": 2.026922156270765, "learning_rate": 1.6336430300472606e-06, "loss": 0.6045, "mean_token_accuracy": 0.800903269648552, "num_tokens": 634987448.0, "step": 535 }, { "epoch": 0.844162188568637, "grad_norm": 4.687498745163573, "learning_rate": 1.576556488650428e-06, "loss": 0.6128, "mean_token_accuracy": 0.7995632983744144, "num_tokens": 640920908.0, "step": 540 }, { "epoch": 0.8519785051294577, "grad_norm": 1.7305252046419992, "learning_rate": 1.5219882071452967e-06, "loss": 0.6059, "mean_token_accuracy": 0.8014576397836208, "num_tokens": 646854354.0, "step": 545 }, { "epoch": 0.8597948216902784, "grad_norm": 1.5220809585740145, "learning_rate": 1.4699732101510026e-06, "loss": 0.6098, "mean_token_accuracy": 0.7980151705443859, "num_tokens": 652785154.0, "step": 550 }, { "epoch": 0.8676111382510991, "grad_norm": 2.3863927864336896, "learning_rate": 1.4205448834625275e-06, "loss": 0.6174, "mean_token_accuracy": 0.7987048149108886, "num_tokens": 658699371.0, "step": 555 }, { "epoch": 0.8754274548119199, "grad_norm": 4.490834509368819, "learning_rate": 1.37373495262205e-06, "loss": 0.6176, "mean_token_accuracy": 0.7986162424087524, "num_tokens": 664648725.0, "step": 560 }, { "epoch": 0.8832437713727406, "grad_norm": 1.8330616045846946, "learning_rate": 1.3295734625559315e-06, "loss": 0.6157, "mean_token_accuracy": 0.798362709581852, "num_tokens": 670559073.0, "step": 565 }, { "epoch": 0.8910600879335613, "grad_norm": 1.8866171760203552, "learning_rate": 1.2880887582903884e-06, "loss": 0.6163, "mean_token_accuracy": 0.7984154649078846, "num_tokens": 676506420.0, "step": 570 }, { "epoch": 0.898876404494382, "grad_norm": 1.7452455628625234, "learning_rate": 1.2493074667582584e-06, "loss": 0.621, "mean_token_accuracy": 0.7975537806749344, "num_tokens": 682448693.0, "step": 575 }, { "epoch": 0.9066927210552027, "grad_norm": 2.030867984009538, "learning_rate": 1.213254479708519e-06, "loss": 0.6163, "mean_token_accuracy": 0.7988204933702946, "num_tokens": 688390022.0, "step": 580 }, { "epoch": 0.9145090376160234, "grad_norm": 1.550865659075602, "learning_rate": 1.179952937729534e-06, "loss": 0.6193, "mean_token_accuracy": 0.7983230344951153, "num_tokens": 694331611.0, "step": 585 }, { "epoch": 0.9223253541768441, "grad_norm": 1.571410028230591, "learning_rate": 1.149424215396281e-06, "loss": 0.6136, "mean_token_accuracy": 0.7990704528987408, "num_tokens": 700261331.0, "step": 590 }, { "epoch": 0.9301416707376648, "grad_norm": 2.163421002394794, "learning_rate": 1.1216879075510877e-06, "loss": 0.6141, "mean_token_accuracy": 0.7987876988947391, "num_tokens": 706193693.0, "step": 595 }, { "epoch": 0.9379579872984856, "grad_norm": 3.7294851958982975, "learning_rate": 1.0967618167267032e-06, "loss": 0.5997, "mean_token_accuracy": 0.8031502008438111, "num_tokens": 712120509.0, "step": 600 }, { "epoch": 0.9457743038593063, "grad_norm": 1.937019840071183, "learning_rate": 1.0746619417197436e-06, "loss": 0.6079, "mean_token_accuracy": 0.8023772671818733, "num_tokens": 718044202.0, "step": 605 }, { "epoch": 0.953590620420127, "grad_norm": 1.822048015248325, "learning_rate": 1.0554024673218808e-06, "loss": 0.6102, "mean_token_accuracy": 0.8010022938251495, "num_tokens": 723993614.0, "step": 610 }, { "epoch": 0.9614069369809477, "grad_norm": 1.6712441013840658, "learning_rate": 1.0389957552153385e-06, "loss": 0.6053, "mean_token_accuracy": 0.8014584824442863, "num_tokens": 729959466.0, "step": 615 }, { "epoch": 0.9692232535417684, "grad_norm": 3.2548443190376073, "learning_rate": 1.0254523360385555e-06, "loss": 0.6146, "mean_token_accuracy": 0.7999479919672012, "num_tokens": 735885753.0, "step": 620 }, { "epoch": 0.9770395701025891, "grad_norm": 7.139737923902445, "learning_rate": 1.0147809026271017e-06, "loss": 0.6145, "mean_token_accuracy": 0.7991539172828197, "num_tokens": 741813482.0, "step": 625 }, { "epoch": 0.9848558866634098, "grad_norm": 1.7874591260763464, "learning_rate": 1.0069883044341846e-06, "loss": 0.6142, "mean_token_accuracy": 0.7999210134148598, "num_tokens": 747741960.0, "step": 630 }, { "epoch": 0.9926722032242306, "grad_norm": 3.2850538471833945, "learning_rate": 1.0020795431343349e-06, "loss": 0.6074, "mean_token_accuracy": 0.8015001997351646, "num_tokens": 753638158.0, "step": 635 }, { "epoch": 1.0, "grad_norm": 2.3599969643790777, "learning_rate": 1.0000577694130827e-06, "loss": 0.5746, "mean_token_accuracy": 0.8004830511411031, "num_tokens": 759191155.0, "step": 640 }, { "epoch": 1.0, "step": 640, "total_flos": 1.1033250327691264e+16, "train_loss": 0.6782059136778116, "train_runtime": 78870.76, "train_samples_per_second": 4.152, "train_steps_per_second": 0.008 } ], "logging_steps": 5, "max_steps": 640, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1033250327691264e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }