|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007816316560820713, |
|
"grad_norm": 69.75188475573104, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.5559, |
|
"mean_token_accuracy": 0.6205124616622925, |
|
"num_tokens": 5924597.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015632633121641426, |
|
"grad_norm": 37.890360906113195, |
|
"learning_rate": 4.5e-06, |
|
"loss": 2.096, |
|
"mean_token_accuracy": 0.6493684396147728, |
|
"num_tokens": 11853358.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02344894968246214, |
|
"grad_norm": 37.89921398442958, |
|
"learning_rate": 7e-06, |
|
"loss": 1.2923, |
|
"mean_token_accuracy": 0.699188905954361, |
|
"num_tokens": 17774953.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03126526624328285, |
|
"grad_norm": 3.9611680059042, |
|
"learning_rate": 9.5e-06, |
|
"loss": 1.0185, |
|
"mean_token_accuracy": 0.736442020535469, |
|
"num_tokens": 23743902.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.039081582804103565, |
|
"grad_norm": 6.339716255577251, |
|
"learning_rate": 9.999075719055307e-06, |
|
"loss": 0.8959, |
|
"mean_token_accuracy": 0.7516884453594684, |
|
"num_tokens": 29670699.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04689789936492428, |
|
"grad_norm": 3.2460756840648544, |
|
"learning_rate": 9.995321478440751e-06, |
|
"loss": 0.8664, |
|
"mean_token_accuracy": 0.7534175351262092, |
|
"num_tokens": 35627815.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05471421592574499, |
|
"grad_norm": 3.4636834905707827, |
|
"learning_rate": 9.988681918400355e-06, |
|
"loss": 0.85, |
|
"mean_token_accuracy": 0.7533099494874478, |
|
"num_tokens": 41558055.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0625305324865657, |
|
"grad_norm": 2.995605141331059, |
|
"learning_rate": 9.9791613005318e-06, |
|
"loss": 0.8151, |
|
"mean_token_accuracy": 0.7614992260932922, |
|
"num_tokens": 47493638.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07034684904738642, |
|
"grad_norm": 3.1347703455639584, |
|
"learning_rate": 9.966765735638018e-06, |
|
"loss": 0.7855, |
|
"mean_token_accuracy": 0.7702661901712418, |
|
"num_tokens": 53415769.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07816316560820713, |
|
"grad_norm": 4.194109438368802, |
|
"learning_rate": 9.951503179804989e-06, |
|
"loss": 0.763, |
|
"mean_token_accuracy": 0.7751710690557957, |
|
"num_tokens": 59313984.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08597948216902784, |
|
"grad_norm": 2.5694956316167405, |
|
"learning_rate": 9.933383429295124e-06, |
|
"loss": 0.7461, |
|
"mean_token_accuracy": 0.7756987683475017, |
|
"num_tokens": 65218898.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.09379579872984856, |
|
"grad_norm": 2.4022434258446457, |
|
"learning_rate": 9.912418114259548e-06, |
|
"loss": 0.7305, |
|
"mean_token_accuracy": 0.7788987644016743, |
|
"num_tokens": 71140560.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10161211529066927, |
|
"grad_norm": 2.0354431025023314, |
|
"learning_rate": 9.888620691273284e-06, |
|
"loss": 0.7078, |
|
"mean_token_accuracy": 0.7819231644272804, |
|
"num_tokens": 77078478.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.10942843185148998, |
|
"grad_norm": 1.88497692389339, |
|
"learning_rate": 9.862006434698169e-06, |
|
"loss": 0.6963, |
|
"mean_token_accuracy": 0.7847778849303723, |
|
"num_tokens": 82996564.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1172447484123107, |
|
"grad_norm": 1.9200062767141564, |
|
"learning_rate": 9.832592426879006e-06, |
|
"loss": 0.688, |
|
"mean_token_accuracy": 0.7845040634274483, |
|
"num_tokens": 88951976.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1250610649731314, |
|
"grad_norm": 2.210483530160085, |
|
"learning_rate": 9.800397547179276e-06, |
|
"loss": 0.6829, |
|
"mean_token_accuracy": 0.7839573793113231, |
|
"num_tokens": 94895652.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13287738153395212, |
|
"grad_norm": 2.0791394687934845, |
|
"learning_rate": 9.765442459863428e-06, |
|
"loss": 0.682, |
|
"mean_token_accuracy": 0.7845308348536492, |
|
"num_tokens": 100832079.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.14069369809477283, |
|
"grad_norm": 2.148171577637037, |
|
"learning_rate": 9.72774960083353e-06, |
|
"loss": 0.6906, |
|
"mean_token_accuracy": 0.7805382929742336, |
|
"num_tokens": 106787613.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14851001465559355, |
|
"grad_norm": 2.0898108754881326, |
|
"learning_rate": 9.687343163228806e-06, |
|
"loss": 0.6684, |
|
"mean_token_accuracy": 0.7865298599004745, |
|
"num_tokens": 112717599.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.15632633121641426, |
|
"grad_norm": 4.0822591130362955, |
|
"learning_rate": 9.644249081897277e-06, |
|
"loss": 0.6648, |
|
"mean_token_accuracy": 0.7872977338731288, |
|
"num_tokens": 118650442.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16414264777723497, |
|
"grad_norm": 2.441738444276975, |
|
"learning_rate": 9.598495016749493e-06, |
|
"loss": 0.6689, |
|
"mean_token_accuracy": 0.7859964817762375, |
|
"num_tokens": 124574766.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1719589643380557, |
|
"grad_norm": 2.075995258307634, |
|
"learning_rate": 9.55011033500505e-06, |
|
"loss": 0.6605, |
|
"mean_token_accuracy": 0.787811417132616, |
|
"num_tokens": 130516200.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1797752808988764, |
|
"grad_norm": 2.609489028108713, |
|
"learning_rate": 9.499126092343237e-06, |
|
"loss": 0.661, |
|
"mean_token_accuracy": 0.7875787198543549, |
|
"num_tokens": 136457829.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1875915974596971, |
|
"grad_norm": 3.7440429066647734, |
|
"learning_rate": 9.445575012969977e-06, |
|
"loss": 0.6709, |
|
"mean_token_accuracy": 0.7853954270482063, |
|
"num_tokens": 142385131.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19540791402051783, |
|
"grad_norm": 2.8751982634480044, |
|
"learning_rate": 9.38949146861382e-06, |
|
"loss": 0.663, |
|
"mean_token_accuracy": 0.7872662946581841, |
|
"num_tokens": 148289898.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.20322423058133854, |
|
"grad_norm": 2.925083855527294, |
|
"learning_rate": 9.33091145646446e-06, |
|
"loss": 0.6486, |
|
"mean_token_accuracy": 0.7910164006054401, |
|
"num_tokens": 154232386.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.21104054714215925, |
|
"grad_norm": 3.73974121507644, |
|
"learning_rate": 9.26987257606797e-06, |
|
"loss": 0.653, |
|
"mean_token_accuracy": 0.7898711428046227, |
|
"num_tokens": 160150772.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.21885686370297996, |
|
"grad_norm": 2.074910644810112, |
|
"learning_rate": 9.206414005193539e-06, |
|
"loss": 0.6564, |
|
"mean_token_accuracy": 0.7886676676571369, |
|
"num_tokens": 166088160.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.22667318026380068, |
|
"grad_norm": 3.765010702267194, |
|
"learning_rate": 9.140576474687263e-06, |
|
"loss": 0.665, |
|
"mean_token_accuracy": 0.7880136586725712, |
|
"num_tokens": 172032929.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2344894968246214, |
|
"grad_norm": 2.3920249014704944, |
|
"learning_rate": 9.072402242329067e-06, |
|
"loss": 0.6503, |
|
"mean_token_accuracy": 0.7899810753762722, |
|
"num_tokens": 178005406.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2423058133854421, |
|
"grad_norm": 4.535876880687944, |
|
"learning_rate": 9.001935065709569e-06, |
|
"loss": 0.6427, |
|
"mean_token_accuracy": 0.7929855234920978, |
|
"num_tokens": 183927144.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2501221299462628, |
|
"grad_norm": 3.44122188439969, |
|
"learning_rate": 8.929220174144304e-06, |
|
"loss": 0.6489, |
|
"mean_token_accuracy": 0.7914013616740704, |
|
"num_tokens": 189848723.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.25793844650708353, |
|
"grad_norm": 2.2378659722902188, |
|
"learning_rate": 8.85430423964332e-06, |
|
"loss": 0.6386, |
|
"mean_token_accuracy": 0.792429718375206, |
|
"num_tokens": 195777877.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.26575476306790424, |
|
"grad_norm": 2.982598937419653, |
|
"learning_rate": 8.777235346954753e-06, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.790771734714508, |
|
"num_tokens": 201713378.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.27357107962872496, |
|
"grad_norm": 1.9062511611198192, |
|
"learning_rate": 8.698062962701691e-06, |
|
"loss": 0.652, |
|
"mean_token_accuracy": 0.790651909261942, |
|
"num_tokens": 207661581.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.28138739618954567, |
|
"grad_norm": 2.7700238166088877, |
|
"learning_rate": 8.616837903632026e-06, |
|
"loss": 0.6438, |
|
"mean_token_accuracy": 0.7914554052054882, |
|
"num_tokens": 213597723.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2892037127503664, |
|
"grad_norm": 4.298205320427554, |
|
"learning_rate": 8.533612304001763e-06, |
|
"loss": 0.6569, |
|
"mean_token_accuracy": 0.7872735880315304, |
|
"num_tokens": 219543570.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2970200293111871, |
|
"grad_norm": 2.62809039414661, |
|
"learning_rate": 8.44843958211269e-06, |
|
"loss": 0.646, |
|
"mean_token_accuracy": 0.7901849329471589, |
|
"num_tokens": 225490076.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3048363458720078, |
|
"grad_norm": 2.540505165257454, |
|
"learning_rate": 8.361374406025853e-06, |
|
"loss": 0.6452, |
|
"mean_token_accuracy": 0.7908910043537617, |
|
"num_tokens": 231429929.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3126526624328285, |
|
"grad_norm": 1.763996580236139, |
|
"learning_rate": 8.272472658472906e-06, |
|
"loss": 0.6529, |
|
"mean_token_accuracy": 0.7878620237112045, |
|
"num_tokens": 237384555.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32046897899364923, |
|
"grad_norm": 1.6233120878672673, |
|
"learning_rate": 8.181791400987807e-06, |
|
"loss": 0.6343, |
|
"mean_token_accuracy": 0.7936271652579308, |
|
"num_tokens": 243313192.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.32828529555446995, |
|
"grad_norm": 1.9405753464802942, |
|
"learning_rate": 8.089388837281915e-06, |
|
"loss": 0.6439, |
|
"mean_token_accuracy": 0.7914594881236553, |
|
"num_tokens": 249220870.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.33610161211529066, |
|
"grad_norm": 3.488125244229796, |
|
"learning_rate": 7.995324275885961e-06, |
|
"loss": 0.6351, |
|
"mean_token_accuracy": 0.7946518436074257, |
|
"num_tokens": 255166697.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3439179286761114, |
|
"grad_norm": 8.607586639921637, |
|
"learning_rate": 7.89965809208291e-06, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.7916349656879902, |
|
"num_tokens": 261089161.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3517342452369321, |
|
"grad_norm": 1.6660014877602962, |
|
"learning_rate": 7.802451689156122e-06, |
|
"loss": 0.6481, |
|
"mean_token_accuracy": 0.79035182595253, |
|
"num_tokens": 267021428.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3595505617977528, |
|
"grad_norm": 1.6153459083711095, |
|
"learning_rate": 7.70376745897768e-06, |
|
"loss": 0.6414, |
|
"mean_token_accuracy": 0.7924111239612103, |
|
"num_tokens": 272961221.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3673668783585735, |
|
"grad_norm": 1.7376528456449458, |
|
"learning_rate": 7.6036687419622215e-06, |
|
"loss": 0.6359, |
|
"mean_token_accuracy": 0.7931422784924507, |
|
"num_tokens": 278866458.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.3751831949193942, |
|
"grad_norm": 1.7509363427645832, |
|
"learning_rate": 7.5022197864119175e-06, |
|
"loss": 0.6455, |
|
"mean_token_accuracy": 0.7915025249123573, |
|
"num_tokens": 284785499.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.38299951148021494, |
|
"grad_norm": 2.863849300726178, |
|
"learning_rate": 7.399485707278744e-06, |
|
"loss": 0.6478, |
|
"mean_token_accuracy": 0.7907331958413124, |
|
"num_tokens": 290732280.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.39081582804103565, |
|
"grad_norm": 1.4846518170955887, |
|
"learning_rate": 7.295532444370485e-06, |
|
"loss": 0.6451, |
|
"mean_token_accuracy": 0.7914663501083851, |
|
"num_tokens": 296654941.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.39863214460185636, |
|
"grad_norm": 1.7383678766587618, |
|
"learning_rate": 7.190426720027306e-06, |
|
"loss": 0.644, |
|
"mean_token_accuracy": 0.7916645854711533, |
|
"num_tokens": 302605292.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.4064484611626771, |
|
"grad_norm": 2.2690859031654087, |
|
"learning_rate": 7.084235996296068e-06, |
|
"loss": 0.6409, |
|
"mean_token_accuracy": 0.792822826653719, |
|
"num_tokens": 308552365.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4142647777234978, |
|
"grad_norm": 1.6536907812645472, |
|
"learning_rate": 6.977028431629839e-06, |
|
"loss": 0.6418, |
|
"mean_token_accuracy": 0.7921351306140423, |
|
"num_tokens": 314489877.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.4220810942843185, |
|
"grad_norm": 1.6417211396491032, |
|
"learning_rate": 6.86887283714044e-06, |
|
"loss": 0.6376, |
|
"mean_token_accuracy": 0.7937514387071133, |
|
"num_tokens": 320414279.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4298974108451392, |
|
"grad_norm": 2.1351664741224963, |
|
"learning_rate": 6.7598386324320745e-06, |
|
"loss": 0.6298, |
|
"mean_token_accuracy": 0.7947640925645828, |
|
"num_tokens": 326349818.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.43771372740595993, |
|
"grad_norm": 1.4007468087271036, |
|
"learning_rate": 6.649995801044391e-06, |
|
"loss": 0.6414, |
|
"mean_token_accuracy": 0.7926677256822586, |
|
"num_tokens": 332276019.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.44553004396678064, |
|
"grad_norm": 1.6121431265331962, |
|
"learning_rate": 6.539414845533596e-06, |
|
"loss": 0.6393, |
|
"mean_token_accuracy": 0.7921099595725536, |
|
"num_tokens": 338209339.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.45334636052760136, |
|
"grad_norm": 2.031155969828718, |
|
"learning_rate": 6.428166742220423e-06, |
|
"loss": 0.625, |
|
"mean_token_accuracy": 0.7955845050513745, |
|
"num_tokens": 344137484.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.46116267708842207, |
|
"grad_norm": 1.4450498950991713, |
|
"learning_rate": 6.316322895634029e-06, |
|
"loss": 0.6374, |
|
"mean_token_accuracy": 0.7931911982595921, |
|
"num_tokens": 350078203.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.4689789936492428, |
|
"grad_norm": 1.9913252644164074, |
|
"learning_rate": 6.20395509268104e-06, |
|
"loss": 0.6214, |
|
"mean_token_accuracy": 0.7970752798020839, |
|
"num_tokens": 356025763.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4767953102100635, |
|
"grad_norm": 2.8315822034325837, |
|
"learning_rate": 6.0911354565691594e-06, |
|
"loss": 0.6304, |
|
"mean_token_accuracy": 0.796255373954773, |
|
"num_tokens": 361992798.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.4846116267708842, |
|
"grad_norm": 1.8374439736436459, |
|
"learning_rate": 5.977936400514943e-06, |
|
"loss": 0.6307, |
|
"mean_token_accuracy": 0.7953431971371174, |
|
"num_tokens": 367913461.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4924279433317049, |
|
"grad_norm": 2.3033335141428, |
|
"learning_rate": 5.864430581265406e-06, |
|
"loss": 0.6356, |
|
"mean_token_accuracy": 0.7944584995508194, |
|
"num_tokens": 373852019.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5002442598925256, |
|
"grad_norm": 1.6907176529892718, |
|
"learning_rate": 5.750690852463339e-06, |
|
"loss": 0.6347, |
|
"mean_token_accuracy": 0.7937369205057621, |
|
"num_tokens": 379764522.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5080605764533463, |
|
"grad_norm": 1.818966581747677, |
|
"learning_rate": 5.636790217886243e-06, |
|
"loss": 0.6253, |
|
"mean_token_accuracy": 0.7948664158582688, |
|
"num_tokens": 385692482.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5158768930141671, |
|
"grad_norm": 1.6206125873755466, |
|
"learning_rate": 5.522801784588895e-06, |
|
"loss": 0.631, |
|
"mean_token_accuracy": 0.7939370617270469, |
|
"num_tokens": 391635856.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5236932095749878, |
|
"grad_norm": 3.3420400728114945, |
|
"learning_rate": 5.408798715979626e-06, |
|
"loss": 0.6341, |
|
"mean_token_accuracy": 0.7946567349135876, |
|
"num_tokens": 397545573.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.5315095261358085, |
|
"grad_norm": 4.102411753363632, |
|
"learning_rate": 5.294854184860437e-06, |
|
"loss": 0.6268, |
|
"mean_token_accuracy": 0.7956276901066304, |
|
"num_tokens": 403475982.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5393258426966292, |
|
"grad_norm": 3.1767409652573853, |
|
"learning_rate": 5.1810413264610724e-06, |
|
"loss": 0.6276, |
|
"mean_token_accuracy": 0.7952337145805359, |
|
"num_tokens": 409418002.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.5471421592574499, |
|
"grad_norm": 3.166395119010415, |
|
"learning_rate": 5.067433191497221e-06, |
|
"loss": 0.6322, |
|
"mean_token_accuracy": 0.7954030476510525, |
|
"num_tokens": 415344226.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5549584758182706, |
|
"grad_norm": 1.526943049343455, |
|
"learning_rate": 4.954102699282953e-06, |
|
"loss": 0.6359, |
|
"mean_token_accuracy": 0.7941608227789402, |
|
"num_tokens": 421277466.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.5627747923790913, |
|
"grad_norm": 1.9099044626436186, |
|
"learning_rate": 4.841122590927511e-06, |
|
"loss": 0.618, |
|
"mean_token_accuracy": 0.7975563704967499, |
|
"num_tokens": 427241607.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.570591108939912, |
|
"grad_norm": 2.5623011943198137, |
|
"learning_rate": 4.7285653826464605e-06, |
|
"loss": 0.6272, |
|
"mean_token_accuracy": 0.7963444076478481, |
|
"num_tokens": 433157588.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.5784074255007328, |
|
"grad_norm": 1.6826723465337794, |
|
"learning_rate": 4.616503319217202e-06, |
|
"loss": 0.6205, |
|
"mean_token_accuracy": 0.7979116909205913, |
|
"num_tokens": 439093218.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5862237420615535, |
|
"grad_norm": 2.4233422176625905, |
|
"learning_rate": 4.5050083276087155e-06, |
|
"loss": 0.6371, |
|
"mean_token_accuracy": 0.7943588711321354, |
|
"num_tokens": 445010423.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5940400586223742, |
|
"grad_norm": 1.8195780340101564, |
|
"learning_rate": 4.394151970815259e-06, |
|
"loss": 0.613, |
|
"mean_token_accuracy": 0.799777788668871, |
|
"num_tokens": 450918292.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6018563751831949, |
|
"grad_norm": 1.8194352528770072, |
|
"learning_rate": 4.284005401923723e-06, |
|
"loss": 0.6225, |
|
"mean_token_accuracy": 0.7965258292853832, |
|
"num_tokens": 456832151.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6096726917440156, |
|
"grad_norm": 1.7307705633009496, |
|
"learning_rate": 4.174639318444044e-06, |
|
"loss": 0.6191, |
|
"mean_token_accuracy": 0.7983451545238495, |
|
"num_tokens": 462764585.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6174890083048363, |
|
"grad_norm": 1.5492850018471778, |
|
"learning_rate": 4.066123916932069e-06, |
|
"loss": 0.6232, |
|
"mean_token_accuracy": 0.7965681925415993, |
|
"num_tokens": 468701502.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.625305324865657, |
|
"grad_norm": 1.5785740475386758, |
|
"learning_rate": 3.95852884793392e-06, |
|
"loss": 0.639, |
|
"mean_token_accuracy": 0.7930570214986801, |
|
"num_tokens": 474663601.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6331216414264778, |
|
"grad_norm": 1.7178497128631158, |
|
"learning_rate": 3.851923171280848e-06, |
|
"loss": 0.631, |
|
"mean_token_accuracy": 0.7956325292587281, |
|
"num_tokens": 480597092.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.6409379579872985, |
|
"grad_norm": 2.2284125586269634, |
|
"learning_rate": 3.7463753117632086e-06, |
|
"loss": 0.6194, |
|
"mean_token_accuracy": 0.7979160696268082, |
|
"num_tokens": 486517715.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6487542745481192, |
|
"grad_norm": 1.6485020103488872, |
|
"learning_rate": 3.6419530152120585e-06, |
|
"loss": 0.6155, |
|
"mean_token_accuracy": 0.7989446625113488, |
|
"num_tokens": 492472305.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.6565705911089399, |
|
"grad_norm": 1.6583017170770122, |
|
"learning_rate": 3.5387233050165305e-06, |
|
"loss": 0.6154, |
|
"mean_token_accuracy": 0.7981764920055866, |
|
"num_tokens": 498385685.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6643869076697606, |
|
"grad_norm": 1.7340707542937976, |
|
"learning_rate": 3.436752439104914e-06, |
|
"loss": 0.6232, |
|
"mean_token_accuracy": 0.7975495472550392, |
|
"num_tokens": 504307250.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6722032242305813, |
|
"grad_norm": 1.7732002119177575, |
|
"learning_rate": 3.336105867417036e-06, |
|
"loss": 0.6136, |
|
"mean_token_accuracy": 0.7990594677627086, |
|
"num_tokens": 510245141.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.680019540791402, |
|
"grad_norm": 1.7788709835674736, |
|
"learning_rate": 3.236848189895271e-06, |
|
"loss": 0.6221, |
|
"mean_token_accuracy": 0.7987750940024853, |
|
"num_tokens": 516171739.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6878358573522227, |
|
"grad_norm": 1.8951328728941093, |
|
"learning_rate": 3.1390431150210858e-06, |
|
"loss": 0.6216, |
|
"mean_token_accuracy": 0.7972325548529625, |
|
"num_tokens": 522101344.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 2.265125941545771, |
|
"learning_rate": 3.0427534189238056e-06, |
|
"loss": 0.6272, |
|
"mean_token_accuracy": 0.797095137834549, |
|
"num_tokens": 528042612.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.7034684904738642, |
|
"grad_norm": 1.895267259607391, |
|
"learning_rate": 2.9480409050877836e-06, |
|
"loss": 0.6146, |
|
"mean_token_accuracy": 0.7996291488409042, |
|
"num_tokens": 533972183.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7112848070346849, |
|
"grad_norm": 2.1073222786880508, |
|
"learning_rate": 2.854966364683872e-06, |
|
"loss": 0.6066, |
|
"mean_token_accuracy": 0.8017579860985279, |
|
"num_tokens": 539882836.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.7191011235955056, |
|
"grad_norm": 1.586483401475392, |
|
"learning_rate": 2.7635895375506516e-06, |
|
"loss": 0.6218, |
|
"mean_token_accuracy": 0.79697345495224, |
|
"num_tokens": 545834579.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7269174401563263, |
|
"grad_norm": 1.678175813746661, |
|
"learning_rate": 2.6739690738504428e-06, |
|
"loss": 0.6218, |
|
"mean_token_accuracy": 0.7964953184127808, |
|
"num_tokens": 551762612.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.734733756717147, |
|
"grad_norm": 4.728635128027955, |
|
"learning_rate": 2.5861624964247402e-06, |
|
"loss": 0.6129, |
|
"mean_token_accuracy": 0.7991872586309909, |
|
"num_tokens": 557733732.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7425500732779677, |
|
"grad_norm": 3.1433754367730327, |
|
"learning_rate": 2.5002261638732066e-06, |
|
"loss": 0.6259, |
|
"mean_token_accuracy": 0.796341958642006, |
|
"num_tokens": 563678120.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7503663898387885, |
|
"grad_norm": 1.5793999144249435, |
|
"learning_rate": 2.416215234379941e-06, |
|
"loss": 0.6171, |
|
"mean_token_accuracy": 0.7986149400472641, |
|
"num_tokens": 569641651.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7581827063996092, |
|
"grad_norm": 1.5637633654884855, |
|
"learning_rate": 2.3341836303102336e-06, |
|
"loss": 0.6168, |
|
"mean_token_accuracy": 0.7989203184843063, |
|
"num_tokens": 575596629.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7659990229604299, |
|
"grad_norm": 1.7578654658273658, |
|
"learning_rate": 2.2541840036005227e-06, |
|
"loss": 0.6152, |
|
"mean_token_accuracy": 0.8000254578888416, |
|
"num_tokens": 581506425.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7738153395212506, |
|
"grad_norm": 1.6111848358937764, |
|
"learning_rate": 2.1762677019637836e-06, |
|
"loss": 0.615, |
|
"mean_token_accuracy": 0.7983125224709511, |
|
"num_tokens": 587441928.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7816316560820713, |
|
"grad_norm": 2.729056351863656, |
|
"learning_rate": 2.100484735932027e-06, |
|
"loss": 0.6219, |
|
"mean_token_accuracy": 0.7974658064544201, |
|
"num_tokens": 593387525.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.789447972642892, |
|
"grad_norm": 2.0026132736871256, |
|
"learning_rate": 2.0268837467570714e-06, |
|
"loss": 0.6113, |
|
"mean_token_accuracy": 0.7999734558165074, |
|
"num_tokens": 599357871.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.7972642892037127, |
|
"grad_norm": 3.202086545883362, |
|
"learning_rate": 1.955511975190185e-06, |
|
"loss": 0.6167, |
|
"mean_token_accuracy": 0.7985524848103523, |
|
"num_tokens": 605266801.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8050806057645334, |
|
"grad_norm": 2.5845078387849876, |
|
"learning_rate": 1.8864152311606342e-06, |
|
"loss": 0.6184, |
|
"mean_token_accuracy": 0.7977212890982628, |
|
"num_tokens": 611200207.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.8128969223253542, |
|
"grad_norm": 1.3459195379746025, |
|
"learning_rate": 1.8196378643726092e-06, |
|
"loss": 0.6141, |
|
"mean_token_accuracy": 0.7991946995258331, |
|
"num_tokens": 617152907.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8207132388861749, |
|
"grad_norm": 1.6903121443946976, |
|
"learning_rate": 1.7552227358393933e-06, |
|
"loss": 0.6166, |
|
"mean_token_accuracy": 0.7996014229953289, |
|
"num_tokens": 623085161.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.8285295554469956, |
|
"grad_norm": 5.000507187078701, |
|
"learning_rate": 1.6932111903730453e-06, |
|
"loss": 0.6106, |
|
"mean_token_accuracy": 0.7996949210762978, |
|
"num_tokens": 629027147.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8363458720078163, |
|
"grad_norm": 2.026922156270765, |
|
"learning_rate": 1.6336430300472606e-06, |
|
"loss": 0.6045, |
|
"mean_token_accuracy": 0.800903269648552, |
|
"num_tokens": 634987448.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.844162188568637, |
|
"grad_norm": 4.687498745163573, |
|
"learning_rate": 1.576556488650428e-06, |
|
"loss": 0.6128, |
|
"mean_token_accuracy": 0.7995632983744144, |
|
"num_tokens": 640920908.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8519785051294577, |
|
"grad_norm": 1.7305252046419992, |
|
"learning_rate": 1.5219882071452967e-06, |
|
"loss": 0.6059, |
|
"mean_token_accuracy": 0.8014576397836208, |
|
"num_tokens": 646854354.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.8597948216902784, |
|
"grad_norm": 1.5220809585740145, |
|
"learning_rate": 1.4699732101510026e-06, |
|
"loss": 0.6098, |
|
"mean_token_accuracy": 0.7980151705443859, |
|
"num_tokens": 652785154.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8676111382510991, |
|
"grad_norm": 2.3863927864336896, |
|
"learning_rate": 1.4205448834625275e-06, |
|
"loss": 0.6174, |
|
"mean_token_accuracy": 0.7987048149108886, |
|
"num_tokens": 658699371.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8754274548119199, |
|
"grad_norm": 4.490834509368819, |
|
"learning_rate": 1.37373495262205e-06, |
|
"loss": 0.6176, |
|
"mean_token_accuracy": 0.7986162424087524, |
|
"num_tokens": 664648725.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8832437713727406, |
|
"grad_norm": 1.8330616045846946, |
|
"learning_rate": 1.3295734625559315e-06, |
|
"loss": 0.6157, |
|
"mean_token_accuracy": 0.798362709581852, |
|
"num_tokens": 670559073.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.8910600879335613, |
|
"grad_norm": 1.8866171760203552, |
|
"learning_rate": 1.2880887582903884e-06, |
|
"loss": 0.6163, |
|
"mean_token_accuracy": 0.7984154649078846, |
|
"num_tokens": 676506420.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 1.7452455628625234, |
|
"learning_rate": 1.2493074667582584e-06, |
|
"loss": 0.621, |
|
"mean_token_accuracy": 0.7975537806749344, |
|
"num_tokens": 682448693.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.9066927210552027, |
|
"grad_norm": 2.030867984009538, |
|
"learning_rate": 1.213254479708519e-06, |
|
"loss": 0.6163, |
|
"mean_token_accuracy": 0.7988204933702946, |
|
"num_tokens": 688390022.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9145090376160234, |
|
"grad_norm": 1.550865659075602, |
|
"learning_rate": 1.179952937729534e-06, |
|
"loss": 0.6193, |
|
"mean_token_accuracy": 0.7983230344951153, |
|
"num_tokens": 694331611.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.9223253541768441, |
|
"grad_norm": 1.571410028230591, |
|
"learning_rate": 1.149424215396281e-06, |
|
"loss": 0.6136, |
|
"mean_token_accuracy": 0.7990704528987408, |
|
"num_tokens": 700261331.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9301416707376648, |
|
"grad_norm": 2.163421002394794, |
|
"learning_rate": 1.1216879075510877e-06, |
|
"loss": 0.6141, |
|
"mean_token_accuracy": 0.7987876988947391, |
|
"num_tokens": 706193693.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.9379579872984856, |
|
"grad_norm": 3.7294851958982975, |
|
"learning_rate": 1.0967618167267032e-06, |
|
"loss": 0.5997, |
|
"mean_token_accuracy": 0.8031502008438111, |
|
"num_tokens": 712120509.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9457743038593063, |
|
"grad_norm": 1.937019840071183, |
|
"learning_rate": 1.0746619417197436e-06, |
|
"loss": 0.6079, |
|
"mean_token_accuracy": 0.8023772671818733, |
|
"num_tokens": 718044202.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.953590620420127, |
|
"grad_norm": 1.822048015248325, |
|
"learning_rate": 1.0554024673218808e-06, |
|
"loss": 0.6102, |
|
"mean_token_accuracy": 0.8010022938251495, |
|
"num_tokens": 723993614.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9614069369809477, |
|
"grad_norm": 1.6712441013840658, |
|
"learning_rate": 1.0389957552153385e-06, |
|
"loss": 0.6053, |
|
"mean_token_accuracy": 0.8014584824442863, |
|
"num_tokens": 729959466.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.9692232535417684, |
|
"grad_norm": 3.2548443190376073, |
|
"learning_rate": 1.0254523360385555e-06, |
|
"loss": 0.6146, |
|
"mean_token_accuracy": 0.7999479919672012, |
|
"num_tokens": 735885753.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9770395701025891, |
|
"grad_norm": 7.139737923902445, |
|
"learning_rate": 1.0147809026271017e-06, |
|
"loss": 0.6145, |
|
"mean_token_accuracy": 0.7991539172828197, |
|
"num_tokens": 741813482.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9848558866634098, |
|
"grad_norm": 1.7874591260763464, |
|
"learning_rate": 1.0069883044341846e-06, |
|
"loss": 0.6142, |
|
"mean_token_accuracy": 0.7999210134148598, |
|
"num_tokens": 747741960.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9926722032242306, |
|
"grad_norm": 3.2850538471833945, |
|
"learning_rate": 1.0020795431343349e-06, |
|
"loss": 0.6074, |
|
"mean_token_accuracy": 0.8015001997351646, |
|
"num_tokens": 753638158.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.3599969643790777, |
|
"learning_rate": 1.0000577694130827e-06, |
|
"loss": 0.5746, |
|
"mean_token_accuracy": 0.8004830511411031, |
|
"num_tokens": 759191155.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 640, |
|
"total_flos": 1.1033250327691264e+16, |
|
"train_loss": 0.6782059136778116, |
|
"train_runtime": 78870.76, |
|
"train_samples_per_second": 4.152, |
|
"train_steps_per_second": 0.008 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 640, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1033250327691264e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|