|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 1408, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007106057914372002, |
|
"grad_norm": 19.49246232730451, |
|
"learning_rate": 1.86046511627907e-06, |
|
"loss": 2.4934, |
|
"mean_token_accuracy": 0.5258669804781675, |
|
"num_tokens": 4763584.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014212115828744005, |
|
"grad_norm": 12.095302419910517, |
|
"learning_rate": 4.186046511627907e-06, |
|
"loss": 2.3185, |
|
"mean_token_accuracy": 0.5427483215928077, |
|
"num_tokens": 9531720.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021318173743116006, |
|
"grad_norm": 6.796780116822856, |
|
"learning_rate": 6.511627906976745e-06, |
|
"loss": 1.8082, |
|
"mean_token_accuracy": 0.5906881660223007, |
|
"num_tokens": 14272673.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02842423165748801, |
|
"grad_norm": 2.028130586531004, |
|
"learning_rate": 8.837209302325582e-06, |
|
"loss": 1.4468, |
|
"mean_token_accuracy": 0.639113237708807, |
|
"num_tokens": 19045659.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03553028957186001, |
|
"grad_norm": 1.0588899013746833, |
|
"learning_rate": 1.116279069767442e-05, |
|
"loss": 1.2276, |
|
"mean_token_accuracy": 0.6780799143016338, |
|
"num_tokens": 23811446.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04263634748623201, |
|
"grad_norm": 0.6488648360367262, |
|
"learning_rate": 1.3488372093023257e-05, |
|
"loss": 1.088, |
|
"mean_token_accuracy": 0.7027405865490437, |
|
"num_tokens": 28572862.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04974240540060401, |
|
"grad_norm": 0.5415057756218988, |
|
"learning_rate": 1.5813953488372095e-05, |
|
"loss": 1.0242, |
|
"mean_token_accuracy": 0.7144973143935204, |
|
"num_tokens": 33349882.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05684846331497602, |
|
"grad_norm": 0.48382195510047, |
|
"learning_rate": 1.813953488372093e-05, |
|
"loss": 0.9712, |
|
"mean_token_accuracy": 0.7250196196138858, |
|
"num_tokens": 38137624.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06395452122934801, |
|
"grad_norm": 0.49727864530874677, |
|
"learning_rate": 1.99999761632652e-05, |
|
"loss": 0.9247, |
|
"mean_token_accuracy": 0.7344170436263084, |
|
"num_tokens": 42901997.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07106057914372002, |
|
"grad_norm": 0.4396836808821335, |
|
"learning_rate": 1.999914189080485e-05, |
|
"loss": 0.8892, |
|
"mean_token_accuracy": 0.7411722339689731, |
|
"num_tokens": 47660479.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07816663705809203, |
|
"grad_norm": 0.4277947889677102, |
|
"learning_rate": 1.9997115907865857e-05, |
|
"loss": 0.8745, |
|
"mean_token_accuracy": 0.7443610817193985, |
|
"num_tokens": 52437558.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08527269497246402, |
|
"grad_norm": 0.4233524880605398, |
|
"learning_rate": 1.999389848273882e-05, |
|
"loss": 0.8603, |
|
"mean_token_accuracy": 0.747463022172451, |
|
"num_tokens": 57198957.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09237875288683603, |
|
"grad_norm": 0.409057413495153, |
|
"learning_rate": 1.998949004149094e-05, |
|
"loss": 0.8537, |
|
"mean_token_accuracy": 0.7483336836099624, |
|
"num_tokens": 61961974.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.09948481080120802, |
|
"grad_norm": 0.46927095285141973, |
|
"learning_rate": 1.9983891167909617e-05, |
|
"loss": 0.8375, |
|
"mean_token_accuracy": 0.7526809796690941, |
|
"num_tokens": 66725536.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.10659086871558003, |
|
"grad_norm": 0.4106777573300543, |
|
"learning_rate": 1.9977102603425134e-05, |
|
"loss": 0.8309, |
|
"mean_token_accuracy": 0.7542230375111103, |
|
"num_tokens": 71469095.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11369692662995204, |
|
"grad_norm": 0.40584561150068, |
|
"learning_rate": 1.996912524701247e-05, |
|
"loss": 0.8258, |
|
"mean_token_accuracy": 0.7558744698762894, |
|
"num_tokens": 76221430.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12080298454432403, |
|
"grad_norm": 0.4436779763048366, |
|
"learning_rate": 1.995996015507227e-05, |
|
"loss": 0.8152, |
|
"mean_token_accuracy": 0.7582607261836529, |
|
"num_tokens": 80991235.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.12790904245869603, |
|
"grad_norm": 0.4011663933394793, |
|
"learning_rate": 1.9949608541290924e-05, |
|
"loss": 0.8128, |
|
"mean_token_accuracy": 0.7592827767133713, |
|
"num_tokens": 85760262.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13501510037306805, |
|
"grad_norm": 0.4484975750242541, |
|
"learning_rate": 1.9938071776479875e-05, |
|
"loss": 0.8015, |
|
"mean_token_accuracy": 0.7621250681579113, |
|
"num_tokens": 90505979.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.14212115828744004, |
|
"grad_norm": 0.3973118091711915, |
|
"learning_rate": 1.992535138839406e-05, |
|
"loss": 0.7956, |
|
"mean_token_accuracy": 0.7619979940354824, |
|
"num_tokens": 95259341.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14212115828744004, |
|
"eval_loss": 0.7710759043693542, |
|
"eval_mean_token_accuracy": 0.7617696215186203, |
|
"eval_num_tokens": 95259341.0, |
|
"eval_runtime": 149.4719, |
|
"eval_samples_per_second": 24.346, |
|
"eval_steps_per_second": 0.763, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14922721620181204, |
|
"grad_norm": 0.4055637275946486, |
|
"learning_rate": 1.991144906152962e-05, |
|
"loss": 0.804, |
|
"mean_token_accuracy": 0.7599063582718373, |
|
"num_tokens": 100023224.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.15633327411618406, |
|
"grad_norm": 0.5425556091455714, |
|
"learning_rate": 1.9896366636900826e-05, |
|
"loss": 0.7951, |
|
"mean_token_accuracy": 0.7621362045407295, |
|
"num_tokens": 104771415.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16343933203055605, |
|
"grad_norm": 0.4581521538010428, |
|
"learning_rate": 1.9880106111796266e-05, |
|
"loss": 0.7903, |
|
"mean_token_accuracy": 0.7619842484593391, |
|
"num_tokens": 109537100.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.17054538994492804, |
|
"grad_norm": 0.42123272880797796, |
|
"learning_rate": 1.9862669639514382e-05, |
|
"loss": 0.7886, |
|
"mean_token_accuracy": 0.7638748176395893, |
|
"num_tokens": 114304125.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.17765144785930007, |
|
"grad_norm": 0.3957407600605023, |
|
"learning_rate": 1.9844059529078297e-05, |
|
"loss": 0.7763, |
|
"mean_token_accuracy": 0.7664125673472881, |
|
"num_tokens": 119067772.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.18475750577367206, |
|
"grad_norm": 1.1428359800811156, |
|
"learning_rate": 1.9824278244930052e-05, |
|
"loss": 0.7736, |
|
"mean_token_accuracy": 0.7676285386085511, |
|
"num_tokens": 123805148.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.19186356368804405, |
|
"grad_norm": 0.4240204964738557, |
|
"learning_rate": 1.9803328406604252e-05, |
|
"loss": 0.7841, |
|
"mean_token_accuracy": 0.763801097869873, |
|
"num_tokens": 128568367.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.19896962160241605, |
|
"grad_norm": 0.41090851408734014, |
|
"learning_rate": 1.9781212788381177e-05, |
|
"loss": 0.7819, |
|
"mean_token_accuracy": 0.7644710555672646, |
|
"num_tokens": 133343344.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.20607567951678807, |
|
"grad_norm": 0.38627786124116875, |
|
"learning_rate": 1.9757934318919386e-05, |
|
"loss": 0.7586, |
|
"mean_token_accuracy": 0.7707050330936909, |
|
"num_tokens": 138078681.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.21318173743116006, |
|
"grad_norm": 0.35413336695126674, |
|
"learning_rate": 1.973349608086791e-05, |
|
"loss": 0.7579, |
|
"mean_token_accuracy": 0.7715410716831684, |
|
"num_tokens": 142812366.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22028779534553206, |
|
"grad_norm": 0.44320259589774924, |
|
"learning_rate": 1.9707901310458017e-05, |
|
"loss": 0.7649, |
|
"mean_token_accuracy": 0.7688324272632598, |
|
"num_tokens": 147567508.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.22739385325990408, |
|
"grad_norm": 0.4272220313623565, |
|
"learning_rate": 1.9681153397074658e-05, |
|
"loss": 0.779, |
|
"mean_token_accuracy": 0.7649676457047463, |
|
"num_tokens": 152348975.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.23449991117427607, |
|
"grad_norm": 0.36879913627107413, |
|
"learning_rate": 1.9653255882807625e-05, |
|
"loss": 0.7547, |
|
"mean_token_accuracy": 0.7709379114210606, |
|
"num_tokens": 157094616.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.24160596908864806, |
|
"grad_norm": 0.39000799041219736, |
|
"learning_rate": 1.9624212461982497e-05, |
|
"loss": 0.7594, |
|
"mean_token_accuracy": 0.7707360699772835, |
|
"num_tokens": 161849805.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2487120270030201, |
|
"grad_norm": 0.37089862448174604, |
|
"learning_rate": 1.9594026980671423e-05, |
|
"loss": 0.7555, |
|
"mean_token_accuracy": 0.7713063634932041, |
|
"num_tokens": 166609253.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.25581808491739205, |
|
"grad_norm": 0.3895218613587052, |
|
"learning_rate": 1.9562703436183783e-05, |
|
"loss": 0.7641, |
|
"mean_token_accuracy": 0.7704252451658249, |
|
"num_tokens": 171374935.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2629241428317641, |
|
"grad_norm": 0.41062410007633293, |
|
"learning_rate": 1.953024597653688e-05, |
|
"loss": 0.7549, |
|
"mean_token_accuracy": 0.771364139765501, |
|
"num_tokens": 176152999.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2700302007461361, |
|
"grad_norm": 0.41614535212567993, |
|
"learning_rate": 1.9496658899906605e-05, |
|
"loss": 0.7479, |
|
"mean_token_accuracy": 0.7711076475679874, |
|
"num_tokens": 180913997.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.27713625866050806, |
|
"grad_norm": 0.3927226802114006, |
|
"learning_rate": 1.946194665405828e-05, |
|
"loss": 0.7563, |
|
"mean_token_accuracy": 0.7716620303690434, |
|
"num_tokens": 185674484.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2842423165748801, |
|
"grad_norm": 0.3794392681437391, |
|
"learning_rate": 1.9426113835757637e-05, |
|
"loss": 0.7537, |
|
"mean_token_accuracy": 0.7706545531749726, |
|
"num_tokens": 190438181.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2842423165748801, |
|
"eval_loss": 0.7250556349754333, |
|
"eval_mean_token_accuracy": 0.7720574731366676, |
|
"eval_num_tokens": 190438181.0, |
|
"eval_runtime": 149.9052, |
|
"eval_samples_per_second": 24.275, |
|
"eval_steps_per_second": 0.76, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2913483744892521, |
|
"grad_norm": 0.40984266753765214, |
|
"learning_rate": 1.9389165190162114e-05, |
|
"loss": 0.753, |
|
"mean_token_accuracy": 0.7713685400784016, |
|
"num_tokens": 195189498.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.29845443240362407, |
|
"grad_norm": 0.39761042206791, |
|
"learning_rate": 1.935110561019246e-05, |
|
"loss": 0.7424, |
|
"mean_token_accuracy": 0.7745413303375244, |
|
"num_tokens": 199953059.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3055604903179961, |
|
"grad_norm": 0.37462688474606465, |
|
"learning_rate": 1.931194013588481e-05, |
|
"loss": 0.7504, |
|
"mean_token_accuracy": 0.7734048135578633, |
|
"num_tokens": 204732059.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3126665482323681, |
|
"grad_norm": 0.39514621265905453, |
|
"learning_rate": 1.927167395372324e-05, |
|
"loss": 0.746, |
|
"mean_token_accuracy": 0.7722579926252365, |
|
"num_tokens": 209507798.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3197726061467401, |
|
"grad_norm": 0.3734047512754032, |
|
"learning_rate": 1.9230312395952955e-05, |
|
"loss": 0.7444, |
|
"mean_token_accuracy": 0.7728776805102825, |
|
"num_tokens": 214261827.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3268786640611121, |
|
"grad_norm": 0.39160079857564095, |
|
"learning_rate": 1.9187860939874176e-05, |
|
"loss": 0.7509, |
|
"mean_token_accuracy": 0.771727342903614, |
|
"num_tokens": 219027585.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3339847219754841, |
|
"grad_norm": 0.38214353202928264, |
|
"learning_rate": 1.9144325207116785e-05, |
|
"loss": 0.7388, |
|
"mean_token_accuracy": 0.7766963444650173, |
|
"num_tokens": 223775141.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.3410907798898561, |
|
"grad_norm": 0.40133338005632113, |
|
"learning_rate": 1.909971096289591e-05, |
|
"loss": 0.7454, |
|
"mean_token_accuracy": 0.7735047489404678, |
|
"num_tokens": 228541180.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3481968378042281, |
|
"grad_norm": 0.4094942578953397, |
|
"learning_rate": 1.9054024115248448e-05, |
|
"loss": 0.7401, |
|
"mean_token_accuracy": 0.7752777233719825, |
|
"num_tokens": 233303417.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.35530289571860013, |
|
"grad_norm": 0.37342705131252163, |
|
"learning_rate": 1.90072707142507e-05, |
|
"loss": 0.746, |
|
"mean_token_accuracy": 0.7732348993420601, |
|
"num_tokens": 238086815.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3624089536329721, |
|
"grad_norm": 0.37950791393566236, |
|
"learning_rate": 1.8959456951217187e-05, |
|
"loss": 0.7324, |
|
"mean_token_accuracy": 0.7766066655516625, |
|
"num_tokens": 242856686.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3695150115473441, |
|
"grad_norm": 0.36431061688841127, |
|
"learning_rate": 1.8910589157880766e-05, |
|
"loss": 0.7389, |
|
"mean_token_accuracy": 0.7757058747112751, |
|
"num_tokens": 247606311.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.37662106946171614, |
|
"grad_norm": 0.4264408595860345, |
|
"learning_rate": 1.8860673805554167e-05, |
|
"loss": 0.74, |
|
"mean_token_accuracy": 0.7750592313706874, |
|
"num_tokens": 252376279.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3837271273760881, |
|
"grad_norm": 0.3649269356510504, |
|
"learning_rate": 1.8809717504273e-05, |
|
"loss": 0.7294, |
|
"mean_token_accuracy": 0.7773133426904678, |
|
"num_tokens": 257157622.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.39083318529046013, |
|
"grad_norm": 0.4595703905257225, |
|
"learning_rate": 1.8757727001920446e-05, |
|
"loss": 0.7376, |
|
"mean_token_accuracy": 0.7763620682060719, |
|
"num_tokens": 261918809.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3979392432048321, |
|
"grad_norm": 0.5280527224785482, |
|
"learning_rate": 1.8704709183333653e-05, |
|
"loss": 0.7329, |
|
"mean_token_accuracy": 0.7755794525146484, |
|
"num_tokens": 266684284.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4050453011192041, |
|
"grad_norm": 0.37695307187112215, |
|
"learning_rate": 1.8650671069392034e-05, |
|
"loss": 0.7331, |
|
"mean_token_accuracy": 0.776630100607872, |
|
"num_tokens": 271450936.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.41215135903357614, |
|
"grad_norm": 0.4776029246227445, |
|
"learning_rate": 1.85956198160875e-05, |
|
"loss": 0.7262, |
|
"mean_token_accuracy": 0.7792015597224236, |
|
"num_tokens": 276196347.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4192574169479481, |
|
"grad_norm": 0.40416881485925626, |
|
"learning_rate": 1.853956271357685e-05, |
|
"loss": 0.7207, |
|
"mean_token_accuracy": 0.7793174132704734, |
|
"num_tokens": 280956754.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.4263634748623201, |
|
"grad_norm": 0.36640042813026624, |
|
"learning_rate": 1.8482507185216365e-05, |
|
"loss": 0.7417, |
|
"mean_token_accuracy": 0.7740650460124016, |
|
"num_tokens": 285730218.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4263634748623201, |
|
"eval_loss": 0.7050633430480957, |
|
"eval_mean_token_accuracy": 0.7762745759989086, |
|
"eval_num_tokens": 285730218.0, |
|
"eval_runtime": 149.2344, |
|
"eval_samples_per_second": 24.384, |
|
"eval_steps_per_second": 0.764, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.43346953277669215, |
|
"grad_norm": 0.39913292298186365, |
|
"learning_rate": 1.842446078657877e-05, |
|
"loss": 0.7328, |
|
"mean_token_accuracy": 0.7760866671800614, |
|
"num_tokens": 290497109.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.4405755906910641, |
|
"grad_norm": 1.1471762547137223, |
|
"learning_rate": 1.8365431204452683e-05, |
|
"loss": 0.7364, |
|
"mean_token_accuracy": 0.7759052954614163, |
|
"num_tokens": 295276239.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.44768164860543613, |
|
"grad_norm": 0.37744675876656947, |
|
"learning_rate": 1.8305426255824713e-05, |
|
"loss": 0.7317, |
|
"mean_token_accuracy": 0.7751947946846485, |
|
"num_tokens": 300042876.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.45478770651980815, |
|
"grad_norm": 0.47691506104424974, |
|
"learning_rate": 1.824445388684426e-05, |
|
"loss": 0.7277, |
|
"mean_token_accuracy": 0.7777360931038857, |
|
"num_tokens": 304798068.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4618937644341801, |
|
"grad_norm": 0.3588696331537949, |
|
"learning_rate": 1.8182522171771293e-05, |
|
"loss": 0.726, |
|
"mean_token_accuracy": 0.7783321216702461, |
|
"num_tokens": 309546102.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.46899982234855214, |
|
"grad_norm": 0.4952808278830928, |
|
"learning_rate": 1.8119639311907074e-05, |
|
"loss": 0.738, |
|
"mean_token_accuracy": 0.7744948998093605, |
|
"num_tokens": 314307721.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.47610588026292416, |
|
"grad_norm": 0.37908339080858117, |
|
"learning_rate": 1.805581363450813e-05, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.7773234643042087, |
|
"num_tokens": 319073411.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.48321193817729613, |
|
"grad_norm": 0.3852822861687584, |
|
"learning_rate": 1.7991053591683508e-05, |
|
"loss": 0.731, |
|
"mean_token_accuracy": 0.7765265628695488, |
|
"num_tokens": 323831947.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.49031799609166815, |
|
"grad_norm": 0.36165483826370953, |
|
"learning_rate": 1.7925367759275495e-05, |
|
"loss": 0.7232, |
|
"mean_token_accuracy": 0.7792682178318501, |
|
"num_tokens": 328590613.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.4974240540060402, |
|
"grad_norm": 0.4313963783745435, |
|
"learning_rate": 1.7858764835723984e-05, |
|
"loss": 0.7247, |
|
"mean_token_accuracy": 0.7771173417568207, |
|
"num_tokens": 333348383.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5045301119204122, |
|
"grad_norm": 0.4013920878913357, |
|
"learning_rate": 1.7791253640914566e-05, |
|
"loss": 0.7236, |
|
"mean_token_accuracy": 0.778332532197237, |
|
"num_tokens": 338109943.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.5116361698347841, |
|
"grad_norm": 0.4057100730404507, |
|
"learning_rate": 1.7722843115010564e-05, |
|
"loss": 0.7221, |
|
"mean_token_accuracy": 0.7787548579275608, |
|
"num_tokens": 342887490.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5187422277491561, |
|
"grad_norm": 0.3675393081924701, |
|
"learning_rate": 1.7653542317269134e-05, |
|
"loss": 0.7171, |
|
"mean_token_accuracy": 0.7794929854571819, |
|
"num_tokens": 347628813.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.5258482856635281, |
|
"grad_norm": 0.3635771034527727, |
|
"learning_rate": 1.7583360424841595e-05, |
|
"loss": 0.7272, |
|
"mean_token_accuracy": 0.7774313412606716, |
|
"num_tokens": 352403713.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5329543435779002, |
|
"grad_norm": 0.38584219141761933, |
|
"learning_rate": 1.7512306731558133e-05, |
|
"loss": 0.7194, |
|
"mean_token_accuracy": 0.7801453106105327, |
|
"num_tokens": 357150848.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5400604014922722, |
|
"grad_norm": 0.36346011493475233, |
|
"learning_rate": 1.744039064669709e-05, |
|
"loss": 0.7253, |
|
"mean_token_accuracy": 0.7780666872859001, |
|
"num_tokens": 361924581.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5471664594066442, |
|
"grad_norm": 0.404515192018998, |
|
"learning_rate": 1.7367621693738917e-05, |
|
"loss": 0.715, |
|
"mean_token_accuracy": 0.7817073427140713, |
|
"num_tokens": 366676773.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5542725173210161, |
|
"grad_norm": 0.36988377045483584, |
|
"learning_rate": 1.7294009509105052e-05, |
|
"loss": 0.7131, |
|
"mean_token_accuracy": 0.7806239545345306, |
|
"num_tokens": 371452085.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5613785752353881, |
|
"grad_norm": 0.36659851484970235, |
|
"learning_rate": 1.7219563840881783e-05, |
|
"loss": 0.7116, |
|
"mean_token_accuracy": 0.782407358288765, |
|
"num_tokens": 376207953.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5684846331497602, |
|
"grad_norm": 0.3624183655015318, |
|
"learning_rate": 1.71442945475294e-05, |
|
"loss": 0.7169, |
|
"mean_token_accuracy": 0.7801115453243256, |
|
"num_tokens": 380979250.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5684846331497602, |
|
"eval_loss": 0.6918764114379883, |
|
"eval_mean_token_accuracy": 0.7797647902840062, |
|
"eval_num_tokens": 380979250.0, |
|
"eval_runtime": 150.2483, |
|
"eval_samples_per_second": 24.22, |
|
"eval_steps_per_second": 0.759, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5755906910641322, |
|
"grad_norm": 0.3779480713941837, |
|
"learning_rate": 1.7068211596576662e-05, |
|
"loss": 0.716, |
|
"mean_token_accuracy": 0.7807160533964634, |
|
"num_tokens": 385752024.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5826967489785042, |
|
"grad_norm": 0.3956915407192127, |
|
"learning_rate": 1.699132506330086e-05, |
|
"loss": 0.7168, |
|
"mean_token_accuracy": 0.780977015197277, |
|
"num_tokens": 390510208.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5898028068928762, |
|
"grad_norm": 0.37756059351173565, |
|
"learning_rate": 1.691364512939358e-05, |
|
"loss": 0.7138, |
|
"mean_token_accuracy": 0.7802788965404034, |
|
"num_tokens": 395264854.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5969088648072481, |
|
"grad_norm": 0.4087318897216221, |
|
"learning_rate": 1.6835182081612426e-05, |
|
"loss": 0.7136, |
|
"mean_token_accuracy": 0.782038314640522, |
|
"num_tokens": 400017717.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6040149227216202, |
|
"grad_norm": 0.40966025914497906, |
|
"learning_rate": 1.6755946310418777e-05, |
|
"loss": 0.7162, |
|
"mean_token_accuracy": 0.7809364423155785, |
|
"num_tokens": 404785855.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6111209806359922, |
|
"grad_norm": 0.34380470044932104, |
|
"learning_rate": 1.6675948308601826e-05, |
|
"loss": 0.7088, |
|
"mean_token_accuracy": 0.7824217259883881, |
|
"num_tokens": 409545265.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6182270385503642, |
|
"grad_norm": 0.3999223329715891, |
|
"learning_rate": 1.6595198669889086e-05, |
|
"loss": 0.7178, |
|
"mean_token_accuracy": 0.7794642865657806, |
|
"num_tokens": 414313757.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6253330964647362, |
|
"grad_norm": 0.4093202361120024, |
|
"learning_rate": 1.6513708087543507e-05, |
|
"loss": 0.7112, |
|
"mean_token_accuracy": 0.7812661081552505, |
|
"num_tokens": 419067741.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6324391543791081, |
|
"grad_norm": 0.3927913861855057, |
|
"learning_rate": 1.643148735294744e-05, |
|
"loss": 0.7085, |
|
"mean_token_accuracy": 0.7821477875113487, |
|
"num_tokens": 423849699.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.6395452122934802, |
|
"grad_norm": 0.38988729195797683, |
|
"learning_rate": 1.634854735417356e-05, |
|
"loss": 0.7184, |
|
"mean_token_accuracy": 0.7806262195110321, |
|
"num_tokens": 428613216.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6466512702078522, |
|
"grad_norm": 0.35125767095510474, |
|
"learning_rate": 1.6264899074543038e-05, |
|
"loss": 0.7244, |
|
"mean_token_accuracy": 0.7782423093914985, |
|
"num_tokens": 433373732.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.6537573281222242, |
|
"grad_norm": 0.3717296312246723, |
|
"learning_rate": 1.6180553591171064e-05, |
|
"loss": 0.7134, |
|
"mean_token_accuracy": 0.7801944658160209, |
|
"num_tokens": 438144634.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6608633860365962, |
|
"grad_norm": 0.3514665073580472, |
|
"learning_rate": 1.6095522073499968e-05, |
|
"loss": 0.7094, |
|
"mean_token_accuracy": 0.782074099034071, |
|
"num_tokens": 442899589.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.6679694439509682, |
|
"grad_norm": 0.3635349909050601, |
|
"learning_rate": 1.600981578182011e-05, |
|
"loss": 0.7125, |
|
"mean_token_accuracy": 0.7808018557727336, |
|
"num_tokens": 447672633.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6750755018653402, |
|
"grad_norm": 0.3548670898985585, |
|
"learning_rate": 1.5923446065778715e-05, |
|
"loss": 0.7162, |
|
"mean_token_accuracy": 0.7795430406928062, |
|
"num_tokens": 452431439.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6821815597797122, |
|
"grad_norm": 0.34932388566686257, |
|
"learning_rate": 1.5836424362876933e-05, |
|
"loss": 0.6984, |
|
"mean_token_accuracy": 0.7855889156460762, |
|
"num_tokens": 457177703.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6892876176940842, |
|
"grad_norm": 0.3981248379617205, |
|
"learning_rate": 1.5748762196955198e-05, |
|
"loss": 0.7036, |
|
"mean_token_accuracy": 0.7827964283525943, |
|
"num_tokens": 461930774.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6963936756084562, |
|
"grad_norm": 0.3393463040601756, |
|
"learning_rate": 1.5660471176667194e-05, |
|
"loss": 0.7092, |
|
"mean_token_accuracy": 0.7816402152180671, |
|
"num_tokens": 466702045.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7034997335228282, |
|
"grad_norm": 0.3747245118209944, |
|
"learning_rate": 1.5571562993942594e-05, |
|
"loss": 0.7063, |
|
"mean_token_accuracy": 0.7829745762050152, |
|
"num_tokens": 471461872.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7106057914372003, |
|
"grad_norm": 0.356673539185162, |
|
"learning_rate": 1.5482049422438732e-05, |
|
"loss": 0.7052, |
|
"mean_token_accuracy": 0.7823217682540416, |
|
"num_tokens": 476233238.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7106057914372003, |
|
"eval_loss": 0.6809196472167969, |
|
"eval_mean_token_accuracy": 0.7825243828589457, |
|
"eval_num_tokens": 476233238.0, |
|
"eval_runtime": 150.1867, |
|
"eval_samples_per_second": 24.23, |
|
"eval_steps_per_second": 0.759, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7177118493515722, |
|
"grad_norm": 0.342268344236882, |
|
"learning_rate": 1.5391942315981506e-05, |
|
"loss": 0.7124, |
|
"mean_token_accuracy": 0.7804363466799259, |
|
"num_tokens": 481010410.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.7248179072659442, |
|
"grad_norm": 0.41188433212292186, |
|
"learning_rate": 1.530125360699561e-05, |
|
"loss": 0.7089, |
|
"mean_token_accuracy": 0.7815835013985634, |
|
"num_tokens": 485757825.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7319239651803162, |
|
"grad_norm": 0.456891730601901, |
|
"learning_rate": 1.520999530492441e-05, |
|
"loss": 0.7022, |
|
"mean_token_accuracy": 0.7851340644061565, |
|
"num_tokens": 490512360.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.7390300230946882, |
|
"grad_norm": 0.347445377971525, |
|
"learning_rate": 1.511817949463956e-05, |
|
"loss": 0.7066, |
|
"mean_token_accuracy": 0.7829876273870469, |
|
"num_tokens": 495265025.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7461360810090603, |
|
"grad_norm": 0.3649440044406707, |
|
"learning_rate": 1.5025818334840695e-05, |
|
"loss": 0.7057, |
|
"mean_token_accuracy": 0.7825053557753563, |
|
"num_tokens": 500030371.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7532421389234323, |
|
"grad_norm": 0.3848276798074321, |
|
"learning_rate": 1.493292405644531e-05, |
|
"loss": 0.6916, |
|
"mean_token_accuracy": 0.7862150557339191, |
|
"num_tokens": 504787581.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7603481968378042, |
|
"grad_norm": 0.3545112606618106, |
|
"learning_rate": 1.4839508960969071e-05, |
|
"loss": 0.7041, |
|
"mean_token_accuracy": 0.7828620508313179, |
|
"num_tokens": 509570758.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.7674542547521762, |
|
"grad_norm": 0.37573645816136086, |
|
"learning_rate": 1.4745585418896799e-05, |
|
"loss": 0.7022, |
|
"mean_token_accuracy": 0.7837928868830204, |
|
"num_tokens": 514321600.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7745603126665482, |
|
"grad_norm": 0.367683939418967, |
|
"learning_rate": 1.4651165868044301e-05, |
|
"loss": 0.6995, |
|
"mean_token_accuracy": 0.7847208097577095, |
|
"num_tokens": 519082348.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.7816663705809203, |
|
"grad_norm": 0.37333173963401106, |
|
"learning_rate": 1.45562628119113e-05, |
|
"loss": 0.7008, |
|
"mean_token_accuracy": 0.7843301363289357, |
|
"num_tokens": 523847671.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7887724284952923, |
|
"grad_norm": 0.35304534000101323, |
|
"learning_rate": 1.446088881802566e-05, |
|
"loss": 0.7113, |
|
"mean_token_accuracy": 0.780696228891611, |
|
"num_tokens": 528620044.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.7958784864096642, |
|
"grad_norm": 0.37158140908427656, |
|
"learning_rate": 1.4365056516279126e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.7839049801230431, |
|
"num_tokens": 533367563.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8029845443240362, |
|
"grad_norm": 0.3666636970376427, |
|
"learning_rate": 1.426877859725482e-05, |
|
"loss": 0.7013, |
|
"mean_token_accuracy": 0.7832373000681401, |
|
"num_tokens": 538117561.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.8100906022384082, |
|
"grad_norm": 0.3480002426515768, |
|
"learning_rate": 1.4172067810546689e-05, |
|
"loss": 0.7024, |
|
"mean_token_accuracy": 0.7843490958213806, |
|
"num_tokens": 542889289.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8171966601527803, |
|
"grad_norm": 0.34159899298495605, |
|
"learning_rate": 1.4074936963071135e-05, |
|
"loss": 0.7034, |
|
"mean_token_accuracy": 0.7836663112044334, |
|
"num_tokens": 547637481.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8243027180671523, |
|
"grad_norm": 0.346312037596486, |
|
"learning_rate": 1.3977398917371074e-05, |
|
"loss": 0.6952, |
|
"mean_token_accuracy": 0.7860016152262688, |
|
"num_tokens": 552402659.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8314087759815243, |
|
"grad_norm": 0.4021455809002062, |
|
"learning_rate": 1.3879466589912598e-05, |
|
"loss": 0.6938, |
|
"mean_token_accuracy": 0.785366540402174, |
|
"num_tokens": 557156063.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.8385148338958962, |
|
"grad_norm": 0.35612007953198216, |
|
"learning_rate": 1.3781152949374527e-05, |
|
"loss": 0.7012, |
|
"mean_token_accuracy": 0.7838830970227718, |
|
"num_tokens": 561916767.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8456208918102682, |
|
"grad_norm": 0.3655479416355732, |
|
"learning_rate": 1.3682471014931031e-05, |
|
"loss": 0.7019, |
|
"mean_token_accuracy": 0.7831911854445934, |
|
"num_tokens": 566684863.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.8527269497246402, |
|
"grad_norm": 0.35487507828239484, |
|
"learning_rate": 1.3583433854527557e-05, |
|
"loss": 0.6967, |
|
"mean_token_accuracy": 0.7847252510488033, |
|
"num_tokens": 571452634.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8527269497246402, |
|
"eval_loss": 0.672866702079773, |
|
"eval_mean_token_accuracy": 0.784141309951481, |
|
"eval_num_tokens": 571452634.0, |
|
"eval_runtime": 149.8383, |
|
"eval_samples_per_second": 24.286, |
|
"eval_steps_per_second": 0.761, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8598330076390123, |
|
"grad_norm": 0.3456230748389429, |
|
"learning_rate": 1.3484054583150315e-05, |
|
"loss": 0.6906, |
|
"mean_token_accuracy": 0.7867132879793644, |
|
"num_tokens": 576198167.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.8669390655533843, |
|
"grad_norm": 0.4168654830920555, |
|
"learning_rate": 1.3384346361089535e-05, |
|
"loss": 0.6885, |
|
"mean_token_accuracy": 0.7866604030132294, |
|
"num_tokens": 580952899.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.8740451234677563, |
|
"grad_norm": 0.3723864301027309, |
|
"learning_rate": 1.3284322392196703e-05, |
|
"loss": 0.6943, |
|
"mean_token_accuracy": 0.7859079904854298, |
|
"num_tokens": 585731060.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.8811511813821282, |
|
"grad_norm": 0.3470961871922244, |
|
"learning_rate": 1.3183995922136048e-05, |
|
"loss": 0.712, |
|
"mean_token_accuracy": 0.7812197484076023, |
|
"num_tokens": 590504105.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.8882572392965002, |
|
"grad_norm": 0.36809607687600115, |
|
"learning_rate": 1.308338023663049e-05, |
|
"loss": 0.7012, |
|
"mean_token_accuracy": 0.7837964847683907, |
|
"num_tokens": 595263948.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.8953632972108723, |
|
"grad_norm": 0.40488523074148874, |
|
"learning_rate": 1.2982488659702269e-05, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.7849378556013107, |
|
"num_tokens": 600009699.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9024693551252443, |
|
"grad_norm": 0.36084599960401653, |
|
"learning_rate": 1.2881334551908524e-05, |
|
"loss": 0.6932, |
|
"mean_token_accuracy": 0.785707937926054, |
|
"num_tokens": 604750827.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.9095754130396163, |
|
"grad_norm": 0.47712658403892855, |
|
"learning_rate": 1.2779931308572022e-05, |
|
"loss": 0.6932, |
|
"mean_token_accuracy": 0.7863863408565521, |
|
"num_tokens": 609500130.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9166814709539882, |
|
"grad_norm": 0.3494828834988836, |
|
"learning_rate": 1.2678292358007274e-05, |
|
"loss": 0.6859, |
|
"mean_token_accuracy": 0.7878520257771016, |
|
"num_tokens": 614261653.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.9237875288683602, |
|
"grad_norm": 0.38391809182329456, |
|
"learning_rate": 1.2576431159742298e-05, |
|
"loss": 0.7083, |
|
"mean_token_accuracy": 0.7823263764381408, |
|
"num_tokens": 619054673.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9308935867827323, |
|
"grad_norm": 0.341786835416634, |
|
"learning_rate": 1.247436120273624e-05, |
|
"loss": 0.7049, |
|
"mean_token_accuracy": 0.7822027482092381, |
|
"num_tokens": 623817276.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.9379996446971043, |
|
"grad_norm": 0.43974221008421865, |
|
"learning_rate": 1.237209600359311e-05, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.7854240909218788, |
|
"num_tokens": 628574712.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9451057026114763, |
|
"grad_norm": 0.4040988616310412, |
|
"learning_rate": 1.226964910477183e-05, |
|
"loss": 0.6898, |
|
"mean_token_accuracy": 0.7870114140212536, |
|
"num_tokens": 633335145.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.9522117605258483, |
|
"grad_norm": 0.38559148131636184, |
|
"learning_rate": 1.2167034072792887e-05, |
|
"loss": 0.6937, |
|
"mean_token_accuracy": 0.7853186056017876, |
|
"num_tokens": 638086757.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.9593178184402202, |
|
"grad_norm": 0.35182778835643425, |
|
"learning_rate": 1.2064264496441786e-05, |
|
"loss": 0.6893, |
|
"mean_token_accuracy": 0.7859195664525032, |
|
"num_tokens": 642864800.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.9664238763545923, |
|
"grad_norm": 0.37801611596609275, |
|
"learning_rate": 1.1961353984969557e-05, |
|
"loss": 0.689, |
|
"mean_token_accuracy": 0.7867573000490665, |
|
"num_tokens": 647632233.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.9735299342689643, |
|
"grad_norm": 0.3676417856657156, |
|
"learning_rate": 1.1858316166290542e-05, |
|
"loss": 0.6933, |
|
"mean_token_accuracy": 0.7860686622560025, |
|
"num_tokens": 652408527.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.9806359921833363, |
|
"grad_norm": 0.32727128259002763, |
|
"learning_rate": 1.1755164685177733e-05, |
|
"loss": 0.6909, |
|
"mean_token_accuracy": 0.7855750493705272, |
|
"num_tokens": 657175363.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.9877420500977083, |
|
"grad_norm": 0.346480246739393, |
|
"learning_rate": 1.1651913201455865e-05, |
|
"loss": 0.6901, |
|
"mean_token_accuracy": 0.78620011433959, |
|
"num_tokens": 661940460.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.9948481080120803, |
|
"grad_norm": 0.3382141590702366, |
|
"learning_rate": 1.154857538819249e-05, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.7859153963625432, |
|
"num_tokens": 666706092.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9948481080120803, |
|
"eval_loss": 0.6661998629570007, |
|
"eval_mean_token_accuracy": 0.7860168757145864, |
|
"eval_num_tokens": 666706092.0, |
|
"eval_runtime": 150.5473, |
|
"eval_samples_per_second": 24.172, |
|
"eval_steps_per_second": 0.757, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0014212115828744, |
|
"grad_norm": 1.0385755552019693, |
|
"learning_rate": 1.144516492988736e-05, |
|
"loss": 0.6823, |
|
"mean_token_accuracy": 0.7878129852784647, |
|
"num_tokens": 671095094.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.0085272694972465, |
|
"grad_norm": 0.34901946667846845, |
|
"learning_rate": 1.134169552066023e-05, |
|
"loss": 0.6613, |
|
"mean_token_accuracy": 0.7918078258633614, |
|
"num_tokens": 675850987.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0156333274116185, |
|
"grad_norm": 0.35371147066198844, |
|
"learning_rate": 1.1238180862437431e-05, |
|
"loss": 0.6616, |
|
"mean_token_accuracy": 0.7930883727967739, |
|
"num_tokens": 680610147.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.0227393853259905, |
|
"grad_norm": 0.371067048471263, |
|
"learning_rate": 1.1134634663137373e-05, |
|
"loss": 0.6552, |
|
"mean_token_accuracy": 0.7940364375710487, |
|
"num_tokens": 685353908.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0298454432403623, |
|
"grad_norm": 0.3481611518266803, |
|
"learning_rate": 1.1031070634855314e-05, |
|
"loss": 0.6593, |
|
"mean_token_accuracy": 0.7930267058312893, |
|
"num_tokens": 690111045.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.0369515011547343, |
|
"grad_norm": 0.35013978798256634, |
|
"learning_rate": 1.0927502492047492e-05, |
|
"loss": 0.6673, |
|
"mean_token_accuracy": 0.7919997818768024, |
|
"num_tokens": 694881554.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.0440575590691064, |
|
"grad_norm": 0.33993734586176455, |
|
"learning_rate": 1.0823943949715022e-05, |
|
"loss": 0.67, |
|
"mean_token_accuracy": 0.7910104177892208, |
|
"num_tokens": 699670214.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.0511636169834784, |
|
"grad_norm": 0.3412860380013659, |
|
"learning_rate": 1.0720408721587671e-05, |
|
"loss": 0.6715, |
|
"mean_token_accuracy": 0.7910432547330857, |
|
"num_tokens": 704426344.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.0582696748978504, |
|
"grad_norm": 0.3627320420498548, |
|
"learning_rate": 1.061691051830783e-05, |
|
"loss": 0.668, |
|
"mean_token_accuracy": 0.7916376106441021, |
|
"num_tokens": 709184272.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.0653757328122224, |
|
"grad_norm": 0.3551359244101548, |
|
"learning_rate": 1.0513463045614873e-05, |
|
"loss": 0.6732, |
|
"mean_token_accuracy": 0.7899613387882709, |
|
"num_tokens": 713964117.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0724817907265944, |
|
"grad_norm": 0.35291410759778385, |
|
"learning_rate": 1.0410080002530188e-05, |
|
"loss": 0.6653, |
|
"mean_token_accuracy": 0.7933160819113254, |
|
"num_tokens": 718714498.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.0795878486409665, |
|
"grad_norm": 0.5183974245642026, |
|
"learning_rate": 1.030677507954307e-05, |
|
"loss": 0.669, |
|
"mean_token_accuracy": 0.7922067753970623, |
|
"num_tokens": 723480315.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.0866939065553385, |
|
"grad_norm": 0.36178536189786653, |
|
"learning_rate": 1.0203561956797777e-05, |
|
"loss": 0.6592, |
|
"mean_token_accuracy": 0.7924857877194882, |
|
"num_tokens": 728257943.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.0937999644697105, |
|
"grad_norm": 0.350161084492629, |
|
"learning_rate": 1.0100454302281917e-05, |
|
"loss": 0.6708, |
|
"mean_token_accuracy": 0.7903590828180314, |
|
"num_tokens": 733027792.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1009060223840825, |
|
"grad_norm": 0.3924504778281434, |
|
"learning_rate": 9.997465770016488e-06, |
|
"loss": 0.665, |
|
"mean_token_accuracy": 0.7942788422107696, |
|
"num_tokens": 737777094.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.1080120802984545, |
|
"grad_norm": 0.3784959263919085, |
|
"learning_rate": 9.894609998247735e-06, |
|
"loss": 0.667, |
|
"mean_token_accuracy": 0.7912828728556633, |
|
"num_tokens": 742543159.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1151181382128263, |
|
"grad_norm": 0.3498944950913003, |
|
"learning_rate": 9.791900607641104e-06, |
|
"loss": 0.6635, |
|
"mean_token_accuracy": 0.7930950812995434, |
|
"num_tokens": 747308343.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.1222241961271984, |
|
"grad_norm": 0.3820207111204085, |
|
"learning_rate": 9.68935119947753e-06, |
|
"loss": 0.668, |
|
"mean_token_accuracy": 0.7914499528706074, |
|
"num_tokens": 752079771.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.1293302540415704, |
|
"grad_norm": 0.37200401458209914, |
|
"learning_rate": 9.586975353852284e-06, |
|
"loss": 0.6639, |
|
"mean_token_accuracy": 0.7913541235029697, |
|
"num_tokens": 756847538.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.1364363119559424, |
|
"grad_norm": 0.3953405586571805, |
|
"learning_rate": 9.484786627876655e-06, |
|
"loss": 0.6697, |
|
"mean_token_accuracy": 0.790704844892025, |
|
"num_tokens": 761615758.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1364363119559424, |
|
"eval_loss": 0.6620959043502808, |
|
"eval_mean_token_accuracy": 0.7867940282612517, |
|
"eval_num_tokens": 761615758.0, |
|
"eval_runtime": 149.9826, |
|
"eval_samples_per_second": 24.263, |
|
"eval_steps_per_second": 0.76, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1435423698703144, |
|
"grad_norm": 0.3501237218712183, |
|
"learning_rate": 9.382798553882605e-06, |
|
"loss": 0.6685, |
|
"mean_token_accuracy": 0.789706601947546, |
|
"num_tokens": 766396725.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.1506484277846865, |
|
"grad_norm": 0.35257144104430527, |
|
"learning_rate": 9.281024637630794e-06, |
|
"loss": 0.656, |
|
"mean_token_accuracy": 0.7935691051185131, |
|
"num_tokens": 771153408.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.1577544856990585, |
|
"grad_norm": 0.3686235797828206, |
|
"learning_rate": 9.179478356522055e-06, |
|
"loss": 0.6617, |
|
"mean_token_accuracy": 0.7928701542317868, |
|
"num_tokens": 775910085.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.1648605436134305, |
|
"grad_norm": 0.35944613382478946, |
|
"learning_rate": 9.078173157812669e-06, |
|
"loss": 0.6673, |
|
"mean_token_accuracy": 0.7925907090306282, |
|
"num_tokens": 780683650.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.1719666015278025, |
|
"grad_norm": 0.33694316783063644, |
|
"learning_rate": 8.97712245683359e-06, |
|
"loss": 0.6686, |
|
"mean_token_accuracy": 0.7904776819050312, |
|
"num_tokens": 785473480.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.1790726594421745, |
|
"grad_norm": 0.37743982477075316, |
|
"learning_rate": 8.876339635213951e-06, |
|
"loss": 0.6672, |
|
"mean_token_accuracy": 0.7913396395742893, |
|
"num_tokens": 790244466.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.1861787173565466, |
|
"grad_norm": 0.3852825297524554, |
|
"learning_rate": 8.775838039108975e-06, |
|
"loss": 0.6577, |
|
"mean_token_accuracy": 0.7940163776278496, |
|
"num_tokens": 794986608.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.1932847752709184, |
|
"grad_norm": 0.3584263516575822, |
|
"learning_rate": 8.67563097743263e-06, |
|
"loss": 0.6589, |
|
"mean_token_accuracy": 0.7941929534077644, |
|
"num_tokens": 799743654.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2003908331852904, |
|
"grad_norm": 0.36222679284602094, |
|
"learning_rate": 8.575731720095194e-06, |
|
"loss": 0.6558, |
|
"mean_token_accuracy": 0.7949050404131413, |
|
"num_tokens": 804510301.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.2074968910996624, |
|
"grad_norm": 0.36601316470081835, |
|
"learning_rate": 8.476153496245978e-06, |
|
"loss": 0.6765, |
|
"mean_token_accuracy": 0.7888801738619804, |
|
"num_tokens": 809295294.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2146029490140344, |
|
"grad_norm": 0.35840290876863273, |
|
"learning_rate": 8.376909492521465e-06, |
|
"loss": 0.6651, |
|
"mean_token_accuracy": 0.7920581080019474, |
|
"num_tokens": 814063402.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.2217090069284064, |
|
"grad_norm": 0.35918850454361245, |
|
"learning_rate": 8.278012851299082e-06, |
|
"loss": 0.6604, |
|
"mean_token_accuracy": 0.793212516605854, |
|
"num_tokens": 818822580.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.2288150648427785, |
|
"grad_norm": 0.36931586095521574, |
|
"learning_rate": 8.179476668956799e-06, |
|
"loss": 0.6697, |
|
"mean_token_accuracy": 0.789932218939066, |
|
"num_tokens": 823577622.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.2359211227571505, |
|
"grad_norm": 0.3618218146346168, |
|
"learning_rate": 8.081313994138857e-06, |
|
"loss": 0.6573, |
|
"mean_token_accuracy": 0.7943486146628856, |
|
"num_tokens": 828319732.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.2430271806715225, |
|
"grad_norm": 0.3947688956702076, |
|
"learning_rate": 7.983537826027808e-06, |
|
"loss": 0.6677, |
|
"mean_token_accuracy": 0.7911505416035652, |
|
"num_tokens": 833074724.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.2501332385858945, |
|
"grad_norm": 0.49837585254908734, |
|
"learning_rate": 7.886161112623072e-06, |
|
"loss": 0.6549, |
|
"mean_token_accuracy": 0.7948375590145588, |
|
"num_tokens": 837847437.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.2572392965002666, |
|
"grad_norm": 0.3875626366429437, |
|
"learning_rate": 7.789196749026349e-06, |
|
"loss": 0.6519, |
|
"mean_token_accuracy": 0.7962292313575745, |
|
"num_tokens": 842595165.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.2643453544146386, |
|
"grad_norm": 0.3461153977326627, |
|
"learning_rate": 7.692657575733928e-06, |
|
"loss": 0.6591, |
|
"mean_token_accuracy": 0.7930607885122299, |
|
"num_tokens": 847377566.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.2714514123290104, |
|
"grad_norm": 0.3539458234583982, |
|
"learning_rate": 7.596556376936328e-06, |
|
"loss": 0.6585, |
|
"mean_token_accuracy": 0.7939456604421139, |
|
"num_tokens": 852138878.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.2785574702433826, |
|
"grad_norm": 0.3776944250010228, |
|
"learning_rate": 7.500905878825335e-06, |
|
"loss": 0.6552, |
|
"mean_token_accuracy": 0.7954832412302494, |
|
"num_tokens": 856908856.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2785574702433826, |
|
"eval_loss": 0.6587108373641968, |
|
"eval_mean_token_accuracy": 0.7879830112582759, |
|
"eval_num_tokens": 856908856.0, |
|
"eval_runtime": 150.1109, |
|
"eval_samples_per_second": 24.242, |
|
"eval_steps_per_second": 0.759, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2856635281577544, |
|
"grad_norm": 0.35417475896948203, |
|
"learning_rate": 7.405718747908743e-06, |
|
"loss": 0.6554, |
|
"mean_token_accuracy": 0.7936457127332688, |
|
"num_tokens": 861668793.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.2927695860721264, |
|
"grad_norm": 0.3717646188257627, |
|
"learning_rate": 7.311007589332986e-06, |
|
"loss": 0.6587, |
|
"mean_token_accuracy": 0.7932697109878063, |
|
"num_tokens": 866418403.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.2998756439864985, |
|
"grad_norm": 0.38878740217599195, |
|
"learning_rate": 7.216784945213913e-06, |
|
"loss": 0.6625, |
|
"mean_token_accuracy": 0.7936202257871627, |
|
"num_tokens": 871159945.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.3069817019008705, |
|
"grad_norm": 0.3422730868265931, |
|
"learning_rate": 7.123063292975889e-06, |
|
"loss": 0.6525, |
|
"mean_token_accuracy": 0.794795686006546, |
|
"num_tokens": 875924929.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3140877598152425, |
|
"grad_norm": 0.3694651695886196, |
|
"learning_rate": 7.02985504369949e-06, |
|
"loss": 0.6547, |
|
"mean_token_accuracy": 0.7951326429843902, |
|
"num_tokens": 880666672.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.3211938177296145, |
|
"grad_norm": 0.3621570732014425, |
|
"learning_rate": 6.937172540477944e-06, |
|
"loss": 0.6654, |
|
"mean_token_accuracy": 0.7919820554554462, |
|
"num_tokens": 885436601.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.3282998756439865, |
|
"grad_norm": 0.7187487529688721, |
|
"learning_rate": 6.8450280567826074e-06, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.792605972290039, |
|
"num_tokens": 890209266.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.3354059335583586, |
|
"grad_norm": 0.3607746297969546, |
|
"learning_rate": 6.753433794837663e-06, |
|
"loss": 0.655, |
|
"mean_token_accuracy": 0.7943654432892799, |
|
"num_tokens": 894978447.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.3425119914727306, |
|
"grad_norm": 0.34656870317990024, |
|
"learning_rate": 6.662401884004226e-06, |
|
"loss": 0.6594, |
|
"mean_token_accuracy": 0.7929094567894935, |
|
"num_tokens": 899731953.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.3496180493871024, |
|
"grad_norm": 0.3556403292186795, |
|
"learning_rate": 6.571944379174128e-06, |
|
"loss": 0.6557, |
|
"mean_token_accuracy": 0.7939096741378308, |
|
"num_tokens": 904484204.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.3567241073014746, |
|
"grad_norm": 0.3481029203946856, |
|
"learning_rate": 6.482073259173533e-06, |
|
"loss": 0.6558, |
|
"mean_token_accuracy": 0.795223805308342, |
|
"num_tokens": 909254980.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.3638301652158464, |
|
"grad_norm": 0.36246749377385634, |
|
"learning_rate": 6.39280042517666e-06, |
|
"loss": 0.6576, |
|
"mean_token_accuracy": 0.794485367834568, |
|
"num_tokens": 914013636.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.3709362231302185, |
|
"grad_norm": 0.35744165552118257, |
|
"learning_rate": 6.304137699129758e-06, |
|
"loss": 0.6521, |
|
"mean_token_accuracy": 0.7954901576042175, |
|
"num_tokens": 918774652.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.3780422810445905, |
|
"grad_norm": 0.34190890162690535, |
|
"learning_rate": 6.216096822185591e-06, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.7934505857527256, |
|
"num_tokens": 923523836.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.3851483389589625, |
|
"grad_norm": 0.3417362920171981, |
|
"learning_rate": 6.12868945314862e-06, |
|
"loss": 0.6647, |
|
"mean_token_accuracy": 0.7919908218085766, |
|
"num_tokens": 928304038.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.3922543968733345, |
|
"grad_norm": 0.3435048840223176, |
|
"learning_rate": 6.041927166931078e-06, |
|
"loss": 0.6577, |
|
"mean_token_accuracy": 0.7943571574985981, |
|
"num_tokens": 933073919.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.3993604547877065, |
|
"grad_norm": 0.40933619958354717, |
|
"learning_rate": 5.9558214530201784e-06, |
|
"loss": 0.6575, |
|
"mean_token_accuracy": 0.7943412482738494, |
|
"num_tokens": 937846004.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.4064665127020786, |
|
"grad_norm": 0.3515117903938119, |
|
"learning_rate": 5.870383713956601e-06, |
|
"loss": 0.6599, |
|
"mean_token_accuracy": 0.7938548773527145, |
|
"num_tokens": 942601267.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.4135725706164506, |
|
"grad_norm": 0.38419237136120965, |
|
"learning_rate": 5.785625263824531e-06, |
|
"loss": 0.6552, |
|
"mean_token_accuracy": 0.7948469713330268, |
|
"num_tokens": 947375335.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.4206786285308226, |
|
"grad_norm": 0.38108767014595607, |
|
"learning_rate": 5.701557326753375e-06, |
|
"loss": 0.6504, |
|
"mean_token_accuracy": 0.7960710853338242, |
|
"num_tokens": 952105402.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4206786285308226, |
|
"eval_loss": 0.6552348136901855, |
|
"eval_mean_token_accuracy": 0.7893575147578591, |
|
"eval_num_tokens": 952105402.0, |
|
"eval_runtime": 149.5045, |
|
"eval_samples_per_second": 24.34, |
|
"eval_steps_per_second": 0.763, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4277846864451944, |
|
"grad_norm": 0.3429174906843955, |
|
"learning_rate": 5.6181910354314265e-06, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.7940759062767029, |
|
"num_tokens": 956874826.0, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.4348907443595666, |
|
"grad_norm": 0.37307240441832, |
|
"learning_rate": 5.5355374296316e-06, |
|
"loss": 0.6589, |
|
"mean_token_accuracy": 0.7940549589693546, |
|
"num_tokens": 961632193.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.4419968022739384, |
|
"grad_norm": 0.3463607871324184, |
|
"learning_rate": 5.4536074547495055e-06, |
|
"loss": 0.6576, |
|
"mean_token_accuracy": 0.7948333404958248, |
|
"num_tokens": 966392410.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.4491028601883105, |
|
"grad_norm": 0.34652042003246536, |
|
"learning_rate": 5.372411960353996e-06, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.7924063883721828, |
|
"num_tokens": 971170949.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.4562089181026825, |
|
"grad_norm": 0.33480848996072166, |
|
"learning_rate": 5.2919616987504205e-06, |
|
"loss": 0.6436, |
|
"mean_token_accuracy": 0.7979453206062317, |
|
"num_tokens": 975920452.0, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.4633149760170545, |
|
"grad_norm": 0.35444982881545845, |
|
"learning_rate": 5.212267323556754e-06, |
|
"loss": 0.6488, |
|
"mean_token_accuracy": 0.7975021339952946, |
|
"num_tokens": 980657772.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.4704210339314265, |
|
"grad_norm": 0.3236146641950486, |
|
"learning_rate": 5.1333393882927776e-06, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.7911154888570309, |
|
"num_tokens": 985424225.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.4775270918457986, |
|
"grad_norm": 0.36258575664825715, |
|
"learning_rate": 5.055188344982549e-06, |
|
"loss": 0.653, |
|
"mean_token_accuracy": 0.7950214244425297, |
|
"num_tokens": 990170268.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.4846331497601706, |
|
"grad_norm": 0.3626547592700128, |
|
"learning_rate": 4.977824542770279e-06, |
|
"loss": 0.6645, |
|
"mean_token_accuracy": 0.7932340361177921, |
|
"num_tokens": 994933612.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.4917392076745426, |
|
"grad_norm": 0.3424014804959087, |
|
"learning_rate": 4.901258226549855e-06, |
|
"loss": 0.6499, |
|
"mean_token_accuracy": 0.7964041963219642, |
|
"num_tokens": 999695033.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.4988452655889146, |
|
"grad_norm": 0.3457300931458133, |
|
"learning_rate": 4.825499535608169e-06, |
|
"loss": 0.659, |
|
"mean_token_accuracy": 0.7942204736173153, |
|
"num_tokens": 1004453306.0, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.5059513235032864, |
|
"grad_norm": 0.407404649365606, |
|
"learning_rate": 4.750558502282403e-06, |
|
"loss": 0.6466, |
|
"mean_token_accuracy": 0.7969782948493958, |
|
"num_tokens": 1009222958.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.5130573814176587, |
|
"grad_norm": 0.36455466451492874, |
|
"learning_rate": 4.676445050631517e-06, |
|
"loss": 0.6669, |
|
"mean_token_accuracy": 0.7919491566717625, |
|
"num_tokens": 1013988411.0, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.5201634393320305, |
|
"grad_norm": 0.3352428252596833, |
|
"learning_rate": 4.603168995122048e-06, |
|
"loss": 0.653, |
|
"mean_token_accuracy": 0.7959543123841286, |
|
"num_tokens": 1018736541.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.5272694972464027, |
|
"grad_norm": 0.336146168164258, |
|
"learning_rate": 4.530740039328427e-06, |
|
"loss": 0.6527, |
|
"mean_token_accuracy": 0.795566051453352, |
|
"num_tokens": 1023492540.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.5343755551607745, |
|
"grad_norm": 0.35998475713849004, |
|
"learning_rate": 4.4591677746479935e-06, |
|
"loss": 0.6542, |
|
"mean_token_accuracy": 0.7954114884138107, |
|
"num_tokens": 1028251642.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.5414816130751465, |
|
"grad_norm": 0.4210390284353885, |
|
"learning_rate": 4.38846167903085e-06, |
|
"loss": 0.6501, |
|
"mean_token_accuracy": 0.7963161066174507, |
|
"num_tokens": 1033004523.0, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.5485876709895185, |
|
"grad_norm": 0.4960482537413715, |
|
"learning_rate": 4.318631115724741e-06, |
|
"loss": 0.6553, |
|
"mean_token_accuracy": 0.7946652464568615, |
|
"num_tokens": 1037760729.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.5556937289038906, |
|
"grad_norm": 0.3799332566110809, |
|
"learning_rate": 4.2496853320351424e-06, |
|
"loss": 0.6607, |
|
"mean_token_accuracy": 0.7947109803557396, |
|
"num_tokens": 1042523723.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.5627997868182626, |
|
"grad_norm": 0.33946558046244235, |
|
"learning_rate": 4.1816334581006656e-06, |
|
"loss": 0.6651, |
|
"mean_token_accuracy": 0.792590418457985, |
|
"num_tokens": 1047291640.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5627997868182626, |
|
"eval_loss": 0.6525910496711731, |
|
"eval_mean_token_accuracy": 0.7899543700510996, |
|
"eval_num_tokens": 1047291640.0, |
|
"eval_runtime": 150.3086, |
|
"eval_samples_per_second": 24.21, |
|
"eval_steps_per_second": 0.758, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5699058447326346, |
|
"grad_norm": 0.32287060668756534, |
|
"learning_rate": 4.114484505684019e-06, |
|
"loss": 0.6541, |
|
"mean_token_accuracy": 0.7952132284641266, |
|
"num_tokens": 1052042031.0, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.5770119026470066, |
|
"grad_norm": 0.3370631693518313, |
|
"learning_rate": 4.048247366978606e-06, |
|
"loss": 0.658, |
|
"mean_token_accuracy": 0.7935857936739922, |
|
"num_tokens": 1056804804.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.5841179605613784, |
|
"grad_norm": 0.414769500614358, |
|
"learning_rate": 3.9829308134309995e-06, |
|
"loss": 0.6475, |
|
"mean_token_accuracy": 0.7969807527959347, |
|
"num_tokens": 1061577783.0, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.5912240184757507, |
|
"grad_norm": 0.35600756916022547, |
|
"learning_rate": 3.9185434945793725e-06, |
|
"loss": 0.6559, |
|
"mean_token_accuracy": 0.7951311826705932, |
|
"num_tokens": 1066355020.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.5983300763901225, |
|
"grad_norm": 0.36188029855593157, |
|
"learning_rate": 3.855093936908081e-06, |
|
"loss": 0.6664, |
|
"mean_token_accuracy": 0.7921065390110016, |
|
"num_tokens": 1071139121.0, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.6054361343044947, |
|
"grad_norm": 0.3632538405728989, |
|
"learning_rate": 3.7925905427185504e-06, |
|
"loss": 0.6569, |
|
"mean_token_accuracy": 0.7936886362731457, |
|
"num_tokens": 1075914044.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.6125421922188665, |
|
"grad_norm": 0.3669437696101656, |
|
"learning_rate": 3.7310415890166e-06, |
|
"loss": 0.6512, |
|
"mean_token_accuracy": 0.7960372731089592, |
|
"num_tokens": 1080682478.0, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.6196482501332385, |
|
"grad_norm": 0.3331543509156551, |
|
"learning_rate": 3.6704552264163695e-06, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.7935027062892914, |
|
"num_tokens": 1085456231.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.6267543080476106, |
|
"grad_norm": 0.3299992523767914, |
|
"learning_rate": 3.6108394780609513e-06, |
|
"loss": 0.6506, |
|
"mean_token_accuracy": 0.7957184061408042, |
|
"num_tokens": 1090215557.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.6338603659619826, |
|
"grad_norm": 0.3450779481067245, |
|
"learning_rate": 3.552202238559953e-06, |
|
"loss": 0.6429, |
|
"mean_token_accuracy": 0.798128329962492, |
|
"num_tokens": 1094959524.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.6409664238763546, |
|
"grad_norm": 0.3531321257611881, |
|
"learning_rate": 3.4945512729440413e-06, |
|
"loss": 0.6503, |
|
"mean_token_accuracy": 0.7954187601804733, |
|
"num_tokens": 1099731395.0, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.6480724817907266, |
|
"grad_norm": 0.3498741705682094, |
|
"learning_rate": 3.437894215636661e-06, |
|
"loss": 0.6578, |
|
"mean_token_accuracy": 0.7941137261688709, |
|
"num_tokens": 1104494157.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.6551785397050987, |
|
"grad_norm": 0.3777101584751149, |
|
"learning_rate": 3.382238569443045e-06, |
|
"loss": 0.6529, |
|
"mean_token_accuracy": 0.7957448020577431, |
|
"num_tokens": 1109252674.0, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.6622845976194704, |
|
"grad_norm": 0.3915263029819705, |
|
"learning_rate": 3.3275917045566596e-06, |
|
"loss": 0.6517, |
|
"mean_token_accuracy": 0.7957381546497345, |
|
"num_tokens": 1114004017.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.6693906555338427, |
|
"grad_norm": 0.33568853526043657, |
|
"learning_rate": 3.2739608575832056e-06, |
|
"loss": 0.6412, |
|
"mean_token_accuracy": 0.7980836987495422, |
|
"num_tokens": 1118768157.0, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.6764967134482145, |
|
"grad_norm": 0.34752341789739466, |
|
"learning_rate": 3.2213531305823125e-06, |
|
"loss": 0.6613, |
|
"mean_token_accuracy": 0.7935202896595002, |
|
"num_tokens": 1123535145.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.6836027713625867, |
|
"grad_norm": 0.34083716495360583, |
|
"learning_rate": 3.1697754901270477e-06, |
|
"loss": 0.6507, |
|
"mean_token_accuracy": 0.7964440450072289, |
|
"num_tokens": 1128307445.0, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.6907088292769585, |
|
"grad_norm": 0.3774093707155365, |
|
"learning_rate": 3.1192347663813684e-06, |
|
"loss": 0.6547, |
|
"mean_token_accuracy": 0.7946882367134094, |
|
"num_tokens": 1133071242.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.6978148871913306, |
|
"grad_norm": 0.36333647453863616, |
|
"learning_rate": 3.0697376521956377e-06, |
|
"loss": 0.6526, |
|
"mean_token_accuracy": 0.7956908911466598, |
|
"num_tokens": 1137831284.0, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.7049209451057026, |
|
"grad_norm": 0.33780601240351715, |
|
"learning_rate": 3.021290702220331e-06, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.7948304824531078, |
|
"num_tokens": 1142587626.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7049209451057026, |
|
"eval_loss": 0.6507056355476379, |
|
"eval_mean_token_accuracy": 0.790492679466281, |
|
"eval_num_tokens": 1142587626.0, |
|
"eval_runtime": 149.5446, |
|
"eval_samples_per_second": 24.334, |
|
"eval_steps_per_second": 0.762, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7120270030200746, |
|
"grad_norm": 0.3537743025090112, |
|
"learning_rate": 2.9739003320380237e-06, |
|
"loss": 0.6624, |
|
"mean_token_accuracy": 0.793489520996809, |
|
"num_tokens": 1147357460.0, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.7191330609344466, |
|
"grad_norm": 0.4133446002595886, |
|
"learning_rate": 2.927572817313823e-06, |
|
"loss": 0.6585, |
|
"mean_token_accuracy": 0.7936319254338742, |
|
"num_tokens": 1152138440.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.7262391188488186, |
|
"grad_norm": 0.3563260686728207, |
|
"learning_rate": 2.8823142929643043e-06, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.797927625477314, |
|
"num_tokens": 1156890428.0, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.7333451767631907, |
|
"grad_norm": 0.3989494711298427, |
|
"learning_rate": 2.838130752345092e-06, |
|
"loss": 0.6582, |
|
"mean_token_accuracy": 0.7947382763028145, |
|
"num_tokens": 1161657895.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.7404512346775625, |
|
"grad_norm": 0.3432409644849126, |
|
"learning_rate": 2.7950280464572066e-06, |
|
"loss": 0.6541, |
|
"mean_token_accuracy": 0.7953485876321793, |
|
"num_tokens": 1166423043.0, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.7475572925919347, |
|
"grad_norm": 0.3368844710605464, |
|
"learning_rate": 2.7530118831722286e-06, |
|
"loss": 0.6481, |
|
"mean_token_accuracy": 0.796825060248375, |
|
"num_tokens": 1171166643.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.7546633505063065, |
|
"grad_norm": 0.45251140872957446, |
|
"learning_rate": 2.7120878264764437e-06, |
|
"loss": 0.6473, |
|
"mean_token_accuracy": 0.7977175071835518, |
|
"num_tokens": 1175924107.0, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.7617694084206788, |
|
"grad_norm": 0.3545121531531998, |
|
"learning_rate": 2.67226129573403e-06, |
|
"loss": 0.6512, |
|
"mean_token_accuracy": 0.7960038974881172, |
|
"num_tokens": 1180681893.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.7688754663350506, |
|
"grad_norm": 0.3323525365320921, |
|
"learning_rate": 2.633537564969398e-06, |
|
"loss": 0.6557, |
|
"mean_token_accuracy": 0.7952632494270802, |
|
"num_tokens": 1185447027.0, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.7759815242494228, |
|
"grad_norm": 0.34600724842490793, |
|
"learning_rate": 2.5959217621687823e-06, |
|
"loss": 0.6608, |
|
"mean_token_accuracy": 0.7938597463071346, |
|
"num_tokens": 1190231791.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.7830875821637946, |
|
"grad_norm": 0.3678379104234008, |
|
"learning_rate": 2.5594188686011616e-06, |
|
"loss": 0.6541, |
|
"mean_token_accuracy": 0.7947786100208759, |
|
"num_tokens": 1194998688.0, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.7901936400781666, |
|
"grad_norm": 0.36986419243022695, |
|
"learning_rate": 2.524033718158621e-06, |
|
"loss": 0.6492, |
|
"mean_token_accuracy": 0.7966626077890396, |
|
"num_tokens": 1199764688.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.7972996979925386, |
|
"grad_norm": 0.36634691155162535, |
|
"learning_rate": 2.489770996716227e-06, |
|
"loss": 0.6549, |
|
"mean_token_accuracy": 0.7945116639137269, |
|
"num_tokens": 1204526423.0, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.8044057559069107, |
|
"grad_norm": 0.3592637763107603, |
|
"learning_rate": 2.456635241511491e-06, |
|
"loss": 0.6436, |
|
"mean_token_accuracy": 0.7984024800360203, |
|
"num_tokens": 1209280668.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.8115118138212827, |
|
"grad_norm": 0.3637888435360703, |
|
"learning_rate": 2.4246308405435314e-06, |
|
"loss": 0.6503, |
|
"mean_token_accuracy": 0.7954847238957882, |
|
"num_tokens": 1214048139.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.8186178717356547, |
|
"grad_norm": 0.3515690726134511, |
|
"learning_rate": 2.3937620319919966e-06, |
|
"loss": 0.6471, |
|
"mean_token_accuracy": 0.7975172877311707, |
|
"num_tokens": 1218805359.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.8257239296500267, |
|
"grad_norm": 0.37901691417441497, |
|
"learning_rate": 2.3640329036558167e-06, |
|
"loss": 0.6458, |
|
"mean_token_accuracy": 0.7973252393305301, |
|
"num_tokens": 1223580683.0, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.8328299875643985, |
|
"grad_norm": 0.48078408366926273, |
|
"learning_rate": 2.3354473924118843e-06, |
|
"loss": 0.6517, |
|
"mean_token_accuracy": 0.7954902827739716, |
|
"num_tokens": 1228344380.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.8399360454787708, |
|
"grad_norm": 0.3487084891659478, |
|
"learning_rate": 2.3080092836937124e-06, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.7968501009047031, |
|
"num_tokens": 1233124681.0, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.8470421033931426, |
|
"grad_norm": 0.34640131501737065, |
|
"learning_rate": 2.2817222109901442e-06, |
|
"loss": 0.6448, |
|
"mean_token_accuracy": 0.7978550389409065, |
|
"num_tokens": 1237873166.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.8470421033931426, |
|
"eval_loss": 0.6490960121154785, |
|
"eval_mean_token_accuracy": 0.7908800155447241, |
|
"eval_num_tokens": 1237873166.0, |
|
"eval_runtime": 149.9569, |
|
"eval_samples_per_second": 24.267, |
|
"eval_steps_per_second": 0.76, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.8541481613075148, |
|
"grad_norm": 0.3243906306128693, |
|
"learning_rate": 2.256589655364193e-06, |
|
"loss": 0.6593, |
|
"mean_token_accuracy": 0.7929202131927013, |
|
"num_tokens": 1242627340.0, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.8612542192218866, |
|
"grad_norm": 0.37597198041798413, |
|
"learning_rate": 2.2326149449920653e-06, |
|
"loss": 0.6446, |
|
"mean_token_accuracy": 0.797098808735609, |
|
"num_tokens": 1247387461.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.8683602771362586, |
|
"grad_norm": 0.35265594906604686, |
|
"learning_rate": 2.2098012547224197e-06, |
|
"loss": 0.6513, |
|
"mean_token_accuracy": 0.7950267992913723, |
|
"num_tokens": 1252135688.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.8754663350506307, |
|
"grad_norm": 0.3583812845832173, |
|
"learning_rate": 2.188151605655942e-06, |
|
"loss": 0.6521, |
|
"mean_token_accuracy": 0.7945805780589581, |
|
"num_tokens": 1256903702.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.8825723929650027, |
|
"grad_norm": 0.3577801661959976, |
|
"learning_rate": 2.1676688647452795e-06, |
|
"loss": 0.6437, |
|
"mean_token_accuracy": 0.7986263297498226, |
|
"num_tokens": 1261633144.0, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.8896784508793747, |
|
"grad_norm": 0.35744367217582623, |
|
"learning_rate": 2.1483557444153795e-06, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.7966003373265267, |
|
"num_tokens": 1266390903.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.8967845087937467, |
|
"grad_norm": 0.39747974453689555, |
|
"learning_rate": 2.1302148022042993e-06, |
|
"loss": 0.6491, |
|
"mean_token_accuracy": 0.7970162339508533, |
|
"num_tokens": 1271162270.0, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.9038905667081187, |
|
"grad_norm": 0.3547995480225708, |
|
"learning_rate": 2.113248440424526e-06, |
|
"loss": 0.643, |
|
"mean_token_accuracy": 0.7987522542476654, |
|
"num_tokens": 1275906083.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.9109966246224905, |
|
"grad_norm": 0.3924659274346196, |
|
"learning_rate": 2.0974589058448456e-06, |
|
"loss": 0.6499, |
|
"mean_token_accuracy": 0.7970600210130214, |
|
"num_tokens": 1280649985.0, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.9181026825368628, |
|
"grad_norm": 0.3450930632168253, |
|
"learning_rate": 2.0828482893928208e-06, |
|
"loss": 0.6525, |
|
"mean_token_accuracy": 0.795515525341034, |
|
"num_tokens": 1285434113.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.9252087404512346, |
|
"grad_norm": 0.33567083461731984, |
|
"learning_rate": 2.069418525877897e-06, |
|
"loss": 0.644, |
|
"mean_token_accuracy": 0.798255106061697, |
|
"num_tokens": 1290191830.0, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.9323147983656068, |
|
"grad_norm": 0.36694627449422723, |
|
"learning_rate": 2.0571713937351834e-06, |
|
"loss": 0.6397, |
|
"mean_token_accuracy": 0.7977312818169594, |
|
"num_tokens": 1294948980.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.9394208562799786, |
|
"grad_norm": 0.362232508705221, |
|
"learning_rate": 2.0461085147899497e-06, |
|
"loss": 0.6457, |
|
"mean_token_accuracy": 0.7973731994628906, |
|
"num_tokens": 1299719386.0, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.9465269141943506, |
|
"grad_norm": 0.3669523670641092, |
|
"learning_rate": 2.0362313540428485e-06, |
|
"loss": 0.6472, |
|
"mean_token_accuracy": 0.797086289525032, |
|
"num_tokens": 1304487261.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.9536329721087227, |
|
"grad_norm": 0.37200879998568015, |
|
"learning_rate": 2.027541219475922e-06, |
|
"loss": 0.6475, |
|
"mean_token_accuracy": 0.7960396580398083, |
|
"num_tokens": 1309241194.0, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.9607390300230947, |
|
"grad_norm": 0.3688539723341265, |
|
"learning_rate": 2.020039261879382e-06, |
|
"loss": 0.6573, |
|
"mean_token_accuracy": 0.7950874969363213, |
|
"num_tokens": 1314011836.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.9678450879374667, |
|
"grad_norm": 0.43966459213132447, |
|
"learning_rate": 2.013726474699225e-06, |
|
"loss": 0.6505, |
|
"mean_token_accuracy": 0.7958736583590508, |
|
"num_tokens": 1318761485.0, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.9749511458518387, |
|
"grad_norm": 0.34824004564185856, |
|
"learning_rate": 2.008603693905673e-06, |
|
"loss": 0.6476, |
|
"mean_token_accuracy": 0.7972124963998795, |
|
"num_tokens": 1323527340.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.9820572037662108, |
|
"grad_norm": 0.37468360563296904, |
|
"learning_rate": 2.0046715978824663e-06, |
|
"loss": 0.6496, |
|
"mean_token_accuracy": 0.7958178780972958, |
|
"num_tokens": 1328302362.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.9891632616805826, |
|
"grad_norm": 0.3561744251531493, |
|
"learning_rate": 2.001930707337034e-06, |
|
"loss": 0.6501, |
|
"mean_token_accuracy": 0.7963785864412785, |
|
"num_tokens": 1333062144.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.9891632616805826, |
|
"eval_loss": 0.6482434868812561, |
|
"eval_mean_token_accuracy": 0.7910316936802446, |
|
"eval_num_tokens": 1333062144.0, |
|
"eval_runtime": 149.6853, |
|
"eval_samples_per_second": 24.311, |
|
"eval_steps_per_second": 0.762, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.9962693195949548, |
|
"grad_norm": 0.3552540336023921, |
|
"learning_rate": 2.000381385231536e-06, |
|
"loss": 0.656, |
|
"mean_token_accuracy": 0.7951462939381599, |
|
"num_tokens": 1337810166.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"mean_token_accuracy": 0.7962638111341567, |
|
"num_tokens": 1340314150.0, |
|
"step": 1408, |
|
"total_flos": 1.0314062938243072e+16, |
|
"train_loss": 0.7184352108531378, |
|
"train_runtime": 49977.4428, |
|
"train_samples_per_second": 14.417, |
|
"train_steps_per_second": 0.028 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1408, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0314062938243072e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|