{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.989494747373687,
  "eval_steps": 500,
  "global_step": 747,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02001000500250125,
      "grad_norm": 1.2456663846969604,
      "learning_rate": 4.999447296060165e-05,
      "loss": 1.3318,
      "num_input_tokens_seen": 15648,
      "step": 5
    },
    {
      "epoch": 0.0400200100050025,
      "grad_norm": 0.8590693473815918,
      "learning_rate": 4.997789428625975e-05,
      "loss": 1.2773,
      "num_input_tokens_seen": 28720,
      "step": 10
    },
    {
      "epoch": 0.060030015007503754,
      "grad_norm": 0.8516742587089539,
      "learning_rate": 4.995027130745321e-05,
      "loss": 1.1401,
      "num_input_tokens_seen": 43808,
      "step": 15
    },
    {
      "epoch": 0.080040020010005,
      "grad_norm": 1.0357950925827026,
      "learning_rate": 4.99116162380454e-05,
      "loss": 1.089,
      "num_input_tokens_seen": 57472,
      "step": 20
    },
    {
      "epoch": 0.10005002501250625,
      "grad_norm": 0.744416356086731,
      "learning_rate": 4.986194616988364e-05,
      "loss": 1.1176,
      "num_input_tokens_seen": 71968,
      "step": 25
    },
    {
      "epoch": 0.12006003001500751,
      "grad_norm": 0.7087241411209106,
      "learning_rate": 4.980128306524183e-05,
      "loss": 0.9949,
      "num_input_tokens_seen": 85552,
      "step": 30
    },
    {
      "epoch": 0.14007003501750875,
      "grad_norm": 0.708070695400238,
      "learning_rate": 4.972965374710952e-05,
      "loss": 1.0264,
      "num_input_tokens_seen": 100512,
      "step": 35
    },
    {
      "epoch": 0.16008004002001,
      "grad_norm": 0.7232606410980225,
      "learning_rate": 4.964708988733178e-05,
      "loss": 1.0004,
      "num_input_tokens_seen": 113376,
      "step": 40
    },
    {
      "epoch": 0.18009004502251125,
      "grad_norm": 1.207189679145813,
      "learning_rate": 4.9553627992605066e-05,
      "loss": 1.1137,
      "num_input_tokens_seen": 127680,
      "step": 45
    },
    {
      "epoch": 0.2001000500250125,
      "grad_norm": 0.6889365315437317,
      "learning_rate": 4.944930938833535e-05,
      "loss": 0.9646,
      "num_input_tokens_seen": 143136,
      "step": 50
    },
    {
      "epoch": 0.22011005502751377,
      "grad_norm": 1.3263787031173706,
      "learning_rate": 4.9334180200365486e-05,
      "loss": 0.9757,
      "num_input_tokens_seen": 155488,
      "step": 55
    },
    {
      "epoch": 0.24012006003001501,
      "grad_norm": 1.0919948816299438,
      "learning_rate": 4.9208291334580104e-05,
      "loss": 0.9126,
      "num_input_tokens_seen": 171008,
      "step": 60
    },
    {
      "epoch": 0.26013006503251623,
      "grad_norm": 0.853539764881134,
      "learning_rate": 4.907169845439688e-05,
      "loss": 1.0119,
      "num_input_tokens_seen": 185536,
      "step": 65
    },
    {
      "epoch": 0.2801400700350175,
      "grad_norm": 0.9023884534835815,
      "learning_rate": 4.892446195615423e-05,
      "loss": 1.1192,
      "num_input_tokens_seen": 201728,
      "step": 70
    },
    {
      "epoch": 0.3001500750375188,
      "grad_norm": 1.3300387859344482,
      "learning_rate": 4.87666469424063e-05,
      "loss": 1.0167,
      "num_input_tokens_seen": 217584,
      "step": 75
    },
    {
      "epoch": 0.32016008004002,
      "grad_norm": 1.1315807104110718,
      "learning_rate": 4.859832319313697e-05,
      "loss": 0.8291,
      "num_input_tokens_seen": 230864,
      "step": 80
    },
    {
      "epoch": 0.3401700850425213,
      "grad_norm": 1.2459896802902222,
      "learning_rate": 4.841956513490577e-05,
      "loss": 0.9591,
      "num_input_tokens_seen": 245584,
      "step": 85
    },
    {
      "epoch": 0.3601800900450225,
      "grad_norm": 1.114758849143982,
      "learning_rate": 4.8230451807939135e-05,
      "loss": 0.9869,
      "num_input_tokens_seen": 259760,
      "step": 90
    },
    {
      "epoch": 0.38019009504752377,
      "grad_norm": 0.8792176842689514,
      "learning_rate": 4.803106683118177e-05,
      "loss": 1.0423,
      "num_input_tokens_seen": 274432,
      "step": 95
    },
    {
      "epoch": 0.400200100050025,
      "grad_norm": 1.6365342140197754,
      "learning_rate": 4.782149836532345e-05,
      "loss": 1.0122,
      "num_input_tokens_seen": 288256,
      "step": 100
    },
    {
      "epoch": 0.42021010505252626,
      "grad_norm": 1.2875956296920776,
      "learning_rate": 4.760183907381757e-05,
      "loss": 0.9849,
      "num_input_tokens_seen": 301136,
      "step": 105
    },
    {
      "epoch": 0.44022011005502754,
      "grad_norm": 0.8925891518592834,
      "learning_rate": 4.737218608190878e-05,
      "loss": 0.7958,
      "num_input_tokens_seen": 313712,
      "step": 110
    },
    {
      "epoch": 0.46023011505752875,
      "grad_norm": 0.9326754808425903,
      "learning_rate": 4.713264093368783e-05,
      "loss": 0.9773,
      "num_input_tokens_seen": 328496,
      "step": 115
    },
    {
      "epoch": 0.48024012006003003,
      "grad_norm": 0.9033737182617188,
      "learning_rate": 4.6883309547192476e-05,
      "loss": 0.9953,
      "num_input_tokens_seen": 342288,
      "step": 120
    },
    {
      "epoch": 0.5002501250625313,
      "grad_norm": 1.606924057006836,
      "learning_rate": 4.6624302167574436e-05,
      "loss": 0.9726,
      "num_input_tokens_seen": 356128,
      "step": 125
    },
    {
      "epoch": 0.5202601300650325,
      "grad_norm": 1.0410974025726318,
      "learning_rate": 4.635573331835302e-05,
      "loss": 0.9987,
      "num_input_tokens_seen": 368000,
      "step": 130
    },
    {
      "epoch": 0.5402701350675337,
      "grad_norm": 1.3536303043365479,
      "learning_rate": 4.607772175077711e-05,
      "loss": 1.012,
      "num_input_tokens_seen": 381216,
      "step": 135
    },
    {
      "epoch": 0.560280140070035,
      "grad_norm": 0.8227062225341797,
      "learning_rate": 4.5790390391317675e-05,
      "loss": 0.9104,
      "num_input_tokens_seen": 395296,
      "step": 140
    },
    {
      "epoch": 0.5802901450725363,
      "grad_norm": 1.2021853923797607,
      "learning_rate": 4.549386628731425e-05,
      "loss": 0.9087,
      "num_input_tokens_seen": 409312,
      "step": 145
    },
    {
      "epoch": 0.6003001500750376,
      "grad_norm": 0.7881823778152466,
      "learning_rate": 4.518828055079925e-05,
      "loss": 1.0859,
      "num_input_tokens_seen": 424064,
      "step": 150
    },
    {
      "epoch": 0.6203101550775387,
      "grad_norm": 0.6765316724777222,
      "learning_rate": 4.487376830052511e-05,
      "loss": 0.9696,
      "num_input_tokens_seen": 437264,
      "step": 155
    },
    {
      "epoch": 0.64032016008004,
      "grad_norm": 1.2535051107406616,
      "learning_rate": 4.4550468602219716e-05,
      "loss": 1.0014,
      "num_input_tokens_seen": 450624,
      "step": 160
    },
    {
      "epoch": 0.6603301650825413,
      "grad_norm": 0.7798222303390503,
      "learning_rate": 4.421852440709666e-05,
      "loss": 0.8877,
      "num_input_tokens_seen": 465968,
      "step": 165
    },
    {
      "epoch": 0.6803401700850426,
      "grad_norm": 0.9760732054710388,
      "learning_rate": 4.387808248864751e-05,
      "loss": 0.9694,
      "num_input_tokens_seen": 480720,
      "step": 170
    },
    {
      "epoch": 0.7003501750875438,
      "grad_norm": 0.7668250799179077,
      "learning_rate": 4.352929337774395e-05,
      "loss": 0.8613,
      "num_input_tokens_seen": 495008,
      "step": 175
    },
    {
      "epoch": 0.720360180090045,
      "grad_norm": 0.7338403463363647,
      "learning_rate": 4.3172311296078595e-05,
      "loss": 0.907,
      "num_input_tokens_seen": 509024,
      "step": 180
    },
    {
      "epoch": 0.7403701850925463,
      "grad_norm": 1.0990034341812134,
      "learning_rate": 4.2807294087973834e-05,
      "loss": 1.0501,
      "num_input_tokens_seen": 524752,
      "step": 185
    },
    {
      "epoch": 0.7603801900950475,
      "grad_norm": 1.2968486547470093,
      "learning_rate": 4.2434403150588895e-05,
      "loss": 0.9158,
      "num_input_tokens_seen": 537872,
      "step": 190
    },
    {
      "epoch": 0.7803901950975488,
      "grad_norm": 1.0896228551864624,
      "learning_rate": 4.205380336255594e-05,
      "loss": 0.9756,
      "num_input_tokens_seen": 552832,
      "step": 195
    },
    {
      "epoch": 0.80040020010005,
      "grad_norm": 1.2773741483688354,
      "learning_rate": 4.166566301107687e-05,
      "loss": 1.0702,
      "num_input_tokens_seen": 566224,
      "step": 200
    },
    {
      "epoch": 0.8204102051025512,
      "grad_norm": 1.0507806539535522,
      "learning_rate": 4.127015371751284e-05,
      "loss": 0.8236,
      "num_input_tokens_seen": 582560,
      "step": 205
    },
    {
      "epoch": 0.8404202101050525,
      "grad_norm": 0.7220691442489624,
      "learning_rate": 4.08674503614997e-05,
      "loss": 0.8779,
      "num_input_tokens_seen": 595568,
      "step": 210
    },
    {
      "epoch": 0.8604302151075538,
      "grad_norm": 1.8035520315170288,
      "learning_rate": 4.0457731003622606e-05,
      "loss": 0.9544,
      "num_input_tokens_seen": 609040,
      "step": 215
    },
    {
      "epoch": 0.8804402201100551,
      "grad_norm": 1.3739808797836304,
      "learning_rate": 4.004117680668422e-05,
      "loss": 0.8529,
      "num_input_tokens_seen": 623168,
      "step": 220
    },
    {
      "epoch": 0.9004502251125562,
      "grad_norm": 0.7650644779205322,
      "learning_rate": 3.961797195560118e-05,
      "loss": 0.916,
      "num_input_tokens_seen": 639680,
      "step": 225
    },
    {
      "epoch": 0.9204602301150575,
      "grad_norm": 0.9346507787704468,
      "learning_rate": 3.918830357596434e-05,
      "loss": 1.0282,
      "num_input_tokens_seen": 654512,
      "step": 230
    },
    {
      "epoch": 0.9404702351175588,
      "grad_norm": 0.9470961689949036,
      "learning_rate": 3.8752361651298675e-05,
      "loss": 1.0636,
      "num_input_tokens_seen": 668992,
      "step": 235
    },
    {
      "epoch": 0.9604802401200601,
      "grad_norm": 0.9078488349914551,
      "learning_rate": 3.8310338939059644e-05,
      "loss": 0.855,
      "num_input_tokens_seen": 683360,
      "step": 240
    },
    {
      "epoch": 0.9804902451225613,
      "grad_norm": 0.8685442209243774,
      "learning_rate": 3.7862430885402876e-05,
      "loss": 0.9286,
      "num_input_tokens_seen": 697456,
      "step": 245
    },
    {
      "epoch": 1.0005002501250626,
      "grad_norm": 1.0271894931793213,
      "learning_rate": 3.740883553876515e-05,
      "loss": 0.9295,
      "num_input_tokens_seen": 713088,
      "step": 250
    },
    {
      "epoch": 1.0205102551275638,
      "grad_norm": 0.80521160364151,
      "learning_rate": 3.694975346229458e-05,
      "loss": 0.9945,
      "num_input_tokens_seen": 726176,
      "step": 255
    },
    {
      "epoch": 1.040520260130065,
      "grad_norm": 0.9811381101608276,
      "learning_rate": 3.6485387645169064e-05,
      "loss": 0.8799,
      "num_input_tokens_seen": 741664,
      "step": 260
    },
    {
      "epoch": 1.0605302651325663,
      "grad_norm": 0.8328830599784851,
      "learning_rate": 3.601594341284195e-05,
      "loss": 0.8551,
      "num_input_tokens_seen": 754720,
      "step": 265
    },
    {
      "epoch": 1.0805402701350675,
      "grad_norm": 0.9785313606262207,
      "learning_rate": 3.55416283362546e-05,
      "loss": 0.8381,
      "num_input_tokens_seen": 767568,
      "step": 270
    },
    {
      "epoch": 1.1005502751375689,
      "grad_norm": 1.0604971647262573,
      "learning_rate": 3.5062652140056275e-05,
      "loss": 0.8036,
      "num_input_tokens_seen": 784192,
      "step": 275
    },
    {
      "epoch": 1.12056028014007,
      "grad_norm": 0.9006226062774658,
      "learning_rate": 3.457922660987155e-05,
      "loss": 0.7851,
      "num_input_tokens_seen": 797328,
      "step": 280
    },
    {
      "epoch": 1.1405702851425712,
      "grad_norm": 0.6364954710006714,
      "learning_rate": 3.409156549865654e-05,
      "loss": 0.8262,
      "num_input_tokens_seen": 812416,
      "step": 285
    },
    {
      "epoch": 1.1605802901450726,
      "grad_norm": 1.1533681154251099,
      "learning_rate": 3.3599884432185225e-05,
      "loss": 0.9297,
      "num_input_tokens_seen": 825744,
      "step": 290
    },
    {
      "epoch": 1.1805902951475737,
      "grad_norm": 0.8428523540496826,
      "learning_rate": 3.310440081370767e-05,
      "loss": 1.0422,
      "num_input_tokens_seen": 840256,
      "step": 295
    },
    {
      "epoch": 1.2006003001500751,
      "grad_norm": 0.9648393392562866,
      "learning_rate": 3.260533372782234e-05,
      "loss": 0.8906,
      "num_input_tokens_seen": 854016,
      "step": 300
    },
    {
      "epoch": 1.2206103051525763,
      "grad_norm": 1.0610816478729248,
      "learning_rate": 3.2102903843604885e-05,
      "loss": 0.7934,
      "num_input_tokens_seen": 868592,
      "step": 305
    },
    {
      "epoch": 1.2406203101550775,
      "grad_norm": 1.2185319662094116,
      "learning_rate": 3.1597333317036545e-05,
      "loss": 0.7439,
      "num_input_tokens_seen": 881280,
      "step": 310
    },
    {
      "epoch": 1.2606303151575788,
      "grad_norm": 1.1018725633621216,
      "learning_rate": 3.10888456927748e-05,
      "loss": 0.841,
      "num_input_tokens_seen": 895168,
      "step": 315
    },
    {
      "epoch": 1.28064032016008,
      "grad_norm": 1.42940354347229,
      "learning_rate": 3.057766580531031e-05,
      "loss": 0.9016,
      "num_input_tokens_seen": 910624,
      "step": 320
    },
    {
      "epoch": 1.3006503251625814,
      "grad_norm": 1.1689488887786865,
      "learning_rate": 3.0064019679553274e-05,
      "loss": 0.8938,
      "num_input_tokens_seen": 923648,
      "step": 325
    },
    {
      "epoch": 1.3206603301650826,
      "grad_norm": 1.2434953451156616,
      "learning_rate": 2.9548134430893604e-05,
      "loss": 0.8739,
      "num_input_tokens_seen": 938256,
      "step": 330
    },
    {
      "epoch": 1.3406703351675837,
      "grad_norm": 0.6362343430519104,
      "learning_rate": 2.903023816477885e-05,
      "loss": 0.9641,
      "num_input_tokens_seen": 954000,
      "step": 335
    },
    {
      "epoch": 1.360680340170085,
      "grad_norm": 1.7914173603057861,
      "learning_rate": 2.8510559875854377e-05,
      "loss": 0.8616,
      "num_input_tokens_seen": 965744,
      "step": 340
    },
    {
      "epoch": 1.3806903451725863,
      "grad_norm": 1.1798431873321533,
      "learning_rate": 2.7989329346710375e-05,
      "loss": 0.7673,
      "num_input_tokens_seen": 980512,
      "step": 345
    },
    {
      "epoch": 1.4007003501750876,
      "grad_norm": 1.0572681427001953,
      "learning_rate": 2.7466777046280457e-05,
      "loss": 0.9637,
      "num_input_tokens_seen": 996224,
      "step": 350
    },
    {
      "epoch": 1.4207103551775888,
      "grad_norm": 1.3495503664016724,
      "learning_rate": 2.69431340279368e-05,
      "loss": 0.7466,
      "num_input_tokens_seen": 1008816,
      "step": 355
    },
    {
      "epoch": 1.44072036018009,
      "grad_norm": 0.7960165143013,
      "learning_rate": 2.6418631827326857e-05,
      "loss": 0.8695,
      "num_input_tokens_seen": 1024032,
      "step": 360
    },
    {
      "epoch": 1.4607303651825914,
      "grad_norm": 1.0253639221191406,
      "learning_rate": 2.5893502359996786e-05,
      "loss": 0.8742,
      "num_input_tokens_seen": 1035536,
      "step": 365
    },
    {
      "epoch": 1.4807403701850925,
      "grad_norm": 1.3461010456085205,
      "learning_rate": 2.5367977818847034e-05,
      "loss": 0.8879,
      "num_input_tokens_seen": 1048784,
      "step": 370
    },
    {
      "epoch": 1.500750375187594,
      "grad_norm": 0.9898460507392883,
      "learning_rate": 2.484229057146507e-05,
      "loss": 0.8188,
      "num_input_tokens_seen": 1063920,
      "step": 375
    },
    {
      "epoch": 1.520760380190095,
      "grad_norm": 1.0175529718399048,
      "learning_rate": 2.431667305738112e-05,
      "loss": 0.8898,
      "num_input_tokens_seen": 1080048,
      "step": 380
    },
    {
      "epoch": 1.5407703851925962,
      "grad_norm": 1.3844118118286133,
      "learning_rate": 2.3791357685291863e-05,
      "loss": 0.8779,
      "num_input_tokens_seen": 1093584,
      "step": 385
    },
    {
      "epoch": 1.5607803901950974,
      "grad_norm": 1.3372527360916138,
      "learning_rate": 2.3266576730297956e-05,
      "loss": 0.9372,
      "num_input_tokens_seen": 1108192,
      "step": 390
    },
    {
      "epoch": 1.5807903951975988,
      "grad_norm": 0.8785428404808044,
      "learning_rate": 2.274256223120051e-05,
      "loss": 0.7406,
      "num_input_tokens_seen": 1122368,
      "step": 395
    },
    {
      "epoch": 1.6008004002001002,
      "grad_norm": 1.2185471057891846,
      "learning_rate": 2.221954588790206e-05,
      "loss": 0.8414,
      "num_input_tokens_seen": 1135040,
      "step": 400
    },
    {
      "epoch": 1.6208104052026013,
      "grad_norm": 0.9093597531318665,
      "learning_rate": 2.1697758958957448e-05,
      "loss": 0.8781,
      "num_input_tokens_seen": 1150096,
      "step": 405
    },
    {
      "epoch": 1.6408204102051025,
      "grad_norm": 1.1658143997192383,
      "learning_rate": 2.1177432159319754e-05,
      "loss": 0.8304,
      "num_input_tokens_seen": 1163840,
      "step": 410
    },
    {
      "epoch": 1.6608304152076037,
      "grad_norm": 1.4369289875030518,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.9539,
      "num_input_tokens_seen": 1179024,
      "step": 415
    },
    {
      "epoch": 1.680840420210105,
      "grad_norm": 1.2581747770309448,
      "learning_rate": 2.014207847797256e-05,
      "loss": 0.7972,
      "num_input_tokens_seen": 1192800,
      "step": 420
    },
    {
      "epoch": 1.7008504252126064,
      "grad_norm": 1.6154969930648804,
      "learning_rate": 1.9627509391510086e-05,
      "loss": 0.9455,
      "num_input_tokens_seen": 1206160,
      "step": 425
    },
    {
      "epoch": 1.7208604302151076,
      "grad_norm": 0.9386204481124878,
      "learning_rate": 1.9115315822428437e-05,
      "loss": 0.742,
      "num_input_tokens_seen": 1219456,
      "step": 430
    },
    {
      "epoch": 1.7408704352176088,
      "grad_norm": 1.32277512550354,
      "learning_rate": 1.8605724243850502e-05,
      "loss": 0.9298,
      "num_input_tokens_seen": 1232848,
      "step": 435
    },
    {
      "epoch": 1.76088044022011,
      "grad_norm": 1.7241052389144897,
      "learning_rate": 1.809895997839482e-05,
      "loss": 0.8751,
      "num_input_tokens_seen": 1244944,
      "step": 440
    },
    {
      "epoch": 1.7808904452226113,
      "grad_norm": 1.2125686407089233,
      "learning_rate": 1.759524709854626e-05,
      "loss": 0.8162,
      "num_input_tokens_seen": 1259584,
      "step": 445
    },
    {
      "epoch": 1.8009004502251127,
      "grad_norm": 1.6025140285491943,
      "learning_rate": 1.70948083275794e-05,
      "loss": 0.9372,
      "num_input_tokens_seen": 1274640,
      "step": 450
    },
    {
      "epoch": 1.8209104552276139,
      "grad_norm": 0.9881381392478943,
      "learning_rate": 1.6597864941078552e-05,
      "loss": 0.9076,
      "num_input_tokens_seen": 1289936,
      "step": 455
    },
    {
      "epoch": 1.840920460230115,
      "grad_norm": 1.4844774007797241,
      "learning_rate": 1.6104636669097776e-05,
      "loss": 0.8189,
      "num_input_tokens_seen": 1303184,
      "step": 460
    },
    {
      "epoch": 1.8609304652326162,
      "grad_norm": 0.8360545635223389,
      "learning_rate": 1.561534159900441e-05,
      "loss": 0.8545,
      "num_input_tokens_seen": 1317920,
      "step": 465
    },
    {
      "epoch": 1.8809404702351176,
      "grad_norm": 1.1437807083129883,
      "learning_rate": 1.513019607904882e-05,
      "loss": 0.8544,
      "num_input_tokens_seen": 1332192,
      "step": 470
    },
    {
      "epoch": 1.900950475237619,
      "grad_norm": 1.3824429512023926,
      "learning_rate": 1.464941462270325e-05,
      "loss": 0.9191,
      "num_input_tokens_seen": 1348000,
      "step": 475
    },
    {
      "epoch": 1.9209604802401201,
      "grad_norm": 0.9468676447868347,
      "learning_rate": 1.4173209813811788e-05,
      "loss": 0.8605,
      "num_input_tokens_seen": 1362096,
      "step": 480
    },
    {
      "epoch": 1.9409704852426213,
      "grad_norm": 1.2974201440811157,
      "learning_rate": 1.3701792212593662e-05,
      "loss": 0.937,
      "num_input_tokens_seen": 1378656,
      "step": 485
    },
    {
      "epoch": 1.9609804902451224,
      "grad_norm": 1.684973955154419,
      "learning_rate": 1.3235370262541272e-05,
      "loss": 0.9073,
      "num_input_tokens_seen": 1393344,
      "step": 490
    },
    {
      "epoch": 1.9809904952476238,
      "grad_norm": 0.7046062350273132,
      "learning_rate": 1.277415019825417e-05,
      "loss": 0.8941,
      "num_input_tokens_seen": 1409280,
      "step": 495
    },
    {
      "epoch": 2.001000500250125,
      "grad_norm": 1.5292088985443115,
      "learning_rate": 1.2318335954249669e-05,
      "loss": 0.8051,
      "num_input_tokens_seen": 1423536,
      "step": 500
    },
    {
      "epoch": 2.0210105052526264,
      "grad_norm": 1.1964282989501953,
      "learning_rate": 1.1868129074790577e-05,
      "loss": 0.751,
      "num_input_tokens_seen": 1436048,
      "step": 505
    },
    {
      "epoch": 2.0410205102551275,
      "grad_norm": 1.2196362018585205,
      "learning_rate": 1.1423728624769695e-05,
      "loss": 0.762,
      "num_input_tokens_seen": 1450272,
      "step": 510
    },
    {
      "epoch": 2.0610305152576287,
      "grad_norm": 1.8068214654922485,
      "learning_rate": 1.098533110169071e-05,
      "loss": 0.8222,
      "num_input_tokens_seen": 1464656,
      "step": 515
    },
    {
      "epoch": 2.08104052026013,
      "grad_norm": 1.1758692264556885,
      "learning_rate": 1.0553130348784182e-05,
      "loss": 0.7271,
      "num_input_tokens_seen": 1478016,
      "step": 520
    },
    {
      "epoch": 2.1010505252626315,
      "grad_norm": 1.0688625574111938,
      "learning_rate": 1.0127317469297277e-05,
      "loss": 0.7618,
      "num_input_tokens_seen": 1492080,
      "step": 525
    },
    {
      "epoch": 2.1210605302651326,
      "grad_norm": 0.9757594466209412,
      "learning_rate": 9.708080741994868e-06,
      "loss": 0.7738,
      "num_input_tokens_seen": 1507504,
      "step": 530
    },
    {
      "epoch": 2.141070535267634,
      "grad_norm": 1.5281833410263062,
      "learning_rate": 9.295605537909708e-06,
      "loss": 0.7976,
      "num_input_tokens_seen": 1520080,
      "step": 535
    },
    {
      "epoch": 2.161080540270135,
      "grad_norm": 1.202627182006836,
      "learning_rate": 8.890074238378074e-06,
      "loss": 0.7181,
      "num_input_tokens_seen": 1531920,
      "step": 540
    },
    {
      "epoch": 2.181090545272636,
      "grad_norm": 1.7492619752883911,
      "learning_rate": 8.491666154397573e-06,
      "loss": 0.7375,
      "num_input_tokens_seen": 1545856,
      "step": 545
    },
    {
      "epoch": 2.2011005502751377,
      "grad_norm": 1.4915295839309692,
      "learning_rate": 8.100557447342327e-06,
      "loss": 0.7557,
      "num_input_tokens_seen": 1558256,
      "step": 550
    },
    {
      "epoch": 2.221110555277639,
      "grad_norm": 1.4687122106552124,
      "learning_rate": 7.71692105107098e-06,
      "loss": 0.7973,
      "num_input_tokens_seen": 1573776,
      "step": 555
    },
    {
      "epoch": 2.24112056028014,
      "grad_norm": 1.2812813520431519,
      "learning_rate": 7.340926595461687e-06,
      "loss": 0.881,
      "num_input_tokens_seen": 1589968,
      "step": 560
    },
    {
      "epoch": 2.2611305652826412,
      "grad_norm": 1.453517198562622,
      "learning_rate": 6.972740331408015e-06,
      "loss": 0.7725,
      "num_input_tokens_seen": 1603488,
      "step": 565
    },
    {
      "epoch": 2.2811405702851424,
      "grad_norm": 1.4879059791564941,
      "learning_rate": 6.612525057308949e-06,
      "loss": 0.8143,
      "num_input_tokens_seen": 1619136,
      "step": 570
    },
    {
      "epoch": 2.301150575287644,
      "grad_norm": 1.426224708557129,
      "learning_rate": 6.260440047085439e-06,
      "loss": 0.7383,
      "num_input_tokens_seen": 1635088,
      "step": 575
    },
    {
      "epoch": 2.321160580290145,
      "grad_norm": 1.5303268432617188,
      "learning_rate": 5.9166409797553415e-06,
      "loss": 0.9513,
      "num_input_tokens_seen": 1652560,
      "step": 580
    },
    {
      "epoch": 2.3411705852926463,
      "grad_norm": 1.0902361869812012,
      "learning_rate": 5.581279870597867e-06,
      "loss": 0.6249,
      "num_input_tokens_seen": 1665168,
      "step": 585
    },
    {
      "epoch": 2.3611805902951475,
      "grad_norm": 1.3195565938949585,
      "learning_rate": 5.254505003938043e-06,
      "loss": 0.7327,
      "num_input_tokens_seen": 1677312,
      "step": 590
    },
    {
      "epoch": 2.3811905952976486,
      "grad_norm": 1.9101403951644897,
      "learning_rate": 4.936460867580889e-06,
      "loss": 0.8425,
      "num_input_tokens_seen": 1690400,
      "step": 595
    },
    {
      "epoch": 2.4012006003001503,
      "grad_norm": 1.4452427625656128,
      "learning_rate": 4.627288088924156e-06,
      "loss": 0.8224,
      "num_input_tokens_seen": 1704640,
      "step": 600
    },
    {
      "epoch": 2.4212106053026514,
      "grad_norm": 1.2759140729904175,
      "learning_rate": 4.327123372778122e-06,
      "loss": 0.7743,
      "num_input_tokens_seen": 1717808,
      "step": 605
    },
    {
      "epoch": 2.4412206103051526,
      "grad_norm": 1.3661129474639893,
      "learning_rate": 4.036099440919763e-06,
      "loss": 0.6191,
      "num_input_tokens_seen": 1730688,
      "step": 610
    },
    {
      "epoch": 2.4612306153076537,
      "grad_norm": 1.3944261074066162,
      "learning_rate": 3.754344973408064e-06,
      "loss": 0.8898,
      "num_input_tokens_seen": 1745472,
      "step": 615
    },
    {
      "epoch": 2.481240620310155,
      "grad_norm": 1.5250520706176758,
      "learning_rate": 3.481984551686429e-06,
      "loss": 0.9432,
      "num_input_tokens_seen": 1761008,
      "step": 620
    },
    {
      "epoch": 2.5012506253126565,
      "grad_norm": 1.3213047981262207,
      "learning_rate": 3.2191386034973627e-06,
      "loss": 0.8175,
      "num_input_tokens_seen": 1774704,
      "step": 625
    },
    {
      "epoch": 2.5212606303151577,
      "grad_norm": 1.5623505115509033,
      "learning_rate": 2.9659233496337786e-06,
      "loss": 0.793,
      "num_input_tokens_seen": 1788768,
      "step": 630
    },
    {
      "epoch": 2.541270635317659,
      "grad_norm": 1.8235459327697754,
      "learning_rate": 2.722450752550429e-06,
      "loss": 0.8368,
      "num_input_tokens_seen": 1799968,
      "step": 635
    },
    {
      "epoch": 2.56128064032016,
      "grad_norm": 1.178887128829956,
      "learning_rate": 2.4888284668582285e-06,
      "loss": 0.8236,
      "num_input_tokens_seen": 1815008,
      "step": 640
    },
    {
      "epoch": 2.581290645322661,
      "grad_norm": 1.4837779998779297,
      "learning_rate": 2.265159791723373e-06,
      "loss": 0.7644,
      "num_input_tokens_seen": 1830400,
      "step": 645
    },
    {
      "epoch": 2.6013006503251628,
      "grad_norm": 1.1914581060409546,
      "learning_rate": 2.051543625192226e-06,
      "loss": 0.7604,
      "num_input_tokens_seen": 1844256,
      "step": 650
    },
    {
      "epoch": 2.621310655327664,
      "grad_norm": 1.1886558532714844,
      "learning_rate": 1.8480744204622757e-06,
      "loss": 0.8209,
      "num_input_tokens_seen": 1859024,
      "step": 655
    },
    {
      "epoch": 2.641320660330165,
      "grad_norm": 1.4688860177993774,
      "learning_rate": 1.6548421441183875e-06,
      "loss": 0.8396,
      "num_input_tokens_seen": 1874624,
      "step": 660
    },
    {
      "epoch": 2.6613306653326663,
      "grad_norm": 1.299116849899292,
      "learning_rate": 1.4719322363529242e-06,
      "loss": 0.7678,
      "num_input_tokens_seen": 1888064,
      "step": 665
    },
    {
      "epoch": 2.6813406703351674,
      "grad_norm": 1.663547396659851,
      "learning_rate": 1.2994255731871963e-06,
      "loss": 0.8765,
      "num_input_tokens_seen": 1902976,
      "step": 670
    },
    {
      "epoch": 2.701350675337669,
      "grad_norm": 1.2219780683517456,
      "learning_rate": 1.137398430711123e-06,
      "loss": 0.8124,
      "num_input_tokens_seen": 1916416,
      "step": 675
    },
    {
      "epoch": 2.72136068034017,
      "grad_norm": 1.8948079347610474,
      "learning_rate": 9.85922451356694e-07,
      "loss": 0.8009,
      "num_input_tokens_seen": 1931536,
      "step": 680
    },
    {
      "epoch": 2.7413706853426714,
      "grad_norm": 1.3554043769836426,
      "learning_rate": 8.450646122203865e-07,
      "loss": 0.8841,
      "num_input_tokens_seen": 1947072,
      "step": 685
    },
    {
      "epoch": 2.7613806903451725,
      "grad_norm": 1.5139766931533813,
      "learning_rate": 7.148871954483105e-07,
      "loss": 0.7737,
      "num_input_tokens_seen": 1960624,
      "step": 690
    },
    {
      "epoch": 2.7813906953476737,
      "grad_norm": 1.410873293876648,
      "learning_rate": 5.954477606973679e-07,
      "loss": 0.7897,
      "num_input_tokens_seen": 1975232,
      "step": 695
    },
    {
      "epoch": 2.8014007003501753,
      "grad_norm": 1.1572494506835938,
      "learning_rate": 4.867991196844918e-07,
      "loss": 0.7096,
      "num_input_tokens_seen": 1989760,
      "step": 700
    },
    {
      "epoch": 2.8214107053526765,
      "grad_norm": 1.0913457870483398,
      "learning_rate": 3.8898931283523344e-07,
      "loss": 0.8227,
      "num_input_tokens_seen": 2006720,
      "step": 705
    },
    {
      "epoch": 2.8414207103551776,
      "grad_norm": 1.4725265502929688,
      "learning_rate": 3.020615880420713e-07,
      "loss": 0.7987,
      "num_input_tokens_seen": 2021664,
      "step": 710
    },
    {
      "epoch": 2.861430715357679,
      "grad_norm": 1.424524188041687,
      "learning_rate": 2.2605438154179038e-07,
      "loss": 0.7853,
      "num_input_tokens_seen": 2037536,
      "step": 715
    },
    {
      "epoch": 2.88144072036018,
      "grad_norm": 1.1289780139923096,
      "learning_rate": 1.6100130092037703e-07,
      "loss": 0.7808,
      "num_input_tokens_seen": 2050432,
      "step": 720
    },
    {
      "epoch": 2.9014507253626816,
      "grad_norm": 1.3387763500213623,
      "learning_rate": 1.0693111025300017e-07,
      "loss": 0.7777,
      "num_input_tokens_seen": 2064080,
      "step": 725
    },
    {
      "epoch": 2.9214607303651827,
      "grad_norm": 1.7347784042358398,
      "learning_rate": 6.386771738558506e-08,
      "loss": 0.8181,
      "num_input_tokens_seen": 2079760,
      "step": 730
    },
    {
      "epoch": 2.941470735367684,
      "grad_norm": 1.5915528535842896,
      "learning_rate": 3.1830163363655296e-08,
      "loss": 0.7858,
      "num_input_tokens_seen": 2094000,
      "step": 735
    },
    {
      "epoch": 2.961480740370185,
      "grad_norm": 1.8485968112945557,
      "learning_rate": 1.0832614013073228e-08,
      "loss": 0.9016,
      "num_input_tokens_seen": 2108592,
      "step": 740
    },
    {
      "epoch": 2.981490745372686,
      "grad_norm": 1.8265576362609863,
      "learning_rate": 8.843536764419069e-10,
      "loss": 0.7275,
      "num_input_tokens_seen": 2121680,
      "step": 745
    },
    {
      "epoch": 2.989494747373687,
      "num_input_tokens_seen": 2127792,
      "step": 747,
      "total_flos": 9.608125026100838e+16,
      "train_loss": 0.8833092752709446,
      "train_runtime": 3405.3466,
      "train_samples_per_second": 3.521,
      "train_steps_per_second": 0.219
    }
  ],
  "logging_steps": 5,
  "max_steps": 747,
  "num_input_tokens_seen": 2127792,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.608125026100838e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}