{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.989494747373687, "eval_steps": 500, "global_step": 747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02001000500250125, "grad_norm": 1.2456663846969604, "learning_rate": 4.999447296060165e-05, "loss": 1.3318, "num_input_tokens_seen": 15648, "step": 5 }, { "epoch": 0.0400200100050025, "grad_norm": 0.8590693473815918, "learning_rate": 4.997789428625975e-05, "loss": 1.2773, "num_input_tokens_seen": 28720, "step": 10 }, { "epoch": 0.060030015007503754, "grad_norm": 0.8516742587089539, "learning_rate": 4.995027130745321e-05, "loss": 1.1401, "num_input_tokens_seen": 43808, "step": 15 }, { "epoch": 0.080040020010005, "grad_norm": 1.0357950925827026, "learning_rate": 4.99116162380454e-05, "loss": 1.089, "num_input_tokens_seen": 57472, "step": 20 }, { "epoch": 0.10005002501250625, "grad_norm": 0.744416356086731, "learning_rate": 4.986194616988364e-05, "loss": 1.1176, "num_input_tokens_seen": 71968, "step": 25 }, { "epoch": 0.12006003001500751, "grad_norm": 0.7087241411209106, "learning_rate": 4.980128306524183e-05, "loss": 0.9949, "num_input_tokens_seen": 85552, "step": 30 }, { "epoch": 0.14007003501750875, "grad_norm": 0.708070695400238, "learning_rate": 4.972965374710952e-05, "loss": 1.0264, "num_input_tokens_seen": 100512, "step": 35 }, { "epoch": 0.16008004002001, "grad_norm": 0.7232606410980225, "learning_rate": 4.964708988733178e-05, "loss": 1.0004, "num_input_tokens_seen": 113376, "step": 40 }, { "epoch": 0.18009004502251125, "grad_norm": 1.207189679145813, "learning_rate": 4.9553627992605066e-05, "loss": 1.1137, "num_input_tokens_seen": 127680, "step": 45 }, { "epoch": 0.2001000500250125, "grad_norm": 0.6889365315437317, "learning_rate": 4.944930938833535e-05, "loss": 0.9646, "num_input_tokens_seen": 143136, "step": 50 }, { "epoch": 0.22011005502751377, "grad_norm": 1.3263787031173706, "learning_rate": 4.9334180200365486e-05, "loss": 0.9757, "num_input_tokens_seen": 155488, "step": 55 }, { "epoch": 0.24012006003001501, "grad_norm": 1.0919948816299438, "learning_rate": 4.9208291334580104e-05, "loss": 0.9126, "num_input_tokens_seen": 171008, "step": 60 }, { "epoch": 0.26013006503251623, "grad_norm": 0.853539764881134, "learning_rate": 4.907169845439688e-05, "loss": 1.0119, "num_input_tokens_seen": 185536, "step": 65 }, { "epoch": 0.2801400700350175, "grad_norm": 0.9023884534835815, "learning_rate": 4.892446195615423e-05, "loss": 1.1192, "num_input_tokens_seen": 201728, "step": 70 }, { "epoch": 0.3001500750375188, "grad_norm": 1.3300387859344482, "learning_rate": 4.87666469424063e-05, "loss": 1.0167, "num_input_tokens_seen": 217584, "step": 75 }, { "epoch": 0.32016008004002, "grad_norm": 1.1315807104110718, "learning_rate": 4.859832319313697e-05, "loss": 0.8291, "num_input_tokens_seen": 230864, "step": 80 }, { "epoch": 0.3401700850425213, "grad_norm": 1.2459896802902222, "learning_rate": 4.841956513490577e-05, "loss": 0.9591, "num_input_tokens_seen": 245584, "step": 85 }, { "epoch": 0.3601800900450225, "grad_norm": 1.114758849143982, "learning_rate": 4.8230451807939135e-05, "loss": 0.9869, "num_input_tokens_seen": 259760, "step": 90 }, { "epoch": 0.38019009504752377, "grad_norm": 0.8792176842689514, "learning_rate": 4.803106683118177e-05, "loss": 1.0423, "num_input_tokens_seen": 274432, "step": 95 }, { "epoch": 0.400200100050025, "grad_norm": 1.6365342140197754, "learning_rate": 4.782149836532345e-05, "loss": 1.0122, "num_input_tokens_seen": 288256, "step": 100 }, { "epoch": 0.42021010505252626, "grad_norm": 1.2875956296920776, "learning_rate": 4.760183907381757e-05, "loss": 0.9849, "num_input_tokens_seen": 301136, "step": 105 }, { "epoch": 0.44022011005502754, "grad_norm": 0.8925891518592834, "learning_rate": 4.737218608190878e-05, "loss": 0.7958, "num_input_tokens_seen": 313712, "step": 110 }, { "epoch": 0.46023011505752875, "grad_norm": 0.9326754808425903, "learning_rate": 4.713264093368783e-05, "loss": 0.9773, "num_input_tokens_seen": 328496, "step": 115 }, { "epoch": 0.48024012006003003, "grad_norm": 0.9033737182617188, "learning_rate": 4.6883309547192476e-05, "loss": 0.9953, "num_input_tokens_seen": 342288, "step": 120 }, { "epoch": 0.5002501250625313, "grad_norm": 1.606924057006836, "learning_rate": 4.6624302167574436e-05, "loss": 0.9726, "num_input_tokens_seen": 356128, "step": 125 }, { "epoch": 0.5202601300650325, "grad_norm": 1.0410974025726318, "learning_rate": 4.635573331835302e-05, "loss": 0.9987, "num_input_tokens_seen": 368000, "step": 130 }, { "epoch": 0.5402701350675337, "grad_norm": 1.3536303043365479, "learning_rate": 4.607772175077711e-05, "loss": 1.012, "num_input_tokens_seen": 381216, "step": 135 }, { "epoch": 0.560280140070035, "grad_norm": 0.8227062225341797, "learning_rate": 4.5790390391317675e-05, "loss": 0.9104, "num_input_tokens_seen": 395296, "step": 140 }, { "epoch": 0.5802901450725363, "grad_norm": 1.2021853923797607, "learning_rate": 4.549386628731425e-05, "loss": 0.9087, "num_input_tokens_seen": 409312, "step": 145 }, { "epoch": 0.6003001500750376, "grad_norm": 0.7881823778152466, "learning_rate": 4.518828055079925e-05, "loss": 1.0859, "num_input_tokens_seen": 424064, "step": 150 }, { "epoch": 0.6203101550775387, "grad_norm": 0.6765316724777222, "learning_rate": 4.487376830052511e-05, "loss": 0.9696, "num_input_tokens_seen": 437264, "step": 155 }, { "epoch": 0.64032016008004, "grad_norm": 1.2535051107406616, "learning_rate": 4.4550468602219716e-05, "loss": 1.0014, "num_input_tokens_seen": 450624, "step": 160 }, { "epoch": 0.6603301650825413, "grad_norm": 0.7798222303390503, "learning_rate": 4.421852440709666e-05, "loss": 0.8877, "num_input_tokens_seen": 465968, "step": 165 }, { "epoch": 0.6803401700850426, "grad_norm": 0.9760732054710388, "learning_rate": 4.387808248864751e-05, "loss": 0.9694, "num_input_tokens_seen": 480720, "step": 170 }, { "epoch": 0.7003501750875438, "grad_norm": 0.7668250799179077, "learning_rate": 4.352929337774395e-05, "loss": 0.8613, "num_input_tokens_seen": 495008, "step": 175 }, { "epoch": 0.720360180090045, "grad_norm": 0.7338403463363647, "learning_rate": 4.3172311296078595e-05, "loss": 0.907, "num_input_tokens_seen": 509024, "step": 180 }, { "epoch": 0.7403701850925463, "grad_norm": 1.0990034341812134, "learning_rate": 4.2807294087973834e-05, "loss": 1.0501, "num_input_tokens_seen": 524752, "step": 185 }, { "epoch": 0.7603801900950475, "grad_norm": 1.2968486547470093, "learning_rate": 4.2434403150588895e-05, "loss": 0.9158, "num_input_tokens_seen": 537872, "step": 190 }, { "epoch": 0.7803901950975488, "grad_norm": 1.0896228551864624, "learning_rate": 4.205380336255594e-05, "loss": 0.9756, "num_input_tokens_seen": 552832, "step": 195 }, { "epoch": 0.80040020010005, "grad_norm": 1.2773741483688354, "learning_rate": 4.166566301107687e-05, "loss": 1.0702, "num_input_tokens_seen": 566224, "step": 200 }, { "epoch": 0.8204102051025512, "grad_norm": 1.0507806539535522, "learning_rate": 4.127015371751284e-05, "loss": 0.8236, "num_input_tokens_seen": 582560, "step": 205 }, { "epoch": 0.8404202101050525, "grad_norm": 0.7220691442489624, "learning_rate": 4.08674503614997e-05, "loss": 0.8779, "num_input_tokens_seen": 595568, "step": 210 }, { "epoch": 0.8604302151075538, "grad_norm": 1.8035520315170288, "learning_rate": 4.0457731003622606e-05, "loss": 0.9544, "num_input_tokens_seen": 609040, "step": 215 }, { "epoch": 0.8804402201100551, "grad_norm": 1.3739808797836304, "learning_rate": 4.004117680668422e-05, "loss": 0.8529, "num_input_tokens_seen": 623168, "step": 220 }, { "epoch": 0.9004502251125562, "grad_norm": 0.7650644779205322, "learning_rate": 3.961797195560118e-05, "loss": 0.916, "num_input_tokens_seen": 639680, "step": 225 }, { "epoch": 0.9204602301150575, "grad_norm": 0.9346507787704468, "learning_rate": 3.918830357596434e-05, "loss": 1.0282, "num_input_tokens_seen": 654512, "step": 230 }, { "epoch": 0.9404702351175588, "grad_norm": 0.9470961689949036, "learning_rate": 3.8752361651298675e-05, "loss": 1.0636, "num_input_tokens_seen": 668992, "step": 235 }, { "epoch": 0.9604802401200601, "grad_norm": 0.9078488349914551, "learning_rate": 3.8310338939059644e-05, "loss": 0.855, "num_input_tokens_seen": 683360, "step": 240 }, { "epoch": 0.9804902451225613, "grad_norm": 0.8685442209243774, "learning_rate": 3.7862430885402876e-05, "loss": 0.9286, "num_input_tokens_seen": 697456, "step": 245 }, { "epoch": 1.0005002501250626, "grad_norm": 1.0271894931793213, "learning_rate": 3.740883553876515e-05, "loss": 0.9295, "num_input_tokens_seen": 713088, "step": 250 }, { "epoch": 1.0205102551275638, "grad_norm": 0.80521160364151, "learning_rate": 3.694975346229458e-05, "loss": 0.9945, "num_input_tokens_seen": 726176, "step": 255 }, { "epoch": 1.040520260130065, "grad_norm": 0.9811381101608276, "learning_rate": 3.6485387645169064e-05, "loss": 0.8799, "num_input_tokens_seen": 741664, "step": 260 }, { "epoch": 1.0605302651325663, "grad_norm": 0.8328830599784851, "learning_rate": 3.601594341284195e-05, "loss": 0.8551, "num_input_tokens_seen": 754720, "step": 265 }, { "epoch": 1.0805402701350675, "grad_norm": 0.9785313606262207, "learning_rate": 3.55416283362546e-05, "loss": 0.8381, "num_input_tokens_seen": 767568, "step": 270 }, { "epoch": 1.1005502751375689, "grad_norm": 1.0604971647262573, "learning_rate": 3.5062652140056275e-05, "loss": 0.8036, "num_input_tokens_seen": 784192, "step": 275 }, { "epoch": 1.12056028014007, "grad_norm": 0.9006226062774658, "learning_rate": 3.457922660987155e-05, "loss": 0.7851, "num_input_tokens_seen": 797328, "step": 280 }, { "epoch": 1.1405702851425712, "grad_norm": 0.6364954710006714, "learning_rate": 3.409156549865654e-05, "loss": 0.8262, "num_input_tokens_seen": 812416, "step": 285 }, { "epoch": 1.1605802901450726, "grad_norm": 1.1533681154251099, "learning_rate": 3.3599884432185225e-05, "loss": 0.9297, "num_input_tokens_seen": 825744, "step": 290 }, { "epoch": 1.1805902951475737, "grad_norm": 0.8428523540496826, "learning_rate": 3.310440081370767e-05, "loss": 1.0422, "num_input_tokens_seen": 840256, "step": 295 }, { "epoch": 1.2006003001500751, "grad_norm": 0.9648393392562866, "learning_rate": 3.260533372782234e-05, "loss": 0.8906, "num_input_tokens_seen": 854016, "step": 300 }, { "epoch": 1.2206103051525763, "grad_norm": 1.0610816478729248, "learning_rate": 3.2102903843604885e-05, "loss": 0.7934, "num_input_tokens_seen": 868592, "step": 305 }, { "epoch": 1.2406203101550775, "grad_norm": 1.2185319662094116, "learning_rate": 3.1597333317036545e-05, "loss": 0.7439, "num_input_tokens_seen": 881280, "step": 310 }, { "epoch": 1.2606303151575788, "grad_norm": 1.1018725633621216, "learning_rate": 3.10888456927748e-05, "loss": 0.841, "num_input_tokens_seen": 895168, "step": 315 }, { "epoch": 1.28064032016008, "grad_norm": 1.42940354347229, "learning_rate": 3.057766580531031e-05, "loss": 0.9016, "num_input_tokens_seen": 910624, "step": 320 }, { "epoch": 1.3006503251625814, "grad_norm": 1.1689488887786865, "learning_rate": 3.0064019679553274e-05, "loss": 0.8938, "num_input_tokens_seen": 923648, "step": 325 }, { "epoch": 1.3206603301650826, "grad_norm": 1.2434953451156616, "learning_rate": 2.9548134430893604e-05, "loss": 0.8739, "num_input_tokens_seen": 938256, "step": 330 }, { "epoch": 1.3406703351675837, "grad_norm": 0.6362343430519104, "learning_rate": 2.903023816477885e-05, "loss": 0.9641, "num_input_tokens_seen": 954000, "step": 335 }, { "epoch": 1.360680340170085, "grad_norm": 1.7914173603057861, "learning_rate": 2.8510559875854377e-05, "loss": 0.8616, "num_input_tokens_seen": 965744, "step": 340 }, { "epoch": 1.3806903451725863, "grad_norm": 1.1798431873321533, "learning_rate": 2.7989329346710375e-05, "loss": 0.7673, "num_input_tokens_seen": 980512, "step": 345 }, { "epoch": 1.4007003501750876, "grad_norm": 1.0572681427001953, "learning_rate": 2.7466777046280457e-05, "loss": 0.9637, "num_input_tokens_seen": 996224, "step": 350 }, { "epoch": 1.4207103551775888, "grad_norm": 1.3495503664016724, "learning_rate": 2.69431340279368e-05, "loss": 0.7466, "num_input_tokens_seen": 1008816, "step": 355 }, { "epoch": 1.44072036018009, "grad_norm": 0.7960165143013, "learning_rate": 2.6418631827326857e-05, "loss": 0.8695, "num_input_tokens_seen": 1024032, "step": 360 }, { "epoch": 1.4607303651825914, "grad_norm": 1.0253639221191406, "learning_rate": 2.5893502359996786e-05, "loss": 0.8742, "num_input_tokens_seen": 1035536, "step": 365 }, { "epoch": 1.4807403701850925, "grad_norm": 1.3461010456085205, "learning_rate": 2.5367977818847034e-05, "loss": 0.8879, "num_input_tokens_seen": 1048784, "step": 370 }, { "epoch": 1.500750375187594, "grad_norm": 0.9898460507392883, "learning_rate": 2.484229057146507e-05, "loss": 0.8188, "num_input_tokens_seen": 1063920, "step": 375 }, { "epoch": 1.520760380190095, "grad_norm": 1.0175529718399048, "learning_rate": 2.431667305738112e-05, "loss": 0.8898, "num_input_tokens_seen": 1080048, "step": 380 }, { "epoch": 1.5407703851925962, "grad_norm": 1.3844118118286133, "learning_rate": 2.3791357685291863e-05, "loss": 0.8779, "num_input_tokens_seen": 1093584, "step": 385 }, { "epoch": 1.5607803901950974, "grad_norm": 1.3372527360916138, "learning_rate": 2.3266576730297956e-05, "loss": 0.9372, "num_input_tokens_seen": 1108192, "step": 390 }, { "epoch": 1.5807903951975988, "grad_norm": 0.8785428404808044, "learning_rate": 2.274256223120051e-05, "loss": 0.7406, "num_input_tokens_seen": 1122368, "step": 395 }, { "epoch": 1.6008004002001002, "grad_norm": 1.2185471057891846, "learning_rate": 2.221954588790206e-05, "loss": 0.8414, "num_input_tokens_seen": 1135040, "step": 400 }, { "epoch": 1.6208104052026013, "grad_norm": 0.9093597531318665, "learning_rate": 2.1697758958957448e-05, "loss": 0.8781, "num_input_tokens_seen": 1150096, "step": 405 }, { "epoch": 1.6408204102051025, "grad_norm": 1.1658143997192383, "learning_rate": 2.1177432159319754e-05, "loss": 0.8304, "num_input_tokens_seen": 1163840, "step": 410 }, { "epoch": 1.6608304152076037, "grad_norm": 1.4369289875030518, "learning_rate": 2.0658795558326743e-05, "loss": 0.9539, "num_input_tokens_seen": 1179024, "step": 415 }, { "epoch": 1.680840420210105, "grad_norm": 1.2581747770309448, "learning_rate": 2.014207847797256e-05, "loss": 0.7972, "num_input_tokens_seen": 1192800, "step": 420 }, { "epoch": 1.7008504252126064, "grad_norm": 1.6154969930648804, "learning_rate": 1.9627509391510086e-05, "loss": 0.9455, "num_input_tokens_seen": 1206160, "step": 425 }, { "epoch": 1.7208604302151076, "grad_norm": 0.9386204481124878, "learning_rate": 1.9115315822428437e-05, "loss": 0.742, "num_input_tokens_seen": 1219456, "step": 430 }, { "epoch": 1.7408704352176088, "grad_norm": 1.32277512550354, "learning_rate": 1.8605724243850502e-05, "loss": 0.9298, "num_input_tokens_seen": 1232848, "step": 435 }, { "epoch": 1.76088044022011, "grad_norm": 1.7241052389144897, "learning_rate": 1.809895997839482e-05, "loss": 0.8751, "num_input_tokens_seen": 1244944, "step": 440 }, { "epoch": 1.7808904452226113, "grad_norm": 1.2125686407089233, "learning_rate": 1.759524709854626e-05, "loss": 0.8162, "num_input_tokens_seen": 1259584, "step": 445 }, { "epoch": 1.8009004502251127, "grad_norm": 1.6025140285491943, "learning_rate": 1.70948083275794e-05, "loss": 0.9372, "num_input_tokens_seen": 1274640, "step": 450 }, { "epoch": 1.8209104552276139, "grad_norm": 0.9881381392478943, "learning_rate": 1.6597864941078552e-05, "loss": 0.9076, "num_input_tokens_seen": 1289936, "step": 455 }, { "epoch": 1.840920460230115, "grad_norm": 1.4844774007797241, "learning_rate": 1.6104636669097776e-05, "loss": 0.8189, "num_input_tokens_seen": 1303184, "step": 460 }, { "epoch": 1.8609304652326162, "grad_norm": 0.8360545635223389, "learning_rate": 1.561534159900441e-05, "loss": 0.8545, "num_input_tokens_seen": 1317920, "step": 465 }, { "epoch": 1.8809404702351176, "grad_norm": 1.1437807083129883, "learning_rate": 1.513019607904882e-05, "loss": 0.8544, "num_input_tokens_seen": 1332192, "step": 470 }, { "epoch": 1.900950475237619, "grad_norm": 1.3824429512023926, "learning_rate": 1.464941462270325e-05, "loss": 0.9191, "num_input_tokens_seen": 1348000, "step": 475 }, { "epoch": 1.9209604802401201, "grad_norm": 0.9468676447868347, "learning_rate": 1.4173209813811788e-05, "loss": 0.8605, "num_input_tokens_seen": 1362096, "step": 480 }, { "epoch": 1.9409704852426213, "grad_norm": 1.2974201440811157, "learning_rate": 1.3701792212593662e-05, "loss": 0.937, "num_input_tokens_seen": 1378656, "step": 485 }, { "epoch": 1.9609804902451224, "grad_norm": 1.684973955154419, "learning_rate": 1.3235370262541272e-05, "loss": 0.9073, "num_input_tokens_seen": 1393344, "step": 490 }, { "epoch": 1.9809904952476238, "grad_norm": 0.7046062350273132, "learning_rate": 1.277415019825417e-05, "loss": 0.8941, "num_input_tokens_seen": 1409280, "step": 495 }, { "epoch": 2.001000500250125, "grad_norm": 1.5292088985443115, "learning_rate": 1.2318335954249669e-05, "loss": 0.8051, "num_input_tokens_seen": 1423536, "step": 500 }, { "epoch": 2.0210105052526264, "grad_norm": 1.1964282989501953, "learning_rate": 1.1868129074790577e-05, "loss": 0.751, "num_input_tokens_seen": 1436048, "step": 505 }, { "epoch": 2.0410205102551275, "grad_norm": 1.2196362018585205, "learning_rate": 1.1423728624769695e-05, "loss": 0.762, "num_input_tokens_seen": 1450272, "step": 510 }, { "epoch": 2.0610305152576287, "grad_norm": 1.8068214654922485, "learning_rate": 1.098533110169071e-05, "loss": 0.8222, "num_input_tokens_seen": 1464656, "step": 515 }, { "epoch": 2.08104052026013, "grad_norm": 1.1758692264556885, "learning_rate": 1.0553130348784182e-05, "loss": 0.7271, "num_input_tokens_seen": 1478016, "step": 520 }, { "epoch": 2.1010505252626315, "grad_norm": 1.0688625574111938, "learning_rate": 1.0127317469297277e-05, "loss": 0.7618, "num_input_tokens_seen": 1492080, "step": 525 }, { "epoch": 2.1210605302651326, "grad_norm": 0.9757594466209412, "learning_rate": 9.708080741994868e-06, "loss": 0.7738, "num_input_tokens_seen": 1507504, "step": 530 }, { "epoch": 2.141070535267634, "grad_norm": 1.5281833410263062, "learning_rate": 9.295605537909708e-06, "loss": 0.7976, "num_input_tokens_seen": 1520080, "step": 535 }, { "epoch": 2.161080540270135, "grad_norm": 1.202627182006836, "learning_rate": 8.890074238378074e-06, "loss": 0.7181, "num_input_tokens_seen": 1531920, "step": 540 }, { "epoch": 2.181090545272636, "grad_norm": 1.7492619752883911, "learning_rate": 8.491666154397573e-06, "loss": 0.7375, "num_input_tokens_seen": 1545856, "step": 545 }, { "epoch": 2.2011005502751377, "grad_norm": 1.4915295839309692, "learning_rate": 8.100557447342327e-06, "loss": 0.7557, "num_input_tokens_seen": 1558256, "step": 550 }, { "epoch": 2.221110555277639, "grad_norm": 1.4687122106552124, "learning_rate": 7.71692105107098e-06, "loss": 0.7973, "num_input_tokens_seen": 1573776, "step": 555 }, { "epoch": 2.24112056028014, "grad_norm": 1.2812813520431519, "learning_rate": 7.340926595461687e-06, "loss": 0.881, "num_input_tokens_seen": 1589968, "step": 560 }, { "epoch": 2.2611305652826412, "grad_norm": 1.453517198562622, "learning_rate": 6.972740331408015e-06, "loss": 0.7725, "num_input_tokens_seen": 1603488, "step": 565 }, { "epoch": 2.2811405702851424, "grad_norm": 1.4879059791564941, "learning_rate": 6.612525057308949e-06, "loss": 0.8143, "num_input_tokens_seen": 1619136, "step": 570 }, { "epoch": 2.301150575287644, "grad_norm": 1.426224708557129, "learning_rate": 6.260440047085439e-06, "loss": 0.7383, "num_input_tokens_seen": 1635088, "step": 575 }, { "epoch": 2.321160580290145, "grad_norm": 1.5303268432617188, "learning_rate": 5.9166409797553415e-06, "loss": 0.9513, "num_input_tokens_seen": 1652560, "step": 580 }, { "epoch": 2.3411705852926463, "grad_norm": 1.0902361869812012, "learning_rate": 5.581279870597867e-06, "loss": 0.6249, "num_input_tokens_seen": 1665168, "step": 585 }, { "epoch": 2.3611805902951475, "grad_norm": 1.3195565938949585, "learning_rate": 5.254505003938043e-06, "loss": 0.7327, "num_input_tokens_seen": 1677312, "step": 590 }, { "epoch": 2.3811905952976486, "grad_norm": 1.9101403951644897, "learning_rate": 4.936460867580889e-06, "loss": 0.8425, "num_input_tokens_seen": 1690400, "step": 595 }, { "epoch": 2.4012006003001503, "grad_norm": 1.4452427625656128, "learning_rate": 4.627288088924156e-06, "loss": 0.8224, "num_input_tokens_seen": 1704640, "step": 600 }, { "epoch": 2.4212106053026514, "grad_norm": 1.2759140729904175, "learning_rate": 4.327123372778122e-06, "loss": 0.7743, "num_input_tokens_seen": 1717808, "step": 605 }, { "epoch": 2.4412206103051526, "grad_norm": 1.3661129474639893, "learning_rate": 4.036099440919763e-06, "loss": 0.6191, "num_input_tokens_seen": 1730688, "step": 610 }, { "epoch": 2.4612306153076537, "grad_norm": 1.3944261074066162, "learning_rate": 3.754344973408064e-06, "loss": 0.8898, "num_input_tokens_seen": 1745472, "step": 615 }, { "epoch": 2.481240620310155, "grad_norm": 1.5250520706176758, "learning_rate": 3.481984551686429e-06, "loss": 0.9432, "num_input_tokens_seen": 1761008, "step": 620 }, { "epoch": 2.5012506253126565, "grad_norm": 1.3213047981262207, "learning_rate": 3.2191386034973627e-06, "loss": 0.8175, "num_input_tokens_seen": 1774704, "step": 625 }, { "epoch": 2.5212606303151577, "grad_norm": 1.5623505115509033, "learning_rate": 2.9659233496337786e-06, "loss": 0.793, "num_input_tokens_seen": 1788768, "step": 630 }, { "epoch": 2.541270635317659, "grad_norm": 1.8235459327697754, "learning_rate": 2.722450752550429e-06, "loss": 0.8368, "num_input_tokens_seen": 1799968, "step": 635 }, { "epoch": 2.56128064032016, "grad_norm": 1.178887128829956, "learning_rate": 2.4888284668582285e-06, "loss": 0.8236, "num_input_tokens_seen": 1815008, "step": 640 }, { "epoch": 2.581290645322661, "grad_norm": 1.4837779998779297, "learning_rate": 2.265159791723373e-06, "loss": 0.7644, "num_input_tokens_seen": 1830400, "step": 645 }, { "epoch": 2.6013006503251628, "grad_norm": 1.1914581060409546, "learning_rate": 2.051543625192226e-06, "loss": 0.7604, "num_input_tokens_seen": 1844256, "step": 650 }, { "epoch": 2.621310655327664, "grad_norm": 1.1886558532714844, "learning_rate": 1.8480744204622757e-06, "loss": 0.8209, "num_input_tokens_seen": 1859024, "step": 655 }, { "epoch": 2.641320660330165, "grad_norm": 1.4688860177993774, "learning_rate": 1.6548421441183875e-06, "loss": 0.8396, "num_input_tokens_seen": 1874624, "step": 660 }, { "epoch": 2.6613306653326663, "grad_norm": 1.299116849899292, "learning_rate": 1.4719322363529242e-06, "loss": 0.7678, "num_input_tokens_seen": 1888064, "step": 665 }, { "epoch": 2.6813406703351674, "grad_norm": 1.663547396659851, "learning_rate": 1.2994255731871963e-06, "loss": 0.8765, "num_input_tokens_seen": 1902976, "step": 670 }, { "epoch": 2.701350675337669, "grad_norm": 1.2219780683517456, "learning_rate": 1.137398430711123e-06, "loss": 0.8124, "num_input_tokens_seen": 1916416, "step": 675 }, { "epoch": 2.72136068034017, "grad_norm": 1.8948079347610474, "learning_rate": 9.85922451356694e-07, "loss": 0.8009, "num_input_tokens_seen": 1931536, "step": 680 }, { "epoch": 2.7413706853426714, "grad_norm": 1.3554043769836426, "learning_rate": 8.450646122203865e-07, "loss": 0.8841, "num_input_tokens_seen": 1947072, "step": 685 }, { "epoch": 2.7613806903451725, "grad_norm": 1.5139766931533813, "learning_rate": 7.148871954483105e-07, "loss": 0.7737, "num_input_tokens_seen": 1960624, "step": 690 }, { "epoch": 2.7813906953476737, "grad_norm": 1.410873293876648, "learning_rate": 5.954477606973679e-07, "loss": 0.7897, "num_input_tokens_seen": 1975232, "step": 695 }, { "epoch": 2.8014007003501753, "grad_norm": 1.1572494506835938, "learning_rate": 4.867991196844918e-07, "loss": 0.7096, "num_input_tokens_seen": 1989760, "step": 700 }, { "epoch": 2.8214107053526765, "grad_norm": 1.0913457870483398, "learning_rate": 3.8898931283523344e-07, "loss": 0.8227, "num_input_tokens_seen": 2006720, "step": 705 }, { "epoch": 2.8414207103551776, "grad_norm": 1.4725265502929688, "learning_rate": 3.020615880420713e-07, "loss": 0.7987, "num_input_tokens_seen": 2021664, "step": 710 }, { "epoch": 2.861430715357679, "grad_norm": 1.424524188041687, "learning_rate": 2.2605438154179038e-07, "loss": 0.7853, "num_input_tokens_seen": 2037536, "step": 715 }, { "epoch": 2.88144072036018, "grad_norm": 1.1289780139923096, "learning_rate": 1.6100130092037703e-07, "loss": 0.7808, "num_input_tokens_seen": 2050432, "step": 720 }, { "epoch": 2.9014507253626816, "grad_norm": 1.3387763500213623, "learning_rate": 1.0693111025300017e-07, "loss": 0.7777, "num_input_tokens_seen": 2064080, "step": 725 }, { "epoch": 2.9214607303651827, "grad_norm": 1.7347784042358398, "learning_rate": 6.386771738558506e-08, "loss": 0.8181, "num_input_tokens_seen": 2079760, "step": 730 }, { "epoch": 2.941470735367684, "grad_norm": 1.5915528535842896, "learning_rate": 3.1830163363655296e-08, "loss": 0.7858, "num_input_tokens_seen": 2094000, "step": 735 }, { "epoch": 2.961480740370185, "grad_norm": 1.8485968112945557, "learning_rate": 1.0832614013073228e-08, "loss": 0.9016, "num_input_tokens_seen": 2108592, "step": 740 }, { "epoch": 2.981490745372686, "grad_norm": 1.8265576362609863, "learning_rate": 8.843536764419069e-10, "loss": 0.7275, "num_input_tokens_seen": 2121680, "step": 745 }, { "epoch": 2.989494747373687, "num_input_tokens_seen": 2127792, "step": 747, "total_flos": 9.608125026100838e+16, "train_loss": 0.8833092752709446, "train_runtime": 3405.3466, "train_samples_per_second": 3.521, "train_steps_per_second": 0.219 } ], "logging_steps": 5, "max_steps": 747, "num_input_tokens_seen": 2127792, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.608125026100838e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }