{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020181634712411706, "grad_norm": 61.64529800415039, "learning_rate": 2.0161290322580646e-06, "loss": 7.4781, "step": 10 }, { "epoch": 0.004036326942482341, "grad_norm": 12.924609184265137, "learning_rate": 4.032258064516129e-06, "loss": 5.3628, "step": 20 }, { "epoch": 0.006054490413723511, "grad_norm": 8.111140251159668, "learning_rate": 6.048387096774194e-06, "loss": 4.5188, "step": 30 }, { "epoch": 0.008072653884964682, "grad_norm": 8.318406105041504, "learning_rate": 8.064516129032258e-06, "loss": 4.2211, "step": 40 }, { "epoch": 0.010090817356205853, "grad_norm": 4.871032238006592, "learning_rate": 1.0080645161290323e-05, "loss": 3.844, "step": 50 }, { "epoch": 0.012108980827447022, "grad_norm": 4.057001113891602, "learning_rate": 1.2096774193548388e-05, "loss": 3.5377, "step": 60 }, { "epoch": 0.014127144298688193, "grad_norm": 3.9526875019073486, "learning_rate": 1.4112903225806454e-05, "loss": 3.203, "step": 70 }, { "epoch": 0.016145307769929364, "grad_norm": 3.171088695526123, "learning_rate": 1.6129032258064517e-05, "loss": 3.0832, "step": 80 }, { "epoch": 0.018163471241170535, "grad_norm": 3.63295841217041, "learning_rate": 1.8145161290322583e-05, "loss": 2.9575, "step": 90 }, { "epoch": 0.020181634712411706, "grad_norm": 2.8197014331817627, "learning_rate": 2.0161290322580645e-05, "loss": 2.831, "step": 100 }, { "epoch": 0.022199798183652877, "grad_norm": 3.540992259979248, "learning_rate": 2.217741935483871e-05, "loss": 2.6876, "step": 110 }, { "epoch": 0.024217961654894045, "grad_norm": 2.642745018005371, "learning_rate": 2.4193548387096777e-05, "loss": 2.6302, "step": 120 }, { "epoch": 0.026236125126135216, "grad_norm": 3.0355093479156494, "learning_rate": 2.620967741935484e-05, "loss": 2.5268, "step": 130 }, { "epoch": 0.028254288597376387, "grad_norm": 2.7031023502349854, "learning_rate": 2.822580645161291e-05, "loss": 2.5202, "step": 140 }, { "epoch": 0.030272452068617558, "grad_norm": 3.2656476497650146, "learning_rate": 3.024193548387097e-05, "loss": 2.3902, "step": 150 }, { "epoch": 0.03229061553985873, "grad_norm": 2.836116075515747, "learning_rate": 3.2258064516129034e-05, "loss": 2.4072, "step": 160 }, { "epoch": 0.034308779011099896, "grad_norm": 2.732985496520996, "learning_rate": 3.427419354838709e-05, "loss": 2.3159, "step": 170 }, { "epoch": 0.03632694248234107, "grad_norm": 2.5948660373687744, "learning_rate": 3.6290322580645165e-05, "loss": 2.2847, "step": 180 }, { "epoch": 0.03834510595358224, "grad_norm": 2.968618869781494, "learning_rate": 3.8306451612903224e-05, "loss": 2.3511, "step": 190 }, { "epoch": 0.04036326942482341, "grad_norm": 4.173515319824219, "learning_rate": 4.032258064516129e-05, "loss": 2.341, "step": 200 }, { "epoch": 0.04238143289606458, "grad_norm": 2.294019937515259, "learning_rate": 4.2338709677419356e-05, "loss": 2.2497, "step": 210 }, { "epoch": 0.044399596367305755, "grad_norm": 2.3338685035705566, "learning_rate": 4.435483870967742e-05, "loss": 2.2591, "step": 220 }, { "epoch": 0.04641775983854692, "grad_norm": 2.389831066131592, "learning_rate": 4.637096774193548e-05, "loss": 2.2448, "step": 230 }, { "epoch": 0.04843592330978809, "grad_norm": 2.5201873779296875, "learning_rate": 4.8387096774193554e-05, "loss": 2.272, "step": 240 }, { "epoch": 0.050454086781029264, "grad_norm": 3.078508138656616, "learning_rate": 5.040322580645161e-05, "loss": 2.1645, "step": 250 }, { "epoch": 0.05247225025227043, "grad_norm": 2.0750768184661865, "learning_rate": 5.241935483870968e-05, "loss": 2.2286, "step": 260 }, { "epoch": 0.054490413723511606, "grad_norm": 2.5442333221435547, "learning_rate": 5.443548387096774e-05, "loss": 2.2015, "step": 270 }, { "epoch": 0.056508577194752774, "grad_norm": 2.4685232639312744, "learning_rate": 5.645161290322582e-05, "loss": 2.1876, "step": 280 }, { "epoch": 0.05852674066599395, "grad_norm": 2.6516835689544678, "learning_rate": 5.8467741935483876e-05, "loss": 2.0738, "step": 290 }, { "epoch": 0.060544904137235116, "grad_norm": 2.234955310821533, "learning_rate": 6.048387096774194e-05, "loss": 2.1074, "step": 300 }, { "epoch": 0.06256306760847628, "grad_norm": 2.285836935043335, "learning_rate": 6.25e-05, "loss": 2.1229, "step": 310 }, { "epoch": 0.06458123107971746, "grad_norm": 2.150893449783325, "learning_rate": 6.451612903225807e-05, "loss": 2.0795, "step": 320 }, { "epoch": 0.06659939455095863, "grad_norm": 2.7134313583374023, "learning_rate": 6.653225806451613e-05, "loss": 2.0569, "step": 330 }, { "epoch": 0.06861755802219979, "grad_norm": 2.2563529014587402, "learning_rate": 6.854838709677419e-05, "loss": 2.054, "step": 340 }, { "epoch": 0.07063572149344097, "grad_norm": 2.307612419128418, "learning_rate": 7.056451612903226e-05, "loss": 2.2254, "step": 350 }, { "epoch": 0.07265388496468214, "grad_norm": 2.4022858142852783, "learning_rate": 7.258064516129033e-05, "loss": 2.0501, "step": 360 }, { "epoch": 0.07467204843592332, "grad_norm": 1.9709093570709229, "learning_rate": 7.45967741935484e-05, "loss": 2.0284, "step": 370 }, { "epoch": 0.07669021190716448, "grad_norm": 1.8165977001190186, "learning_rate": 7.661290322580645e-05, "loss": 2.0161, "step": 380 }, { "epoch": 0.07870837537840565, "grad_norm": 2.2313830852508545, "learning_rate": 7.862903225806451e-05, "loss": 2.0064, "step": 390 }, { "epoch": 0.08072653884964683, "grad_norm": 4.013941764831543, "learning_rate": 8.064516129032258e-05, "loss": 2.0056, "step": 400 }, { "epoch": 0.08274470232088799, "grad_norm": 2.2855305671691895, "learning_rate": 8.266129032258066e-05, "loss": 2.0044, "step": 410 }, { "epoch": 0.08476286579212916, "grad_norm": 1.9798551797866821, "learning_rate": 8.467741935483871e-05, "loss": 1.9908, "step": 420 }, { "epoch": 0.08678102926337034, "grad_norm": 2.0610132217407227, "learning_rate": 8.669354838709678e-05, "loss": 1.9845, "step": 430 }, { "epoch": 0.08879919273461151, "grad_norm": 2.354846477508545, "learning_rate": 8.870967741935484e-05, "loss": 2.0008, "step": 440 }, { "epoch": 0.09081735620585267, "grad_norm": 2.430293560028076, "learning_rate": 9.072580645161291e-05, "loss": 1.9855, "step": 450 }, { "epoch": 0.09283551967709384, "grad_norm": 2.4229743480682373, "learning_rate": 9.274193548387096e-05, "loss": 1.935, "step": 460 }, { "epoch": 0.09485368314833502, "grad_norm": 1.9692800045013428, "learning_rate": 9.475806451612904e-05, "loss": 2.1015, "step": 470 }, { "epoch": 0.09687184661957618, "grad_norm": 2.103461503982544, "learning_rate": 9.677419354838711e-05, "loss": 2.0086, "step": 480 }, { "epoch": 0.09889001009081735, "grad_norm": 1.9599565267562866, "learning_rate": 9.879032258064517e-05, "loss": 2.0889, "step": 490 }, { "epoch": 0.10090817356205853, "grad_norm": 2.226715087890625, "learning_rate": 9.999995545373623e-05, "loss": 1.9485, "step": 500 }, { "epoch": 0.1029263370332997, "grad_norm": 2.138767719268799, "learning_rate": 9.999945430918042e-05, "loss": 1.8391, "step": 510 }, { "epoch": 0.10494450050454086, "grad_norm": 1.9570167064666748, "learning_rate": 9.999839634283869e-05, "loss": 2.004, "step": 520 }, { "epoch": 0.10696266397578204, "grad_norm": 2.2599446773529053, "learning_rate": 9.999678156649317e-05, "loss": 2.0093, "step": 530 }, { "epoch": 0.10898082744702321, "grad_norm": 1.9920490980148315, "learning_rate": 9.999460999812691e-05, "loss": 1.8934, "step": 540 }, { "epoch": 0.11099899091826437, "grad_norm": 1.9292041063308716, "learning_rate": 9.999188166192368e-05, "loss": 1.8937, "step": 550 }, { "epoch": 0.11301715438950555, "grad_norm": 1.9816484451293945, "learning_rate": 9.998859658826777e-05, "loss": 1.9489, "step": 560 }, { "epoch": 0.11503531786074672, "grad_norm": 2.503206729888916, "learning_rate": 9.998475481374358e-05, "loss": 1.9871, "step": 570 }, { "epoch": 0.1170534813319879, "grad_norm": 1.8593742847442627, "learning_rate": 9.998035638113527e-05, "loss": 1.8784, "step": 580 }, { "epoch": 0.11907164480322906, "grad_norm": 2.2534918785095215, "learning_rate": 9.997540133942624e-05, "loss": 1.7789, "step": 590 }, { "epoch": 0.12108980827447023, "grad_norm": 1.7839148044586182, "learning_rate": 9.996988974379857e-05, "loss": 1.8833, "step": 600 }, { "epoch": 0.1231079717457114, "grad_norm": 1.9832431077957153, "learning_rate": 9.996382165563247e-05, "loss": 1.8618, "step": 610 }, { "epoch": 0.12512613521695257, "grad_norm": 1.8574587106704712, "learning_rate": 9.995719714250556e-05, "loss": 1.9742, "step": 620 }, { "epoch": 0.12714429868819374, "grad_norm": 2.3903088569641113, "learning_rate": 9.995001627819211e-05, "loss": 1.8395, "step": 630 }, { "epoch": 0.12916246215943492, "grad_norm": 1.8283036947250366, "learning_rate": 9.99422791426622e-05, "loss": 1.7997, "step": 640 }, { "epoch": 0.1311806256306761, "grad_norm": 2.3620362281799316, "learning_rate": 9.993398582208093e-05, "loss": 1.8091, "step": 650 }, { "epoch": 0.13319878910191726, "grad_norm": 1.9973320960998535, "learning_rate": 9.99251364088073e-05, "loss": 1.8977, "step": 660 }, { "epoch": 0.13521695257315844, "grad_norm": 2.1321332454681396, "learning_rate": 9.991573100139334e-05, "loss": 1.8635, "step": 670 }, { "epoch": 0.13723511604439959, "grad_norm": 2.079270362854004, "learning_rate": 9.990576970458285e-05, "loss": 1.8001, "step": 680 }, { "epoch": 0.13925327951564076, "grad_norm": 2.42185640335083, "learning_rate": 9.989525262931045e-05, "loss": 1.8191, "step": 690 }, { "epoch": 0.14127144298688193, "grad_norm": 1.972676396369934, "learning_rate": 9.988417989270011e-05, "loss": 1.8399, "step": 700 }, { "epoch": 0.1432896064581231, "grad_norm": 1.9847174882888794, "learning_rate": 9.987255161806402e-05, "loss": 1.7614, "step": 710 }, { "epoch": 0.14530776992936428, "grad_norm": 1.9572982788085938, "learning_rate": 9.986036793490112e-05, "loss": 1.8777, "step": 720 }, { "epoch": 0.14732593340060546, "grad_norm": 1.9790279865264893, "learning_rate": 9.984762897889568e-05, "loss": 1.8118, "step": 730 }, { "epoch": 0.14934409687184663, "grad_norm": 1.9155077934265137, "learning_rate": 9.983433489191581e-05, "loss": 1.8436, "step": 740 }, { "epoch": 0.15136226034308778, "grad_norm": 1.9012539386749268, "learning_rate": 9.98204858220119e-05, "loss": 1.8109, "step": 750 }, { "epoch": 0.15338042381432895, "grad_norm": 2.0491812229156494, "learning_rate": 9.980608192341488e-05, "loss": 1.8072, "step": 760 }, { "epoch": 0.15539858728557013, "grad_norm": 1.941206932067871, "learning_rate": 9.979112335653462e-05, "loss": 1.7898, "step": 770 }, { "epoch": 0.1574167507568113, "grad_norm": 2.0589675903320312, "learning_rate": 9.977561028795803e-05, "loss": 1.7524, "step": 780 }, { "epoch": 0.15943491422805248, "grad_norm": 1.9348995685577393, "learning_rate": 9.97595428904473e-05, "loss": 1.7864, "step": 790 }, { "epoch": 0.16145307769929365, "grad_norm": 1.8057056665420532, "learning_rate": 9.974292134293792e-05, "loss": 1.7851, "step": 800 }, { "epoch": 0.16347124117053483, "grad_norm": 1.8641736507415771, "learning_rate": 9.97257458305367e-05, "loss": 1.8094, "step": 810 }, { "epoch": 0.16548940464177597, "grad_norm": 1.713681936264038, "learning_rate": 9.970801654451973e-05, "loss": 1.7454, "step": 820 }, { "epoch": 0.16750756811301715, "grad_norm": 1.8647186756134033, "learning_rate": 9.968973368233022e-05, "loss": 1.7415, "step": 830 }, { "epoch": 0.16952573158425832, "grad_norm": 1.9415329694747925, "learning_rate": 9.96708974475763e-05, "loss": 1.7501, "step": 840 }, { "epoch": 0.1715438950554995, "grad_norm": 1.8891992568969727, "learning_rate": 9.965150805002878e-05, "loss": 1.8218, "step": 850 }, { "epoch": 0.17356205852674067, "grad_norm": 1.9533817768096924, "learning_rate": 9.963156570561878e-05, "loss": 1.6947, "step": 860 }, { "epoch": 0.17558022199798184, "grad_norm": 2.1692941188812256, "learning_rate": 9.96110706364354e-05, "loss": 1.784, "step": 870 }, { "epoch": 0.17759838546922302, "grad_norm": 2.229025363922119, "learning_rate": 9.959002307072312e-05, "loss": 1.7266, "step": 880 }, { "epoch": 0.17961654894046417, "grad_norm": 3.380396604537964, "learning_rate": 9.956842324287936e-05, "loss": 1.9071, "step": 890 }, { "epoch": 0.18163471241170534, "grad_norm": 1.9694714546203613, "learning_rate": 9.954627139345186e-05, "loss": 1.7442, "step": 900 }, { "epoch": 0.18365287588294651, "grad_norm": 1.9261081218719482, "learning_rate": 9.952356776913594e-05, "loss": 1.8254, "step": 910 }, { "epoch": 0.1856710393541877, "grad_norm": 1.6406139135360718, "learning_rate": 9.950031262277183e-05, "loss": 1.8027, "step": 920 }, { "epoch": 0.18768920282542886, "grad_norm": 1.874161720275879, "learning_rate": 9.947650621334179e-05, "loss": 1.8027, "step": 930 }, { "epoch": 0.18970736629667004, "grad_norm": 2.183828115463257, "learning_rate": 9.945214880596725e-05, "loss": 1.7398, "step": 940 }, { "epoch": 0.1917255297679112, "grad_norm": 1.954749584197998, "learning_rate": 9.94272406719059e-05, "loss": 1.7675, "step": 950 }, { "epoch": 0.19374369323915236, "grad_norm": 1.966210126876831, "learning_rate": 9.940178208854858e-05, "loss": 1.7841, "step": 960 }, { "epoch": 0.19576185671039353, "grad_norm": 2.180774688720703, "learning_rate": 9.937577333941626e-05, "loss": 1.6492, "step": 970 }, { "epoch": 0.1977800201816347, "grad_norm": 1.921655297279358, "learning_rate": 9.934921471415687e-05, "loss": 1.7661, "step": 980 }, { "epoch": 0.19979818365287588, "grad_norm": 1.8591227531433105, "learning_rate": 9.932210650854205e-05, "loss": 1.7543, "step": 990 }, { "epoch": 0.20181634712411706, "grad_norm": 1.6740055084228516, "learning_rate": 9.929444902446392e-05, "loss": 1.7578, "step": 1000 }, { "epoch": 0.20383451059535823, "grad_norm": 2.937347650527954, "learning_rate": 9.92662425699316e-05, "loss": 1.7566, "step": 1010 }, { "epoch": 0.2058526740665994, "grad_norm": 1.9954780340194702, "learning_rate": 9.923748745906789e-05, "loss": 1.8376, "step": 1020 }, { "epoch": 0.20787083753784055, "grad_norm": 2.204568386077881, "learning_rate": 9.920818401210574e-05, "loss": 1.7558, "step": 1030 }, { "epoch": 0.20988900100908173, "grad_norm": 2.1600749492645264, "learning_rate": 9.917833255538467e-05, "loss": 1.8082, "step": 1040 }, { "epoch": 0.2119071644803229, "grad_norm": 2.097487449645996, "learning_rate": 9.914793342134711e-05, "loss": 1.7874, "step": 1050 }, { "epoch": 0.21392532795156408, "grad_norm": 1.796174168586731, "learning_rate": 9.911698694853477e-05, "loss": 1.7015, "step": 1060 }, { "epoch": 0.21594349142280525, "grad_norm": 1.9651933908462524, "learning_rate": 9.908549348158485e-05, "loss": 1.8868, "step": 1070 }, { "epoch": 0.21796165489404642, "grad_norm": 2.083768367767334, "learning_rate": 9.905345337122609e-05, "loss": 1.8338, "step": 1080 }, { "epoch": 0.2199798183652876, "grad_norm": 1.953058123588562, "learning_rate": 9.902086697427504e-05, "loss": 1.792, "step": 1090 }, { "epoch": 0.22199798183652875, "grad_norm": 1.9650559425354004, "learning_rate": 9.8987734653632e-05, "loss": 1.6179, "step": 1100 }, { "epoch": 0.22401614530776992, "grad_norm": 1.7365285158157349, "learning_rate": 9.895405677827692e-05, "loss": 1.6512, "step": 1110 }, { "epoch": 0.2260343087790111, "grad_norm": 1.7468541860580444, "learning_rate": 9.89198337232654e-05, "loss": 1.6747, "step": 1120 }, { "epoch": 0.22805247225025227, "grad_norm": 1.967139720916748, "learning_rate": 9.888506586972446e-05, "loss": 1.7086, "step": 1130 }, { "epoch": 0.23007063572149344, "grad_norm": 1.824622631072998, "learning_rate": 9.884975360484827e-05, "loss": 1.887, "step": 1140 }, { "epoch": 0.23208879919273462, "grad_norm": 2.1145553588867188, "learning_rate": 9.881389732189392e-05, "loss": 1.7733, "step": 1150 }, { "epoch": 0.2341069626639758, "grad_norm": 1.6385475397109985, "learning_rate": 9.877749742017694e-05, "loss": 1.6961, "step": 1160 }, { "epoch": 0.23612512613521694, "grad_norm": 1.9020243883132935, "learning_rate": 9.874055430506691e-05, "loss": 1.6124, "step": 1170 }, { "epoch": 0.23814328960645811, "grad_norm": 1.817199945449829, "learning_rate": 9.870306838798297e-05, "loss": 1.6601, "step": 1180 }, { "epoch": 0.2401614530776993, "grad_norm": 2.001286268234253, "learning_rate": 9.866504008638917e-05, "loss": 1.7485, "step": 1190 }, { "epoch": 0.24217961654894046, "grad_norm": 1.9406825304031372, "learning_rate": 9.862646982378987e-05, "loss": 1.8185, "step": 1200 }, { "epoch": 0.24419778002018164, "grad_norm": 1.7781312465667725, "learning_rate": 9.8587358029725e-05, "loss": 1.751, "step": 1210 }, { "epoch": 0.2462159434914228, "grad_norm": 1.793482780456543, "learning_rate": 9.854770513976531e-05, "loss": 1.6768, "step": 1220 }, { "epoch": 0.248234106962664, "grad_norm": 1.9680509567260742, "learning_rate": 9.850751159550746e-05, "loss": 1.6726, "step": 1230 }, { "epoch": 0.25025227043390513, "grad_norm": 2.0250823497772217, "learning_rate": 9.846677784456918e-05, "loss": 1.7675, "step": 1240 }, { "epoch": 0.2522704339051463, "grad_norm": 1.7494395971298218, "learning_rate": 9.842550434058421e-05, "loss": 1.6584, "step": 1250 }, { "epoch": 0.2542885973763875, "grad_norm": 2.1690242290496826, "learning_rate": 9.838369154319728e-05, "loss": 1.802, "step": 1260 }, { "epoch": 0.25630676084762866, "grad_norm": 1.6883882284164429, "learning_rate": 9.8341339918059e-05, "loss": 1.5858, "step": 1270 }, { "epoch": 0.25832492431886983, "grad_norm": 1.9592852592468262, "learning_rate": 9.82984499368207e-05, "loss": 1.7107, "step": 1280 }, { "epoch": 0.260343087790111, "grad_norm": 1.9985136985778809, "learning_rate": 9.825502207712909e-05, "loss": 1.674, "step": 1290 }, { "epoch": 0.2623612512613522, "grad_norm": 1.6702767610549927, "learning_rate": 9.821105682262099e-05, "loss": 1.6821, "step": 1300 }, { "epoch": 0.26437941473259335, "grad_norm": 1.7378309965133667, "learning_rate": 9.816655466291803e-05, "loss": 1.7136, "step": 1310 }, { "epoch": 0.26639757820383453, "grad_norm": 1.7488417625427246, "learning_rate": 9.812151609362102e-05, "loss": 1.5944, "step": 1320 }, { "epoch": 0.2684157416750757, "grad_norm": 1.7831872701644897, "learning_rate": 9.807594161630458e-05, "loss": 1.575, "step": 1330 }, { "epoch": 0.2704339051463169, "grad_norm": 2.0005006790161133, "learning_rate": 9.802983173851149e-05, "loss": 1.6744, "step": 1340 }, { "epoch": 0.272452068617558, "grad_norm": 1.5925371646881104, "learning_rate": 9.798318697374702e-05, "loss": 1.6679, "step": 1350 }, { "epoch": 0.27447023208879917, "grad_norm": 1.6449369192123413, "learning_rate": 9.79360078414733e-05, "loss": 1.6395, "step": 1360 }, { "epoch": 0.27648839556004035, "grad_norm": 1.870029091835022, "learning_rate": 9.78882948671034e-05, "loss": 1.6746, "step": 1370 }, { "epoch": 0.2785065590312815, "grad_norm": 1.8943411111831665, "learning_rate": 9.784004858199563e-05, "loss": 1.7118, "step": 1380 }, { "epoch": 0.2805247225025227, "grad_norm": 1.9735866785049438, "learning_rate": 9.779126952344748e-05, "loss": 1.6215, "step": 1390 }, { "epoch": 0.28254288597376387, "grad_norm": 1.719292402267456, "learning_rate": 9.774195823468973e-05, "loss": 1.6028, "step": 1400 }, { "epoch": 0.28456104944500504, "grad_norm": 1.7490273714065552, "learning_rate": 9.769211526488038e-05, "loss": 1.5878, "step": 1410 }, { "epoch": 0.2865792129162462, "grad_norm": 1.621436357498169, "learning_rate": 9.764174116909852e-05, "loss": 1.7188, "step": 1420 }, { "epoch": 0.2885973763874874, "grad_norm": 1.7977508306503296, "learning_rate": 9.759083650833815e-05, "loss": 1.6966, "step": 1430 }, { "epoch": 0.29061553985872857, "grad_norm": 1.6182868480682373, "learning_rate": 9.753940184950192e-05, "loss": 1.6072, "step": 1440 }, { "epoch": 0.29263370332996974, "grad_norm": 2.094125747680664, "learning_rate": 9.748743776539488e-05, "loss": 1.7654, "step": 1450 }, { "epoch": 0.2946518668012109, "grad_norm": 2.0340657234191895, "learning_rate": 9.743494483471801e-05, "loss": 1.5987, "step": 1460 }, { "epoch": 0.2966700302724521, "grad_norm": 1.8374106884002686, "learning_rate": 9.738192364206185e-05, "loss": 1.6468, "step": 1470 }, { "epoch": 0.29868819374369326, "grad_norm": 1.8509396314620972, "learning_rate": 9.732837477789993e-05, "loss": 1.6514, "step": 1480 }, { "epoch": 0.3007063572149344, "grad_norm": 2.0533978939056396, "learning_rate": 9.727429883858227e-05, "loss": 1.6375, "step": 1490 }, { "epoch": 0.30272452068617556, "grad_norm": 1.7004354000091553, "learning_rate": 9.721969642632865e-05, "loss": 1.5852, "step": 1500 }, { "epoch": 0.30474268415741673, "grad_norm": 1.8703371286392212, "learning_rate": 9.716456814922196e-05, "loss": 1.5644, "step": 1510 }, { "epoch": 0.3067608476286579, "grad_norm": 1.6673510074615479, "learning_rate": 9.710891462120141e-05, "loss": 1.6704, "step": 1520 }, { "epoch": 0.3087790110998991, "grad_norm": 1.8909664154052734, "learning_rate": 9.70527364620557e-05, "loss": 1.7009, "step": 1530 }, { "epoch": 0.31079717457114026, "grad_norm": 1.6285436153411865, "learning_rate": 9.699603429741615e-05, "loss": 1.6874, "step": 1540 }, { "epoch": 0.31281533804238143, "grad_norm": 1.7128878831863403, "learning_rate": 9.693880875874961e-05, "loss": 1.8054, "step": 1550 }, { "epoch": 0.3148335015136226, "grad_norm": 2.0219228267669678, "learning_rate": 9.68810604833516e-05, "loss": 1.5689, "step": 1560 }, { "epoch": 0.3168516649848638, "grad_norm": 1.5529290437698364, "learning_rate": 9.682279011433908e-05, "loss": 1.7903, "step": 1570 }, { "epoch": 0.31886982845610495, "grad_norm": 1.7940829992294312, "learning_rate": 9.676399830064339e-05, "loss": 1.5716, "step": 1580 }, { "epoch": 0.32088799192734613, "grad_norm": 1.6057904958724976, "learning_rate": 9.670468569700288e-05, "loss": 1.6821, "step": 1590 }, { "epoch": 0.3229061553985873, "grad_norm": 1.8908942937850952, "learning_rate": 9.664485296395578e-05, "loss": 1.541, "step": 1600 }, { "epoch": 0.3249243188698285, "grad_norm": 1.8195271492004395, "learning_rate": 9.658450076783274e-05, "loss": 1.65, "step": 1610 }, { "epoch": 0.32694248234106965, "grad_norm": 6.518181800842285, "learning_rate": 9.652362978074947e-05, "loss": 1.6047, "step": 1620 }, { "epoch": 0.32896064581231077, "grad_norm": 1.5660922527313232, "learning_rate": 9.646224068059917e-05, "loss": 1.7102, "step": 1630 }, { "epoch": 0.33097880928355194, "grad_norm": 1.6389318704605103, "learning_rate": 9.640033415104508e-05, "loss": 1.6255, "step": 1640 }, { "epoch": 0.3329969727547931, "grad_norm": 1.6905416250228882, "learning_rate": 9.633791088151283e-05, "loss": 1.5718, "step": 1650 }, { "epoch": 0.3350151362260343, "grad_norm": 1.7866181135177612, "learning_rate": 9.627497156718271e-05, "loss": 1.7042, "step": 1660 }, { "epoch": 0.33703329969727547, "grad_norm": 1.7515119314193726, "learning_rate": 9.621151690898203e-05, "loss": 1.6239, "step": 1670 }, { "epoch": 0.33905146316851664, "grad_norm": 1.9881833791732788, "learning_rate": 9.614754761357718e-05, "loss": 1.7982, "step": 1680 }, { "epoch": 0.3410696266397578, "grad_norm": 1.8311055898666382, "learning_rate": 9.608306439336592e-05, "loss": 1.7399, "step": 1690 }, { "epoch": 0.343087790110999, "grad_norm": 1.688659906387329, "learning_rate": 9.60180679664693e-05, "loss": 1.6666, "step": 1700 }, { "epoch": 0.34510595358224017, "grad_norm": 1.8084384202957153, "learning_rate": 9.595255905672377e-05, "loss": 1.5487, "step": 1710 }, { "epoch": 0.34712411705348134, "grad_norm": 1.7272626161575317, "learning_rate": 9.588653839367302e-05, "loss": 1.551, "step": 1720 }, { "epoch": 0.3491422805247225, "grad_norm": 1.7780117988586426, "learning_rate": 9.582000671256e-05, "loss": 1.6598, "step": 1730 }, { "epoch": 0.3511604439959637, "grad_norm": 2.405978202819824, "learning_rate": 9.575296475431855e-05, "loss": 1.6297, "step": 1740 }, { "epoch": 0.35317860746720486, "grad_norm": 1.6499069929122925, "learning_rate": 9.568541326556527e-05, "loss": 1.5609, "step": 1750 }, { "epoch": 0.35519677093844604, "grad_norm": 1.7063804864883423, "learning_rate": 9.56173529985912e-05, "loss": 1.6281, "step": 1760 }, { "epoch": 0.35721493440968716, "grad_norm": 2.427915573120117, "learning_rate": 9.554878471135339e-05, "loss": 1.6166, "step": 1770 }, { "epoch": 0.35923309788092833, "grad_norm": 1.7606953382492065, "learning_rate": 9.547970916746649e-05, "loss": 1.6152, "step": 1780 }, { "epoch": 0.3612512613521695, "grad_norm": 1.7059513330459595, "learning_rate": 9.541012713619428e-05, "loss": 1.5078, "step": 1790 }, { "epoch": 0.3632694248234107, "grad_norm": 1.4295872449874878, "learning_rate": 9.5340039392441e-05, "loss": 1.5864, "step": 1800 }, { "epoch": 0.36528758829465185, "grad_norm": 1.8844209909439087, "learning_rate": 9.526944671674286e-05, "loss": 1.5401, "step": 1810 }, { "epoch": 0.36730575176589303, "grad_norm": 1.6048489809036255, "learning_rate": 9.51983498952592e-05, "loss": 1.7246, "step": 1820 }, { "epoch": 0.3693239152371342, "grad_norm": 2.0559234619140625, "learning_rate": 9.512674971976385e-05, "loss": 1.4579, "step": 1830 }, { "epoch": 0.3713420787083754, "grad_norm": 1.7094260454177856, "learning_rate": 9.505464698763629e-05, "loss": 1.5574, "step": 1840 }, { "epoch": 0.37336024217961655, "grad_norm": 1.5393606424331665, "learning_rate": 9.49820425018527e-05, "loss": 1.5389, "step": 1850 }, { "epoch": 0.3753784056508577, "grad_norm": 2.1417794227600098, "learning_rate": 9.49089370709771e-05, "loss": 1.6108, "step": 1860 }, { "epoch": 0.3773965691220989, "grad_norm": 1.6064629554748535, "learning_rate": 9.483533150915229e-05, "loss": 1.6211, "step": 1870 }, { "epoch": 0.3794147325933401, "grad_norm": 2.044330596923828, "learning_rate": 9.476122663609086e-05, "loss": 1.6693, "step": 1880 }, { "epoch": 0.38143289606458125, "grad_norm": 1.7574785947799683, "learning_rate": 9.468662327706594e-05, "loss": 1.5502, "step": 1890 }, { "epoch": 0.3834510595358224, "grad_norm": 1.8084129095077515, "learning_rate": 9.461152226290212e-05, "loss": 1.6606, "step": 1900 }, { "epoch": 0.3854692230070636, "grad_norm": 1.4581211805343628, "learning_rate": 9.453592442996614e-05, "loss": 1.5317, "step": 1910 }, { "epoch": 0.3874873864783047, "grad_norm": 1.8625178337097168, "learning_rate": 9.445983062015761e-05, "loss": 1.4944, "step": 1920 }, { "epoch": 0.3895055499495459, "grad_norm": 1.7608696222305298, "learning_rate": 9.43832416808996e-05, "loss": 1.5913, "step": 1930 }, { "epoch": 0.39152371342078707, "grad_norm": 1.9058195352554321, "learning_rate": 9.430615846512923e-05, "loss": 1.6255, "step": 1940 }, { "epoch": 0.39354187689202824, "grad_norm": 1.6042306423187256, "learning_rate": 9.422858183128808e-05, "loss": 1.6525, "step": 1950 }, { "epoch": 0.3955600403632694, "grad_norm": 1.6958731412887573, "learning_rate": 9.415051264331285e-05, "loss": 1.4745, "step": 1960 }, { "epoch": 0.3975782038345106, "grad_norm": 1.7777879238128662, "learning_rate": 9.407195177062549e-05, "loss": 1.5836, "step": 1970 }, { "epoch": 0.39959636730575177, "grad_norm": 1.7249339818954468, "learning_rate": 9.399290008812365e-05, "loss": 1.4844, "step": 1980 }, { "epoch": 0.40161453077699294, "grad_norm": 1.854981541633606, "learning_rate": 9.391335847617093e-05, "loss": 1.6211, "step": 1990 }, { "epoch": 0.4036326942482341, "grad_norm": 1.4041742086410522, "learning_rate": 9.383332782058705e-05, "loss": 1.6664, "step": 2000 }, { "epoch": 0.4056508577194753, "grad_norm": 1.605724573135376, "learning_rate": 9.375280901263796e-05, "loss": 1.5706, "step": 2010 }, { "epoch": 0.40766902119071646, "grad_norm": 1.7073047161102295, "learning_rate": 9.367180294902603e-05, "loss": 1.6047, "step": 2020 }, { "epoch": 0.40968718466195764, "grad_norm": 1.5172196626663208, "learning_rate": 9.359031053187988e-05, "loss": 1.5207, "step": 2030 }, { "epoch": 0.4117053481331988, "grad_norm": 1.9370439052581787, "learning_rate": 9.350833266874451e-05, "loss": 1.5746, "step": 2040 }, { "epoch": 0.41372351160444, "grad_norm": 2.1937825679779053, "learning_rate": 9.342587027257104e-05, "loss": 1.7112, "step": 2050 }, { "epoch": 0.4157416750756811, "grad_norm": 1.7659974098205566, "learning_rate": 9.334292426170672e-05, "loss": 1.6329, "step": 2060 }, { "epoch": 0.4177598385469223, "grad_norm": 1.5360724925994873, "learning_rate": 9.325949555988452e-05, "loss": 1.6289, "step": 2070 }, { "epoch": 0.41977800201816345, "grad_norm": 1.4664136171340942, "learning_rate": 9.317558509621296e-05, "loss": 1.6237, "step": 2080 }, { "epoch": 0.42179616548940463, "grad_norm": 1.5282961130142212, "learning_rate": 9.309119380516573e-05, "loss": 1.5247, "step": 2090 }, { "epoch": 0.4238143289606458, "grad_norm": 1.7818903923034668, "learning_rate": 9.300632262657128e-05, "loss": 1.6479, "step": 2100 }, { "epoch": 0.425832492431887, "grad_norm": 1.8099353313446045, "learning_rate": 9.292097250560232e-05, "loss": 1.692, "step": 2110 }, { "epoch": 0.42785065590312815, "grad_norm": 1.6518584489822388, "learning_rate": 9.283514439276539e-05, "loss": 1.5806, "step": 2120 }, { "epoch": 0.4298688193743693, "grad_norm": 1.6262339353561401, "learning_rate": 9.274883924389018e-05, "loss": 1.6018, "step": 2130 }, { "epoch": 0.4318869828456105, "grad_norm": 1.858011245727539, "learning_rate": 9.266205802011892e-05, "loss": 1.6162, "step": 2140 }, { "epoch": 0.4339051463168517, "grad_norm": 1.5540958642959595, "learning_rate": 9.257480168789565e-05, "loss": 1.5558, "step": 2150 }, { "epoch": 0.43592330978809285, "grad_norm": 1.5216200351715088, "learning_rate": 9.248707121895555e-05, "loss": 1.6317, "step": 2160 }, { "epoch": 0.437941473259334, "grad_norm": 2.030132532119751, "learning_rate": 9.239886759031398e-05, "loss": 1.457, "step": 2170 }, { "epoch": 0.4399596367305752, "grad_norm": 1.6100101470947266, "learning_rate": 9.231019178425573e-05, "loss": 1.6372, "step": 2180 }, { "epoch": 0.4419778002018164, "grad_norm": 1.6417200565338135, "learning_rate": 9.222104478832398e-05, "loss": 1.5867, "step": 2190 }, { "epoch": 0.4439959636730575, "grad_norm": 1.5133955478668213, "learning_rate": 9.213142759530936e-05, "loss": 1.5338, "step": 2200 }, { "epoch": 0.44601412714429867, "grad_norm": 1.543249249458313, "learning_rate": 9.204134120323883e-05, "loss": 1.6463, "step": 2210 }, { "epoch": 0.44803229061553984, "grad_norm": 2.0193662643432617, "learning_rate": 9.195078661536471e-05, "loss": 1.5299, "step": 2220 }, { "epoch": 0.450050454086781, "grad_norm": 1.4934704303741455, "learning_rate": 9.185976484015333e-05, "loss": 1.5422, "step": 2230 }, { "epoch": 0.4520686175580222, "grad_norm": 1.6684024333953857, "learning_rate": 9.176827689127389e-05, "loss": 1.62, "step": 2240 }, { "epoch": 0.45408678102926336, "grad_norm": 1.7710318565368652, "learning_rate": 9.167632378758719e-05, "loss": 1.4557, "step": 2250 }, { "epoch": 0.45610494450050454, "grad_norm": 1.717943549156189, "learning_rate": 9.158390655313422e-05, "loss": 1.601, "step": 2260 }, { "epoch": 0.4581231079717457, "grad_norm": 1.751428246498108, "learning_rate": 9.149102621712482e-05, "loss": 1.5032, "step": 2270 }, { "epoch": 0.4601412714429869, "grad_norm": 1.8777353763580322, "learning_rate": 9.139768381392616e-05, "loss": 1.6255, "step": 2280 }, { "epoch": 0.46215943491422806, "grad_norm": 1.7452517747879028, "learning_rate": 9.130388038305127e-05, "loss": 1.6209, "step": 2290 }, { "epoch": 0.46417759838546924, "grad_norm": 1.7831089496612549, "learning_rate": 9.12096169691474e-05, "loss": 1.6401, "step": 2300 }, { "epoch": 0.4661957618567104, "grad_norm": 1.591808795928955, "learning_rate": 9.111489462198448e-05, "loss": 1.5767, "step": 2310 }, { "epoch": 0.4682139253279516, "grad_norm": 1.6915345191955566, "learning_rate": 9.101971439644335e-05, "loss": 1.556, "step": 2320 }, { "epoch": 0.47023208879919276, "grad_norm": 1.8185473680496216, "learning_rate": 9.092407735250404e-05, "loss": 1.6477, "step": 2330 }, { "epoch": 0.4722502522704339, "grad_norm": 1.7397855520248413, "learning_rate": 9.082798455523396e-05, "loss": 1.4885, "step": 2340 }, { "epoch": 0.47426841574167505, "grad_norm": 1.7972854375839233, "learning_rate": 9.073143707477607e-05, "loss": 1.6802, "step": 2350 }, { "epoch": 0.47628657921291623, "grad_norm": 1.7079814672470093, "learning_rate": 9.063443598633688e-05, "loss": 1.5201, "step": 2360 }, { "epoch": 0.4783047426841574, "grad_norm": 1.7126438617706299, "learning_rate": 9.053698237017459e-05, "loss": 1.5861, "step": 2370 }, { "epoch": 0.4803229061553986, "grad_norm": 1.7915846109390259, "learning_rate": 9.043907731158699e-05, "loss": 1.5139, "step": 2380 }, { "epoch": 0.48234106962663975, "grad_norm": 1.5365029573440552, "learning_rate": 9.034072190089932e-05, "loss": 1.5428, "step": 2390 }, { "epoch": 0.4843592330978809, "grad_norm": 1.720226526260376, "learning_rate": 9.02419172334523e-05, "loss": 1.4767, "step": 2400 }, { "epoch": 0.4863773965691221, "grad_norm": 1.9600703716278076, "learning_rate": 9.014266440958974e-05, "loss": 1.6188, "step": 2410 }, { "epoch": 0.4883955600403633, "grad_norm": 1.8720204830169678, "learning_rate": 9.004296453464638e-05, "loss": 1.5432, "step": 2420 }, { "epoch": 0.49041372351160445, "grad_norm": 1.6241135597229004, "learning_rate": 8.994281871893562e-05, "loss": 1.5496, "step": 2430 }, { "epoch": 0.4924318869828456, "grad_norm": 1.8902565240859985, "learning_rate": 8.984222807773706e-05, "loss": 1.6235, "step": 2440 }, { "epoch": 0.4944500504540868, "grad_norm": 1.925911784172058, "learning_rate": 8.974119373128411e-05, "loss": 1.5734, "step": 2450 }, { "epoch": 0.496468213925328, "grad_norm": 2.0046586990356445, "learning_rate": 8.963971680475161e-05, "loss": 1.5009, "step": 2460 }, { "epoch": 0.49848637739656915, "grad_norm": 1.6803854703903198, "learning_rate": 8.95377984282431e-05, "loss": 1.5605, "step": 2470 }, { "epoch": 0.5005045408678103, "grad_norm": 1.5540610551834106, "learning_rate": 8.943543973677846e-05, "loss": 1.5961, "step": 2480 }, { "epoch": 0.5025227043390514, "grad_norm": 1.625119686126709, "learning_rate": 8.933264187028109e-05, "loss": 1.604, "step": 2490 }, { "epoch": 0.5045408678102926, "grad_norm": 1.7552522420883179, "learning_rate": 8.922940597356532e-05, "loss": 1.4812, "step": 2500 }, { "epoch": 0.5065590312815338, "grad_norm": 1.7819663286209106, "learning_rate": 8.912573319632367e-05, "loss": 1.6794, "step": 2510 }, { "epoch": 0.508577194752775, "grad_norm": 1.715986728668213, "learning_rate": 8.90216246931139e-05, "loss": 1.5846, "step": 2520 }, { "epoch": 0.5105953582240161, "grad_norm": 1.6441289186477661, "learning_rate": 8.891708162334635e-05, "loss": 1.5525, "step": 2530 }, { "epoch": 0.5126135216952573, "grad_norm": 1.7777501344680786, "learning_rate": 8.88121051512709e-05, "loss": 1.5136, "step": 2540 }, { "epoch": 0.5146316851664985, "grad_norm": 1.7654690742492676, "learning_rate": 8.870669644596402e-05, "loss": 1.574, "step": 2550 }, { "epoch": 0.5166498486377397, "grad_norm": 1.329132080078125, "learning_rate": 8.860085668131582e-05, "loss": 1.4299, "step": 2560 }, { "epoch": 0.5186680121089808, "grad_norm": 1.7770514488220215, "learning_rate": 8.84945870360169e-05, "loss": 1.6012, "step": 2570 }, { "epoch": 0.520686175580222, "grad_norm": 1.421560287475586, "learning_rate": 8.838788869354522e-05, "loss": 1.5075, "step": 2580 }, { "epoch": 0.5227043390514632, "grad_norm": 2.0325329303741455, "learning_rate": 8.828076284215301e-05, "loss": 1.4582, "step": 2590 }, { "epoch": 0.5247225025227044, "grad_norm": 2.303136110305786, "learning_rate": 8.817321067485343e-05, "loss": 1.6037, "step": 2600 }, { "epoch": 0.5267406659939455, "grad_norm": 1.5532690286636353, "learning_rate": 8.806523338940736e-05, "loss": 1.6264, "step": 2610 }, { "epoch": 0.5287588294651867, "grad_norm": 1.9267081022262573, "learning_rate": 8.795683218831001e-05, "loss": 1.6513, "step": 2620 }, { "epoch": 0.5307769929364279, "grad_norm": 1.4832985401153564, "learning_rate": 8.78480082787776e-05, "loss": 1.5968, "step": 2630 }, { "epoch": 0.5327951564076691, "grad_norm": 1.6407015323638916, "learning_rate": 8.773876287273377e-05, "loss": 1.6084, "step": 2640 }, { "epoch": 0.5348133198789102, "grad_norm": 1.6609562635421753, "learning_rate": 8.762909718679629e-05, "loss": 1.5557, "step": 2650 }, { "epoch": 0.5368314833501514, "grad_norm": 1.6559169292449951, "learning_rate": 8.751901244226332e-05, "loss": 1.623, "step": 2660 }, { "epoch": 0.5388496468213926, "grad_norm": 1.5652551651000977, "learning_rate": 8.740850986509994e-05, "loss": 1.4157, "step": 2670 }, { "epoch": 0.5408678102926338, "grad_norm": 1.9073171615600586, "learning_rate": 8.729759068592442e-05, "loss": 1.5152, "step": 2680 }, { "epoch": 0.5428859737638748, "grad_norm": 1.72683846950531, "learning_rate": 8.718625613999457e-05, "loss": 1.6011, "step": 2690 }, { "epoch": 0.544904137235116, "grad_norm": 1.3139292001724243, "learning_rate": 8.70745074671939e-05, "loss": 1.5194, "step": 2700 }, { "epoch": 0.5469223007063572, "grad_norm": 1.6831001043319702, "learning_rate": 8.696234591201793e-05, "loss": 1.5078, "step": 2710 }, { "epoch": 0.5489404641775983, "grad_norm": 1.5346101522445679, "learning_rate": 8.684977272356024e-05, "loss": 1.4988, "step": 2720 }, { "epoch": 0.5509586276488395, "grad_norm": 1.6062910556793213, "learning_rate": 8.673678915549855e-05, "loss": 1.6626, "step": 2730 }, { "epoch": 0.5529767911200807, "grad_norm": 1.7970744371414185, "learning_rate": 8.662339646608089e-05, "loss": 1.5251, "step": 2740 }, { "epoch": 0.5549949545913219, "grad_norm": 1.687828540802002, "learning_rate": 8.650959591811141e-05, "loss": 1.5361, "step": 2750 }, { "epoch": 0.557013118062563, "grad_norm": 1.8278907537460327, "learning_rate": 8.639538877893644e-05, "loss": 1.4754, "step": 2760 }, { "epoch": 0.5590312815338042, "grad_norm": 1.5675766468048096, "learning_rate": 8.628077632043032e-05, "loss": 1.5059, "step": 2770 }, { "epoch": 0.5610494450050454, "grad_norm": 1.5290266275405884, "learning_rate": 8.616575981898125e-05, "loss": 1.4684, "step": 2780 }, { "epoch": 0.5630676084762866, "grad_norm": 1.3652130365371704, "learning_rate": 8.605034055547709e-05, "loss": 1.4736, "step": 2790 }, { "epoch": 0.5650857719475277, "grad_norm": 1.4992094039916992, "learning_rate": 8.593451981529108e-05, "loss": 1.5559, "step": 2800 }, { "epoch": 0.5671039354187689, "grad_norm": 1.5727661848068237, "learning_rate": 8.581829888826754e-05, "loss": 1.5884, "step": 2810 }, { "epoch": 0.5691220988900101, "grad_norm": 1.8026494979858398, "learning_rate": 8.570167906870745e-05, "loss": 1.5782, "step": 2820 }, { "epoch": 0.5711402623612513, "grad_norm": 1.701768159866333, "learning_rate": 8.558466165535411e-05, "loss": 1.598, "step": 2830 }, { "epoch": 0.5731584258324924, "grad_norm": 1.6463897228240967, "learning_rate": 8.546724795137865e-05, "loss": 1.4889, "step": 2840 }, { "epoch": 0.5751765893037336, "grad_norm": 1.5795738697052002, "learning_rate": 8.534943926436554e-05, "loss": 1.5306, "step": 2850 }, { "epoch": 0.5771947527749748, "grad_norm": 1.5358326435089111, "learning_rate": 8.523123690629791e-05, "loss": 1.5801, "step": 2860 }, { "epoch": 0.579212916246216, "grad_norm": 1.566599726676941, "learning_rate": 8.511264219354313e-05, "loss": 1.4904, "step": 2870 }, { "epoch": 0.5812310797174571, "grad_norm": 1.3994381427764893, "learning_rate": 8.4993656446838e-05, "loss": 1.5097, "step": 2880 }, { "epoch": 0.5832492431886983, "grad_norm": 1.5844892263412476, "learning_rate": 8.48742809912741e-05, "loss": 1.561, "step": 2890 }, { "epoch": 0.5852674066599395, "grad_norm": 1.592240333557129, "learning_rate": 8.475451715628302e-05, "loss": 1.4621, "step": 2900 }, { "epoch": 0.5872855701311807, "grad_norm": 1.835154414176941, "learning_rate": 8.463436627562158e-05, "loss": 1.5978, "step": 2910 }, { "epoch": 0.5893037336024218, "grad_norm": 1.5988234281539917, "learning_rate": 8.451382968735693e-05, "loss": 1.4611, "step": 2920 }, { "epoch": 0.591321897073663, "grad_norm": 1.551164984703064, "learning_rate": 8.43929087338517e-05, "loss": 1.4682, "step": 2930 }, { "epoch": 0.5933400605449042, "grad_norm": 1.537986159324646, "learning_rate": 8.4271604761749e-05, "loss": 1.437, "step": 2940 }, { "epoch": 0.5953582240161454, "grad_norm": 1.5694961547851562, "learning_rate": 8.414991912195747e-05, "loss": 1.5569, "step": 2950 }, { "epoch": 0.5973763874873865, "grad_norm": 1.6652250289916992, "learning_rate": 8.402785316963618e-05, "loss": 1.4936, "step": 2960 }, { "epoch": 0.5993945509586277, "grad_norm": 1.690019965171814, "learning_rate": 8.390540826417964e-05, "loss": 1.4642, "step": 2970 }, { "epoch": 0.6014127144298688, "grad_norm": 1.532211422920227, "learning_rate": 8.378258576920253e-05, "loss": 1.5379, "step": 2980 }, { "epoch": 0.6034308779011099, "grad_norm": 1.6568650007247925, "learning_rate": 8.365938705252459e-05, "loss": 1.4731, "step": 2990 }, { "epoch": 0.6054490413723511, "grad_norm": 1.3985533714294434, "learning_rate": 8.353581348615538e-05, "loss": 1.5571, "step": 3000 }, { "epoch": 0.6074672048435923, "grad_norm": 1.7050758600234985, "learning_rate": 8.341186644627901e-05, "loss": 1.6194, "step": 3010 }, { "epoch": 0.6094853683148335, "grad_norm": 1.656028389930725, "learning_rate": 8.32875473132388e-05, "loss": 1.4723, "step": 3020 }, { "epoch": 0.6115035317860746, "grad_norm": 1.4241358041763306, "learning_rate": 8.316285747152189e-05, "loss": 1.4154, "step": 3030 }, { "epoch": 0.6135216952573158, "grad_norm": 1.6242072582244873, "learning_rate": 8.30377983097438e-05, "loss": 1.4525, "step": 3040 }, { "epoch": 0.615539858728557, "grad_norm": 1.6200363636016846, "learning_rate": 8.291237122063309e-05, "loss": 1.5451, "step": 3050 }, { "epoch": 0.6175580221997982, "grad_norm": 1.2775624990463257, "learning_rate": 8.27865776010157e-05, "loss": 1.454, "step": 3060 }, { "epoch": 0.6195761856710393, "grad_norm": 1.5615248680114746, "learning_rate": 8.266041885179949e-05, "loss": 1.4296, "step": 3070 }, { "epoch": 0.6215943491422805, "grad_norm": 1.7214854955673218, "learning_rate": 8.253389637795858e-05, "loss": 1.4538, "step": 3080 }, { "epoch": 0.6236125126135217, "grad_norm": 1.4547313451766968, "learning_rate": 8.240701158851778e-05, "loss": 1.4107, "step": 3090 }, { "epoch": 0.6256306760847629, "grad_norm": 1.4714813232421875, "learning_rate": 8.227976589653676e-05, "loss": 1.3942, "step": 3100 }, { "epoch": 0.627648839556004, "grad_norm": 1.4874024391174316, "learning_rate": 8.215216071909448e-05, "loss": 1.5679, "step": 3110 }, { "epoch": 0.6296670030272452, "grad_norm": 1.5745984315872192, "learning_rate": 8.202419747727333e-05, "loss": 1.4826, "step": 3120 }, { "epoch": 0.6316851664984864, "grad_norm": 1.5688875913619995, "learning_rate": 8.189587759614325e-05, "loss": 1.4611, "step": 3130 }, { "epoch": 0.6337033299697276, "grad_norm": 1.5299983024597168, "learning_rate": 8.176720250474594e-05, "loss": 1.4565, "step": 3140 }, { "epoch": 0.6357214934409687, "grad_norm": 1.5063962936401367, "learning_rate": 8.163817363607894e-05, "loss": 1.5174, "step": 3150 }, { "epoch": 0.6377396569122099, "grad_norm": 1.7101162672042847, "learning_rate": 8.150879242707962e-05, "loss": 1.4651, "step": 3160 }, { "epoch": 0.6397578203834511, "grad_norm": 1.6039749383926392, "learning_rate": 8.137906031860925e-05, "loss": 1.5918, "step": 3170 }, { "epoch": 0.6417759838546923, "grad_norm": 1.54131019115448, "learning_rate": 8.124897875543684e-05, "loss": 1.4304, "step": 3180 }, { "epoch": 0.6437941473259334, "grad_norm": 1.422153353691101, "learning_rate": 8.111854918622321e-05, "loss": 1.6108, "step": 3190 }, { "epoch": 0.6458123107971746, "grad_norm": 1.6443060636520386, "learning_rate": 8.098777306350469e-05, "loss": 1.4497, "step": 3200 }, { "epoch": 0.6478304742684158, "grad_norm": 1.5704275369644165, "learning_rate": 8.08566518436771e-05, "loss": 1.5172, "step": 3210 }, { "epoch": 0.649848637739657, "grad_norm": 1.7546700239181519, "learning_rate": 8.072518698697938e-05, "loss": 1.4498, "step": 3220 }, { "epoch": 0.6518668012108981, "grad_norm": 1.650565266609192, "learning_rate": 8.059337995747743e-05, "loss": 1.4536, "step": 3230 }, { "epoch": 0.6538849646821393, "grad_norm": 1.516605019569397, "learning_rate": 8.046123222304781e-05, "loss": 1.5499, "step": 3240 }, { "epoch": 0.6559031281533805, "grad_norm": 1.4544646739959717, "learning_rate": 8.032874525536131e-05, "loss": 1.4791, "step": 3250 }, { "epoch": 0.6579212916246215, "grad_norm": 1.590570092201233, "learning_rate": 8.019592052986665e-05, "loss": 1.3705, "step": 3260 }, { "epoch": 0.6599394550958627, "grad_norm": 1.3931959867477417, "learning_rate": 8.006275952577397e-05, "loss": 1.5409, "step": 3270 }, { "epoch": 0.6619576185671039, "grad_norm": 1.676604986190796, "learning_rate": 7.992926372603842e-05, "loss": 1.4835, "step": 3280 }, { "epoch": 0.6639757820383451, "grad_norm": 1.832785725593567, "learning_rate": 7.979543461734362e-05, "loss": 1.4715, "step": 3290 }, { "epoch": 0.6659939455095862, "grad_norm": 1.3548346757888794, "learning_rate": 7.966127369008512e-05, "loss": 1.4553, "step": 3300 }, { "epoch": 0.6680121089808274, "grad_norm": 1.397743821144104, "learning_rate": 7.952678243835376e-05, "loss": 1.4793, "step": 3310 }, { "epoch": 0.6700302724520686, "grad_norm": 1.5748425722122192, "learning_rate": 7.939196235991904e-05, "loss": 1.4791, "step": 3320 }, { "epoch": 0.6720484359233098, "grad_norm": 1.672951340675354, "learning_rate": 7.925681495621253e-05, "loss": 1.5467, "step": 3330 }, { "epoch": 0.6740665993945509, "grad_norm": 1.3464049100875854, "learning_rate": 7.912134173231098e-05, "loss": 1.4887, "step": 3340 }, { "epoch": 0.6760847628657921, "grad_norm": 1.761104941368103, "learning_rate": 7.898554419691974e-05, "loss": 1.4937, "step": 3350 }, { "epoch": 0.6781029263370333, "grad_norm": 1.5034103393554688, "learning_rate": 7.884942386235582e-05, "loss": 1.3636, "step": 3360 }, { "epoch": 0.6801210898082745, "grad_norm": 1.463498592376709, "learning_rate": 7.871298224453113e-05, "loss": 1.3987, "step": 3370 }, { "epoch": 0.6821392532795156, "grad_norm": 1.4649510383605957, "learning_rate": 7.857622086293557e-05, "loss": 1.5976, "step": 3380 }, { "epoch": 0.6841574167507568, "grad_norm": 1.6214101314544678, "learning_rate": 7.843914124062006e-05, "loss": 1.4602, "step": 3390 }, { "epoch": 0.686175580221998, "grad_norm": 1.752223253250122, "learning_rate": 7.830174490417972e-05, "loss": 1.4979, "step": 3400 }, { "epoch": 0.6881937436932392, "grad_norm": 1.4902223348617554, "learning_rate": 7.816403338373666e-05, "loss": 1.4157, "step": 3410 }, { "epoch": 0.6902119071644803, "grad_norm": 1.4046521186828613, "learning_rate": 7.802600821292314e-05, "loss": 1.3817, "step": 3420 }, { "epoch": 0.6922300706357215, "grad_norm": 1.5104862451553345, "learning_rate": 7.78876709288644e-05, "loss": 1.4425, "step": 3430 }, { "epoch": 0.6942482341069627, "grad_norm": 1.6517716646194458, "learning_rate": 7.774902307216148e-05, "loss": 1.5526, "step": 3440 }, { "epoch": 0.6962663975782039, "grad_norm": 1.43543541431427, "learning_rate": 7.76100661868742e-05, "loss": 1.5054, "step": 3450 }, { "epoch": 0.698284561049445, "grad_norm": 1.665265679359436, "learning_rate": 7.747080182050388e-05, "loss": 1.4123, "step": 3460 }, { "epoch": 0.7003027245206862, "grad_norm": 1.5892359018325806, "learning_rate": 7.733123152397609e-05, "loss": 1.4904, "step": 3470 }, { "epoch": 0.7023208879919274, "grad_norm": 1.5370794534683228, "learning_rate": 7.719135685162342e-05, "loss": 1.3999, "step": 3480 }, { "epoch": 0.7043390514631686, "grad_norm": 1.509031891822815, "learning_rate": 7.705117936116822e-05, "loss": 1.5462, "step": 3490 }, { "epoch": 0.7063572149344097, "grad_norm": 1.753881812095642, "learning_rate": 7.691070061370507e-05, "loss": 1.5522, "step": 3500 }, { "epoch": 0.7083753784056509, "grad_norm": 1.5018798112869263, "learning_rate": 7.676992217368364e-05, "loss": 1.4837, "step": 3510 }, { "epoch": 0.7103935418768921, "grad_norm": 1.5720365047454834, "learning_rate": 7.662884560889105e-05, "loss": 1.3675, "step": 3520 }, { "epoch": 0.7124117053481333, "grad_norm": 1.3165663480758667, "learning_rate": 7.648747249043457e-05, "loss": 1.5472, "step": 3530 }, { "epoch": 0.7144298688193743, "grad_norm": 1.523558497428894, "learning_rate": 7.634580439272401e-05, "loss": 1.5398, "step": 3540 }, { "epoch": 0.7164480322906155, "grad_norm": 1.283341884613037, "learning_rate": 7.620384289345425e-05, "loss": 1.5009, "step": 3550 }, { "epoch": 0.7184661957618567, "grad_norm": 1.6135417222976685, "learning_rate": 7.606158957358769e-05, "loss": 1.4926, "step": 3560 }, { "epoch": 0.7204843592330978, "grad_norm": 1.5714365243911743, "learning_rate": 7.591904601733655e-05, "loss": 1.5098, "step": 3570 }, { "epoch": 0.722502522704339, "grad_norm": 1.4282561540603638, "learning_rate": 7.577621381214529e-05, "loss": 1.4814, "step": 3580 }, { "epoch": 0.7245206861755802, "grad_norm": 1.384590744972229, "learning_rate": 7.563309454867295e-05, "loss": 1.4716, "step": 3590 }, { "epoch": 0.7265388496468214, "grad_norm": 1.3100378513336182, "learning_rate": 7.548968982077542e-05, "loss": 1.3972, "step": 3600 }, { "epoch": 0.7285570131180625, "grad_norm": 1.2014504671096802, "learning_rate": 7.534600122548765e-05, "loss": 1.392, "step": 3610 }, { "epoch": 0.7305751765893037, "grad_norm": 1.5689890384674072, "learning_rate": 7.520203036300588e-05, "loss": 1.4531, "step": 3620 }, { "epoch": 0.7325933400605449, "grad_norm": 1.7837287187576294, "learning_rate": 7.505777883666993e-05, "loss": 1.6061, "step": 3630 }, { "epoch": 0.7346115035317861, "grad_norm": 1.3635213375091553, "learning_rate": 7.491324825294514e-05, "loss": 1.4351, "step": 3640 }, { "epoch": 0.7366296670030272, "grad_norm": 1.3929572105407715, "learning_rate": 7.476844022140464e-05, "loss": 1.4991, "step": 3650 }, { "epoch": 0.7386478304742684, "grad_norm": 1.3147705793380737, "learning_rate": 7.462335635471136e-05, "loss": 1.4049, "step": 3660 }, { "epoch": 0.7406659939455096, "grad_norm": 1.5074125528335571, "learning_rate": 7.44779982686001e-05, "loss": 1.4351, "step": 3670 }, { "epoch": 0.7426841574167508, "grad_norm": 1.4625370502471924, "learning_rate": 7.43323675818595e-05, "loss": 1.3535, "step": 3680 }, { "epoch": 0.7447023208879919, "grad_norm": 1.6273128986358643, "learning_rate": 7.418646591631404e-05, "loss": 1.3886, "step": 3690 }, { "epoch": 0.7467204843592331, "grad_norm": 1.5632721185684204, "learning_rate": 7.404029489680598e-05, "loss": 1.4134, "step": 3700 }, { "epoch": 0.7487386478304743, "grad_norm": 1.491721510887146, "learning_rate": 7.389385615117723e-05, "loss": 1.4279, "step": 3710 }, { "epoch": 0.7507568113017155, "grad_norm": 1.5549407005310059, "learning_rate": 7.37471513102513e-05, "loss": 1.3888, "step": 3720 }, { "epoch": 0.7527749747729566, "grad_norm": 1.4470081329345703, "learning_rate": 7.360018200781502e-05, "loss": 1.4272, "step": 3730 }, { "epoch": 0.7547931382441978, "grad_norm": 1.686854600906372, "learning_rate": 7.345294988060046e-05, "loss": 1.5853, "step": 3740 }, { "epoch": 0.756811301715439, "grad_norm": 1.5734236240386963, "learning_rate": 7.330545656826662e-05, "loss": 1.44, "step": 3750 }, { "epoch": 0.7588294651866802, "grad_norm": 1.3938251733779907, "learning_rate": 7.315770371338126e-05, "loss": 1.3882, "step": 3760 }, { "epoch": 0.7608476286579213, "grad_norm": 1.5192281007766724, "learning_rate": 7.300969296140244e-05, "loss": 1.5221, "step": 3770 }, { "epoch": 0.7628657921291625, "grad_norm": 1.7236813306808472, "learning_rate": 7.286142596066044e-05, "loss": 1.4553, "step": 3780 }, { "epoch": 0.7648839556004037, "grad_norm": 1.402596116065979, "learning_rate": 7.271290436233916e-05, "loss": 1.4925, "step": 3790 }, { "epoch": 0.7669021190716448, "grad_norm": 1.5744677782058716, "learning_rate": 7.25641298204579e-05, "loss": 1.4484, "step": 3800 }, { "epoch": 0.768920282542886, "grad_norm": 1.640576720237732, "learning_rate": 7.241510399185287e-05, "loss": 1.4277, "step": 3810 }, { "epoch": 0.7709384460141272, "grad_norm": 1.6124114990234375, "learning_rate": 7.226582853615874e-05, "loss": 1.3545, "step": 3820 }, { "epoch": 0.7729566094853683, "grad_norm": 1.7142858505249023, "learning_rate": 7.211630511579015e-05, "loss": 1.4184, "step": 3830 }, { "epoch": 0.7749747729566094, "grad_norm": 1.3614004850387573, "learning_rate": 7.196653539592326e-05, "loss": 1.4101, "step": 3840 }, { "epoch": 0.7769929364278506, "grad_norm": 1.3301514387130737, "learning_rate": 7.181652104447711e-05, "loss": 1.4297, "step": 3850 }, { "epoch": 0.7790110998990918, "grad_norm": 1.441169023513794, "learning_rate": 7.166626373209514e-05, "loss": 1.4615, "step": 3860 }, { "epoch": 0.781029263370333, "grad_norm": 1.4002233743667603, "learning_rate": 7.15157651321265e-05, "loss": 1.5348, "step": 3870 }, { "epoch": 0.7830474268415741, "grad_norm": 1.4642239809036255, "learning_rate": 7.136502692060746e-05, "loss": 1.5266, "step": 3880 }, { "epoch": 0.7850655903128153, "grad_norm": 1.4477249383926392, "learning_rate": 7.121405077624276e-05, "loss": 1.468, "step": 3890 }, { "epoch": 0.7870837537840565, "grad_norm": 1.3571120500564575, "learning_rate": 7.106283838038685e-05, "loss": 1.472, "step": 3900 }, { "epoch": 0.7891019172552977, "grad_norm": 1.3332570791244507, "learning_rate": 7.091139141702527e-05, "loss": 1.3955, "step": 3910 }, { "epoch": 0.7911200807265388, "grad_norm": 1.5350067615509033, "learning_rate": 7.075971157275575e-05, "loss": 1.4683, "step": 3920 }, { "epoch": 0.79313824419778, "grad_norm": 1.4552041292190552, "learning_rate": 7.06078005367696e-05, "loss": 1.4189, "step": 3930 }, { "epoch": 0.7951564076690212, "grad_norm": 1.4249471426010132, "learning_rate": 7.045566000083278e-05, "loss": 1.4861, "step": 3940 }, { "epoch": 0.7971745711402624, "grad_norm": 1.3417631387710571, "learning_rate": 7.030329165926706e-05, "loss": 1.4106, "step": 3950 }, { "epoch": 0.7991927346115035, "grad_norm": 1.616866111755371, "learning_rate": 7.01506972089312e-05, "loss": 1.5251, "step": 3960 }, { "epoch": 0.8012108980827447, "grad_norm": 1.5517117977142334, "learning_rate": 6.999787834920202e-05, "loss": 1.388, "step": 3970 }, { "epoch": 0.8032290615539859, "grad_norm": 1.3729262351989746, "learning_rate": 6.984483678195553e-05, "loss": 1.4466, "step": 3980 }, { "epoch": 0.805247225025227, "grad_norm": 1.5104920864105225, "learning_rate": 6.969157421154789e-05, "loss": 1.488, "step": 3990 }, { "epoch": 0.8072653884964682, "grad_norm": 1.644098162651062, "learning_rate": 6.95380923447965e-05, "loss": 1.3932, "step": 4000 }, { "epoch": 0.8092835519677094, "grad_norm": 1.4966418743133545, "learning_rate": 6.938439289096095e-05, "loss": 1.4094, "step": 4010 }, { "epoch": 0.8113017154389506, "grad_norm": 1.6125532388687134, "learning_rate": 6.923047756172401e-05, "loss": 1.4399, "step": 4020 }, { "epoch": 0.8133198789101918, "grad_norm": 1.3540160655975342, "learning_rate": 6.907634807117257e-05, "loss": 1.3183, "step": 4030 }, { "epoch": 0.8153380423814329, "grad_norm": 1.390666127204895, "learning_rate": 6.892200613577852e-05, "loss": 1.5065, "step": 4040 }, { "epoch": 0.8173562058526741, "grad_norm": 1.4623992443084717, "learning_rate": 6.876745347437964e-05, "loss": 1.4661, "step": 4050 }, { "epoch": 0.8193743693239153, "grad_norm": 1.251976728439331, "learning_rate": 6.861269180816052e-05, "loss": 1.3724, "step": 4060 }, { "epoch": 0.8213925327951564, "grad_norm": 1.4126683473587036, "learning_rate": 6.845772286063332e-05, "loss": 1.4495, "step": 4070 }, { "epoch": 0.8234106962663976, "grad_norm": 1.3776429891586304, "learning_rate": 6.830254835761856e-05, "loss": 1.4128, "step": 4080 }, { "epoch": 0.8254288597376388, "grad_norm": 1.5709989070892334, "learning_rate": 6.814717002722602e-05, "loss": 1.5365, "step": 4090 }, { "epoch": 0.82744702320888, "grad_norm": 1.326343059539795, "learning_rate": 6.799158959983536e-05, "loss": 1.4633, "step": 4100 }, { "epoch": 0.829465186680121, "grad_norm": 1.4777864217758179, "learning_rate": 6.78358088080769e-05, "loss": 1.5106, "step": 4110 }, { "epoch": 0.8314833501513622, "grad_norm": 1.595955729484558, "learning_rate": 6.767982938681239e-05, "loss": 1.5467, "step": 4120 }, { "epoch": 0.8335015136226034, "grad_norm": 1.611190915107727, "learning_rate": 6.752365307311556e-05, "loss": 1.4587, "step": 4130 }, { "epoch": 0.8355196770938446, "grad_norm": 1.2045243978500366, "learning_rate": 6.736728160625284e-05, "loss": 1.5403, "step": 4140 }, { "epoch": 0.8375378405650857, "grad_norm": 1.5161255598068237, "learning_rate": 6.721071672766406e-05, "loss": 1.4287, "step": 4150 }, { "epoch": 0.8395560040363269, "grad_norm": 1.5501888990402222, "learning_rate": 6.705396018094297e-05, "loss": 1.4729, "step": 4160 }, { "epoch": 0.8415741675075681, "grad_norm": 1.350813388824463, "learning_rate": 6.689701371181781e-05, "loss": 1.3742, "step": 4170 }, { "epoch": 0.8435923309788093, "grad_norm": 1.5314749479293823, "learning_rate": 6.673987906813191e-05, "loss": 1.355, "step": 4180 }, { "epoch": 0.8456104944500504, "grad_norm": 1.5933685302734375, "learning_rate": 6.658255799982424e-05, "loss": 1.4609, "step": 4190 }, { "epoch": 0.8476286579212916, "grad_norm": 1.429506778717041, "learning_rate": 6.642505225890987e-05, "loss": 1.3823, "step": 4200 }, { "epoch": 0.8496468213925328, "grad_norm": 1.5170999765396118, "learning_rate": 6.626736359946052e-05, "loss": 1.4732, "step": 4210 }, { "epoch": 0.851664984863774, "grad_norm": 1.3810912370681763, "learning_rate": 6.610949377758497e-05, "loss": 1.4792, "step": 4220 }, { "epoch": 0.8536831483350151, "grad_norm": 1.5815091133117676, "learning_rate": 6.595144455140952e-05, "loss": 1.4539, "step": 4230 }, { "epoch": 0.8557013118062563, "grad_norm": 1.4099366664886475, "learning_rate": 6.579321768105845e-05, "loss": 1.459, "step": 4240 }, { "epoch": 0.8577194752774975, "grad_norm": 1.529607892036438, "learning_rate": 6.563481492863436e-05, "loss": 1.434, "step": 4250 }, { "epoch": 0.8597376387487387, "grad_norm": 1.4503751993179321, "learning_rate": 6.547623805819854e-05, "loss": 1.3988, "step": 4260 }, { "epoch": 0.8617558022199798, "grad_norm": 1.3069409132003784, "learning_rate": 6.531748883575143e-05, "loss": 1.4464, "step": 4270 }, { "epoch": 0.863773965691221, "grad_norm": 1.5749943256378174, "learning_rate": 6.51585690292128e-05, "loss": 1.3765, "step": 4280 }, { "epoch": 0.8657921291624622, "grad_norm": 1.7546286582946777, "learning_rate": 6.499948040840219e-05, "loss": 1.4531, "step": 4290 }, { "epoch": 0.8678102926337034, "grad_norm": 1.4653708934783936, "learning_rate": 6.484022474501914e-05, "loss": 1.421, "step": 4300 }, { "epoch": 0.8698284561049445, "grad_norm": 1.4347306489944458, "learning_rate": 6.468080381262347e-05, "loss": 1.4107, "step": 4310 }, { "epoch": 0.8718466195761857, "grad_norm": 1.6876280307769775, "learning_rate": 6.45212193866155e-05, "loss": 1.4524, "step": 4320 }, { "epoch": 0.8738647830474269, "grad_norm": 1.4129562377929688, "learning_rate": 6.436147324421635e-05, "loss": 1.3881, "step": 4330 }, { "epoch": 0.875882946518668, "grad_norm": 1.5428842306137085, "learning_rate": 6.420156716444805e-05, "loss": 1.3926, "step": 4340 }, { "epoch": 0.8779011099899092, "grad_norm": 1.4280976057052612, "learning_rate": 6.404150292811386e-05, "loss": 1.3836, "step": 4350 }, { "epoch": 0.8799192734611504, "grad_norm": 1.5553336143493652, "learning_rate": 6.388128231777828e-05, "loss": 1.5083, "step": 4360 }, { "epoch": 0.8819374369323916, "grad_norm": 1.3947172164916992, "learning_rate": 6.372090711774732e-05, "loss": 1.3669, "step": 4370 }, { "epoch": 0.8839556004036327, "grad_norm": 1.413853645324707, "learning_rate": 6.356037911404858e-05, "loss": 1.4119, "step": 4380 }, { "epoch": 0.8859737638748738, "grad_norm": 1.3096851110458374, "learning_rate": 6.339970009441137e-05, "loss": 1.391, "step": 4390 }, { "epoch": 0.887991927346115, "grad_norm": 1.2649918794631958, "learning_rate": 6.323887184824678e-05, "loss": 1.3865, "step": 4400 }, { "epoch": 0.8900100908173562, "grad_norm": 1.4351869821548462, "learning_rate": 6.307789616662778e-05, "loss": 1.3554, "step": 4410 }, { "epoch": 0.8920282542885973, "grad_norm": 1.4802148342132568, "learning_rate": 6.291677484226929e-05, "loss": 1.5499, "step": 4420 }, { "epoch": 0.8940464177598385, "grad_norm": 1.6230851411819458, "learning_rate": 6.275550966950814e-05, "loss": 1.4822, "step": 4430 }, { "epoch": 0.8960645812310797, "grad_norm": 1.442018985748291, "learning_rate": 6.259410244428318e-05, "loss": 1.383, "step": 4440 }, { "epoch": 0.8980827447023209, "grad_norm": 1.493252158164978, "learning_rate": 6.243255496411519e-05, "loss": 1.4048, "step": 4450 }, { "epoch": 0.900100908173562, "grad_norm": 1.5293428897857666, "learning_rate": 6.227086902808697e-05, "loss": 1.5437, "step": 4460 }, { "epoch": 0.9021190716448032, "grad_norm": 1.3244189023971558, "learning_rate": 6.210904643682318e-05, "loss": 1.4652, "step": 4470 }, { "epoch": 0.9041372351160444, "grad_norm": 1.4349359273910522, "learning_rate": 6.194708899247037e-05, "loss": 1.4024, "step": 4480 }, { "epoch": 0.9061553985872856, "grad_norm": 1.5854321718215942, "learning_rate": 6.178499849867689e-05, "loss": 1.4496, "step": 4490 }, { "epoch": 0.9081735620585267, "grad_norm": 1.4568113088607788, "learning_rate": 6.162277676057284e-05, "loss": 1.4378, "step": 4500 }, { "epoch": 0.9101917255297679, "grad_norm": 1.3261117935180664, "learning_rate": 6.146042558474987e-05, "loss": 1.4257, "step": 4510 }, { "epoch": 0.9122098890010091, "grad_norm": 1.4419591426849365, "learning_rate": 6.129794677924113e-05, "loss": 1.4045, "step": 4520 }, { "epoch": 0.9142280524722503, "grad_norm": 1.3974449634552002, "learning_rate": 6.113534215350116e-05, "loss": 1.4281, "step": 4530 }, { "epoch": 0.9162462159434914, "grad_norm": 1.3392093181610107, "learning_rate": 6.097261351838569e-05, "loss": 1.5913, "step": 4540 }, { "epoch": 0.9182643794147326, "grad_norm": 1.3982704877853394, "learning_rate": 6.0809762686131474e-05, "loss": 1.391, "step": 4550 }, { "epoch": 0.9202825428859738, "grad_norm": 1.2605746984481812, "learning_rate": 6.064679147033614e-05, "loss": 1.4911, "step": 4560 }, { "epoch": 0.922300706357215, "grad_norm": 1.5178440809249878, "learning_rate": 6.0483701685937954e-05, "loss": 1.4428, "step": 4570 }, { "epoch": 0.9243188698284561, "grad_norm": 1.522623062133789, "learning_rate": 6.0320495149195644e-05, "loss": 1.395, "step": 4580 }, { "epoch": 0.9263370332996973, "grad_norm": 1.7341537475585938, "learning_rate": 6.015717367766815e-05, "loss": 1.4924, "step": 4590 }, { "epoch": 0.9283551967709385, "grad_norm": 1.4069305658340454, "learning_rate": 5.999373909019437e-05, "loss": 1.4476, "step": 4600 }, { "epoch": 0.9303733602421796, "grad_norm": 1.2393527030944824, "learning_rate": 5.9830193206872974e-05, "loss": 1.4227, "step": 4610 }, { "epoch": 0.9323915237134208, "grad_norm": 1.2959623336791992, "learning_rate": 5.966653784904207e-05, "loss": 1.4123, "step": 4620 }, { "epoch": 0.934409687184662, "grad_norm": 1.6350857019424438, "learning_rate": 5.950277483925889e-05, "loss": 1.4116, "step": 4630 }, { "epoch": 0.9364278506559032, "grad_norm": 1.1435925960540771, "learning_rate": 5.933890600127958e-05, "loss": 1.4417, "step": 4640 }, { "epoch": 0.9384460141271443, "grad_norm": 1.4868639707565308, "learning_rate": 5.917493316003884e-05, "loss": 1.4769, "step": 4650 }, { "epoch": 0.9404641775983855, "grad_norm": 1.4463169574737549, "learning_rate": 5.90108581416296e-05, "loss": 1.4507, "step": 4660 }, { "epoch": 0.9424823410696267, "grad_norm": 1.3405494689941406, "learning_rate": 5.8846682773282694e-05, "loss": 1.4446, "step": 4670 }, { "epoch": 0.9445005045408678, "grad_norm": 1.4846607446670532, "learning_rate": 5.868240888334653e-05, "loss": 1.439, "step": 4680 }, { "epoch": 0.9465186680121089, "grad_norm": 1.2763575315475464, "learning_rate": 5.851803830126666e-05, "loss": 1.5239, "step": 4690 }, { "epoch": 0.9485368314833501, "grad_norm": 1.274949550628662, "learning_rate": 5.835357285756552e-05, "loss": 1.374, "step": 4700 }, { "epoch": 0.9505549949545913, "grad_norm": 1.277209758758545, "learning_rate": 5.8189014383821914e-05, "loss": 1.3033, "step": 4710 }, { "epoch": 0.9525731584258325, "grad_norm": 1.395630955696106, "learning_rate": 5.8024364712650724e-05, "loss": 1.3979, "step": 4720 }, { "epoch": 0.9545913218970736, "grad_norm": 1.2895914316177368, "learning_rate": 5.785962567768243e-05, "loss": 1.376, "step": 4730 }, { "epoch": 0.9566094853683148, "grad_norm": 1.2383575439453125, "learning_rate": 5.769479911354273e-05, "loss": 1.4396, "step": 4740 }, { "epoch": 0.958627648839556, "grad_norm": 1.3582751750946045, "learning_rate": 5.7529886855832096e-05, "loss": 1.497, "step": 4750 }, { "epoch": 0.9606458123107972, "grad_norm": 1.4777917861938477, "learning_rate": 5.736489074110533e-05, "loss": 1.2924, "step": 4760 }, { "epoch": 0.9626639757820383, "grad_norm": 1.448093056678772, "learning_rate": 5.71998126068511e-05, "loss": 1.3533, "step": 4770 }, { "epoch": 0.9646821392532795, "grad_norm": 1.336315393447876, "learning_rate": 5.7034654291471524e-05, "loss": 1.4275, "step": 4780 }, { "epoch": 0.9667003027245207, "grad_norm": 1.2323698997497559, "learning_rate": 5.686941763426161e-05, "loss": 1.4587, "step": 4790 }, { "epoch": 0.9687184661957619, "grad_norm": 1.4090189933776855, "learning_rate": 5.670410447538889e-05, "loss": 1.3976, "step": 4800 }, { "epoch": 0.970736629667003, "grad_norm": 1.471298336982727, "learning_rate": 5.653871665587278e-05, "loss": 1.4002, "step": 4810 }, { "epoch": 0.9727547931382442, "grad_norm": 1.2523037195205688, "learning_rate": 5.6373256017564215e-05, "loss": 1.3906, "step": 4820 }, { "epoch": 0.9747729566094854, "grad_norm": 1.441237211227417, "learning_rate": 5.620772440312508e-05, "loss": 1.3976, "step": 4830 }, { "epoch": 0.9767911200807265, "grad_norm": 1.7172768115997314, "learning_rate": 5.6042123656007685e-05, "loss": 1.4364, "step": 4840 }, { "epoch": 0.9788092835519677, "grad_norm": 1.4941967725753784, "learning_rate": 5.587645562043422e-05, "loss": 1.4107, "step": 4850 }, { "epoch": 0.9808274470232089, "grad_norm": 1.4575670957565308, "learning_rate": 5.5710722141376245e-05, "loss": 1.342, "step": 4860 }, { "epoch": 0.9828456104944501, "grad_norm": 1.235521674156189, "learning_rate": 5.5544925064534145e-05, "loss": 1.2921, "step": 4870 }, { "epoch": 0.9848637739656912, "grad_norm": 1.5723048448562622, "learning_rate": 5.537906623631657e-05, "loss": 1.5273, "step": 4880 }, { "epoch": 0.9868819374369324, "grad_norm": 1.4097591638565063, "learning_rate": 5.521314750381983e-05, "loss": 1.3714, "step": 4890 }, { "epoch": 0.9889001009081736, "grad_norm": 1.4014616012573242, "learning_rate": 5.5047170714807406e-05, "loss": 1.3598, "step": 4900 }, { "epoch": 0.9909182643794148, "grad_norm": 1.2477079629898071, "learning_rate": 5.4881137717689315e-05, "loss": 1.3501, "step": 4910 }, { "epoch": 0.992936427850656, "grad_norm": 1.3461527824401855, "learning_rate": 5.471505036150154e-05, "loss": 1.3813, "step": 4920 }, { "epoch": 0.9949545913218971, "grad_norm": 1.6173443794250488, "learning_rate": 5.454891049588544e-05, "loss": 1.5266, "step": 4930 }, { "epoch": 0.9969727547931383, "grad_norm": 1.351027011871338, "learning_rate": 5.438271997106712e-05, "loss": 1.395, "step": 4940 }, { "epoch": 0.9989909182643795, "grad_norm": 1.210114598274231, "learning_rate": 5.421648063783689e-05, "loss": 1.284, "step": 4950 } ], "logging_steps": 10, "max_steps": 9910, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 4955, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.387441374475059e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }