{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.29856985041650497,
  "eval_steps": 1000,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 2.9856985041650495e-05,
      "grad_norm": 8.064935684204102,
      "learning_rate": 0.0,
      "loss": 3.4849,
      "step": 1
    },
    {
      "epoch": 0.0014928492520825246,
      "grad_norm": 3.3433420658111572,
      "learning_rate": 0.00019998742849959144,
      "loss": 1.9038,
      "step": 50
    },
    {
      "epoch": 0.0029856985041650493,
      "grad_norm": 2.907883405685425,
      "learning_rate": 0.0001999731427036726,
      "loss": 1.608,
      "step": 100
    },
    {
      "epoch": 0.004478547756247574,
      "grad_norm": 3.2614288330078125,
      "learning_rate": 0.00019995885690775376,
      "loss": 1.5505,
      "step": 150
    },
    {
      "epoch": 0.005971397008330099,
      "grad_norm": 3.8400654792785645,
      "learning_rate": 0.00019994457111183493,
      "loss": 1.5737,
      "step": 200
    },
    {
      "epoch": 0.007464246260412624,
      "grad_norm": 3.3358442783355713,
      "learning_rate": 0.00019993028531591612,
      "loss": 1.567,
      "step": 250
    },
    {
      "epoch": 0.008957095512495149,
      "grad_norm": 2.131911277770996,
      "learning_rate": 0.00019991599951999726,
      "loss": 1.5208,
      "step": 300
    },
    {
      "epoch": 0.010449944764577673,
      "grad_norm": 3.180992364883423,
      "learning_rate": 0.00019990171372407845,
      "loss": 1.5586,
      "step": 350
    },
    {
      "epoch": 0.011942794016660197,
      "grad_norm": 3.024989128112793,
      "learning_rate": 0.0001998874279281596,
      "loss": 1.5267,
      "step": 400
    },
    {
      "epoch": 0.013435643268742723,
      "grad_norm": 3.4935102462768555,
      "learning_rate": 0.00019987314213224078,
      "loss": 1.5609,
      "step": 450
    },
    {
      "epoch": 0.014928492520825247,
      "grad_norm": 3.565504550933838,
      "learning_rate": 0.00019985885633632194,
      "loss": 1.5112,
      "step": 500
    },
    {
      "epoch": 0.01642134177290777,
      "grad_norm": 2.0692882537841797,
      "learning_rate": 0.00019984457054040308,
      "loss": 1.5087,
      "step": 550
    },
    {
      "epoch": 0.017914191024990297,
      "grad_norm": 4.110323905944824,
      "learning_rate": 0.00019983028474448427,
      "loss": 1.5073,
      "step": 600
    },
    {
      "epoch": 0.01940704027707282,
      "grad_norm": 2.8632736206054688,
      "learning_rate": 0.0001998159989485654,
      "loss": 1.472,
      "step": 650
    },
    {
      "epoch": 0.020899889529155346,
      "grad_norm": 2.629347324371338,
      "learning_rate": 0.0001998017131526466,
      "loss": 1.5236,
      "step": 700
    },
    {
      "epoch": 0.022392738781237872,
      "grad_norm": 3.696873188018799,
      "learning_rate": 0.00019978742735672774,
      "loss": 1.5216,
      "step": 750
    },
    {
      "epoch": 0.023885588033320394,
      "grad_norm": 3.253305435180664,
      "learning_rate": 0.00019977314156080893,
      "loss": 1.5915,
      "step": 800
    },
    {
      "epoch": 0.02537843728540292,
      "grad_norm": 2.9587886333465576,
      "learning_rate": 0.0001997588557648901,
      "loss": 1.4862,
      "step": 850
    },
    {
      "epoch": 0.026871286537485446,
      "grad_norm": 3.3311073780059814,
      "learning_rate": 0.00019974456996897126,
      "loss": 1.4703,
      "step": 900
    },
    {
      "epoch": 0.02836413578956797,
      "grad_norm": 2.5960264205932617,
      "learning_rate": 0.00019973028417305243,
      "loss": 1.4818,
      "step": 950
    },
    {
      "epoch": 0.029856985041650495,
      "grad_norm": 3.3142144680023193,
      "learning_rate": 0.0001997159983771336,
      "loss": 1.4746,
      "step": 1000
    },
    {
      "epoch": 0.03134983429373302,
      "grad_norm": 3.5049827098846436,
      "learning_rate": 0.00019970171258121476,
      "loss": 1.442,
      "step": 1050
    },
    {
      "epoch": 0.03284268354581554,
      "grad_norm": 3.0218605995178223,
      "learning_rate": 0.00019968742678529592,
      "loss": 1.5265,
      "step": 1100
    },
    {
      "epoch": 0.03433553279789807,
      "grad_norm": 2.936182975769043,
      "learning_rate": 0.00019967314098937709,
      "loss": 1.5174,
      "step": 1150
    },
    {
      "epoch": 0.035828382049980595,
      "grad_norm": 2.877253293991089,
      "learning_rate": 0.00019965885519345825,
      "loss": 1.4499,
      "step": 1200
    },
    {
      "epoch": 0.03732123130206312,
      "grad_norm": 6.07016658782959,
      "learning_rate": 0.00019964456939753941,
      "loss": 1.4542,
      "step": 1250
    },
    {
      "epoch": 0.03881408055414564,
      "grad_norm": 2.1618189811706543,
      "learning_rate": 0.0001996302836016206,
      "loss": 1.4343,
      "step": 1300
    },
    {
      "epoch": 0.040306929806228166,
      "grad_norm": 2.8267719745635986,
      "learning_rate": 0.00019961599780570174,
      "loss": 1.463,
      "step": 1350
    },
    {
      "epoch": 0.04179977905831069,
      "grad_norm": 2.6036462783813477,
      "learning_rate": 0.00019960171200978294,
      "loss": 1.4557,
      "step": 1400
    },
    {
      "epoch": 0.04329262831039322,
      "grad_norm": 3.0187127590179443,
      "learning_rate": 0.00019958742621386407,
      "loss": 1.4472,
      "step": 1450
    },
    {
      "epoch": 0.044785477562475744,
      "grad_norm": 3.9822633266448975,
      "learning_rate": 0.00019957314041794526,
      "loss": 1.4384,
      "step": 1500
    },
    {
      "epoch": 0.04627832681455826,
      "grad_norm": 2.919654607772827,
      "learning_rate": 0.0001995588546220264,
      "loss": 1.4969,
      "step": 1550
    },
    {
      "epoch": 0.04777117606664079,
      "grad_norm": 2.922963857650757,
      "learning_rate": 0.0001995445688261076,
      "loss": 1.4987,
      "step": 1600
    },
    {
      "epoch": 0.049264025318723315,
      "grad_norm": 2.9638512134552,
      "learning_rate": 0.00019953028303018876,
      "loss": 1.4722,
      "step": 1650
    },
    {
      "epoch": 0.05075687457080584,
      "grad_norm": 3.408391237258911,
      "learning_rate": 0.00019951599723426992,
      "loss": 1.4723,
      "step": 1700
    },
    {
      "epoch": 0.052249723822888366,
      "grad_norm": 3.023597240447998,
      "learning_rate": 0.0001995017114383511,
      "loss": 1.4478,
      "step": 1750
    },
    {
      "epoch": 0.05374257307497089,
      "grad_norm": 2.1655213832855225,
      "learning_rate": 0.00019948742564243225,
      "loss": 1.4071,
      "step": 1800
    },
    {
      "epoch": 0.05523542232705341,
      "grad_norm": 3.796663999557495,
      "learning_rate": 0.00019947313984651342,
      "loss": 1.446,
      "step": 1850
    },
    {
      "epoch": 0.05672827157913594,
      "grad_norm": 3.0415594577789307,
      "learning_rate": 0.00019945885405059458,
      "loss": 1.4324,
      "step": 1900
    },
    {
      "epoch": 0.05822112083121846,
      "grad_norm": 2.524627685546875,
      "learning_rate": 0.00019944456825467575,
      "loss": 1.3963,
      "step": 1950
    },
    {
      "epoch": 0.05971397008330099,
      "grad_norm": 3.2881991863250732,
      "learning_rate": 0.0001994302824587569,
      "loss": 1.4522,
      "step": 2000
    },
    {
      "epoch": 0.061206819335383515,
      "grad_norm": 3.392430067062378,
      "learning_rate": 0.00019941599666283808,
      "loss": 1.4329,
      "step": 2050
    },
    {
      "epoch": 0.06269966858746603,
      "grad_norm": 3.9426393508911133,
      "learning_rate": 0.00019940171086691927,
      "loss": 1.5203,
      "step": 2100
    },
    {
      "epoch": 0.06419251783954856,
      "grad_norm": 3.3737235069274902,
      "learning_rate": 0.0001993874250710004,
      "loss": 1.3674,
      "step": 2150
    },
    {
      "epoch": 0.06568536709163109,
      "grad_norm": 3.783085346221924,
      "learning_rate": 0.0001993731392750816,
      "loss": 1.4339,
      "step": 2200
    },
    {
      "epoch": 0.06717821634371361,
      "grad_norm": 3.4819202423095703,
      "learning_rate": 0.00019935885347916273,
      "loss": 1.4436,
      "step": 2250
    },
    {
      "epoch": 0.06867106559579614,
      "grad_norm": 3.141775608062744,
      "learning_rate": 0.00019934456768324393,
      "loss": 1.4683,
      "step": 2300
    },
    {
      "epoch": 0.07016391484787866,
      "grad_norm": 3.2881035804748535,
      "learning_rate": 0.00019933028188732506,
      "loss": 1.4395,
      "step": 2350
    },
    {
      "epoch": 0.07165676409996119,
      "grad_norm": 3.718122959136963,
      "learning_rate": 0.00019931599609140626,
      "loss": 1.4396,
      "step": 2400
    },
    {
      "epoch": 0.07314961335204372,
      "grad_norm": 4.3829474449157715,
      "learning_rate": 0.00019930171029548742,
      "loss": 1.4477,
      "step": 2450
    },
    {
      "epoch": 0.07464246260412624,
      "grad_norm": 3.3698525428771973,
      "learning_rate": 0.00019928742449956858,
      "loss": 1.3529,
      "step": 2500
    },
    {
      "epoch": 0.07613531185620875,
      "grad_norm": 3.7569565773010254,
      "learning_rate": 0.00019927313870364975,
      "loss": 1.4246,
      "step": 2550
    },
    {
      "epoch": 0.07762816110829128,
      "grad_norm": 3.1486406326293945,
      "learning_rate": 0.00019925885290773091,
      "loss": 1.3813,
      "step": 2600
    },
    {
      "epoch": 0.0791210103603738,
      "grad_norm": 4.0635480880737305,
      "learning_rate": 0.00019924456711181208,
      "loss": 1.4259,
      "step": 2650
    },
    {
      "epoch": 0.08061385961245633,
      "grad_norm": 3.2710611820220947,
      "learning_rate": 0.00019923028131589324,
      "loss": 1.3747,
      "step": 2700
    },
    {
      "epoch": 0.08210670886453886,
      "grad_norm": 3.4968345165252686,
      "learning_rate": 0.0001992159955199744,
      "loss": 1.4721,
      "step": 2750
    },
    {
      "epoch": 0.08359955811662138,
      "grad_norm": 4.274214267730713,
      "learning_rate": 0.00019920170972405557,
      "loss": 1.437,
      "step": 2800
    },
    {
      "epoch": 0.08509240736870391,
      "grad_norm": 2.970602512359619,
      "learning_rate": 0.00019918742392813674,
      "loss": 1.3336,
      "step": 2850
    },
    {
      "epoch": 0.08658525662078644,
      "grad_norm": 4.143342971801758,
      "learning_rate": 0.00019917313813221793,
      "loss": 1.4264,
      "step": 2900
    },
    {
      "epoch": 0.08807810587286896,
      "grad_norm": 3.7546920776367188,
      "learning_rate": 0.00019915885233629907,
      "loss": 1.441,
      "step": 2950
    },
    {
      "epoch": 0.08957095512495149,
      "grad_norm": 3.9160516262054443,
      "learning_rate": 0.00019914456654038026,
      "loss": 1.4261,
      "step": 3000
    },
    {
      "epoch": 0.09106380437703401,
      "grad_norm": 3.842073917388916,
      "learning_rate": 0.0001991302807444614,
      "loss": 1.4076,
      "step": 3050
    },
    {
      "epoch": 0.09255665362911653,
      "grad_norm": 4.392395496368408,
      "learning_rate": 0.0001991159949485426,
      "loss": 1.3789,
      "step": 3100
    },
    {
      "epoch": 0.09404950288119905,
      "grad_norm": 3.822425603866577,
      "learning_rate": 0.00019910170915262373,
      "loss": 1.3877,
      "step": 3150
    },
    {
      "epoch": 0.09554235213328158,
      "grad_norm": 3.1348562240600586,
      "learning_rate": 0.0001990874233567049,
      "loss": 1.4081,
      "step": 3200
    },
    {
      "epoch": 0.0970352013853641,
      "grad_norm": 3.453887939453125,
      "learning_rate": 0.00019907313756078608,
      "loss": 1.4143,
      "step": 3250
    },
    {
      "epoch": 0.09852805063744663,
      "grad_norm": 3.5057384967803955,
      "learning_rate": 0.00019905885176486722,
      "loss": 1.4264,
      "step": 3300
    },
    {
      "epoch": 0.10002089988952916,
      "grad_norm": 3.145796060562134,
      "learning_rate": 0.0001990445659689484,
      "loss": 1.4368,
      "step": 3350
    },
    {
      "epoch": 0.10151374914161168,
      "grad_norm": 3.4077043533325195,
      "learning_rate": 0.00019903028017302955,
      "loss": 1.388,
      "step": 3400
    },
    {
      "epoch": 0.10300659839369421,
      "grad_norm": 3.65567946434021,
      "learning_rate": 0.00019901599437711074,
      "loss": 1.42,
      "step": 3450
    },
    {
      "epoch": 0.10449944764577673,
      "grad_norm": 4.460702419281006,
      "learning_rate": 0.0001990017085811919,
      "loss": 1.3991,
      "step": 3500
    },
    {
      "epoch": 0.10599229689785926,
      "grad_norm": 4.155653476715088,
      "learning_rate": 0.00019898742278527307,
      "loss": 1.371,
      "step": 3550
    },
    {
      "epoch": 0.10748514614994178,
      "grad_norm": 3.8904318809509277,
      "learning_rate": 0.00019897313698935423,
      "loss": 1.4378,
      "step": 3600
    },
    {
      "epoch": 0.1089779954020243,
      "grad_norm": 4.0509233474731445,
      "learning_rate": 0.0001989588511934354,
      "loss": 1.3945,
      "step": 3650
    },
    {
      "epoch": 0.11047084465410682,
      "grad_norm": 3.785123109817505,
      "learning_rate": 0.00019894456539751656,
      "loss": 1.436,
      "step": 3700
    },
    {
      "epoch": 0.11196369390618935,
      "grad_norm": 3.4556167125701904,
      "learning_rate": 0.00019893027960159773,
      "loss": 1.3794,
      "step": 3750
    },
    {
      "epoch": 0.11345654315827187,
      "grad_norm": 4.0479559898376465,
      "learning_rate": 0.0001989159938056789,
      "loss": 1.4734,
      "step": 3800
    },
    {
      "epoch": 0.1149493924103544,
      "grad_norm": 3.890805721282959,
      "learning_rate": 0.00019890170800976006,
      "loss": 1.4341,
      "step": 3850
    },
    {
      "epoch": 0.11644224166243693,
      "grad_norm": 3.8178727626800537,
      "learning_rate": 0.00019888742221384122,
      "loss": 1.4754,
      "step": 3900
    },
    {
      "epoch": 0.11793509091451945,
      "grad_norm": 2.456165075302124,
      "learning_rate": 0.00019887313641792241,
      "loss": 1.3887,
      "step": 3950
    },
    {
      "epoch": 0.11942794016660198,
      "grad_norm": 3.5763051509857178,
      "learning_rate": 0.00019885885062200355,
      "loss": 1.3901,
      "step": 4000
    },
    {
      "epoch": 0.1209207894186845,
      "grad_norm": 3.885662317276001,
      "learning_rate": 0.00019884456482608474,
      "loss": 1.3856,
      "step": 4050
    },
    {
      "epoch": 0.12241363867076703,
      "grad_norm": 3.6095409393310547,
      "learning_rate": 0.00019883027903016588,
      "loss": 1.448,
      "step": 4100
    },
    {
      "epoch": 0.12390648792284956,
      "grad_norm": 3.7112534046173096,
      "learning_rate": 0.00019881599323424707,
      "loss": 1.3537,
      "step": 4150
    },
    {
      "epoch": 0.12539933717493207,
      "grad_norm": 3.3566672801971436,
      "learning_rate": 0.0001988017074383282,
      "loss": 1.4389,
      "step": 4200
    },
    {
      "epoch": 0.1268921864270146,
      "grad_norm": 4.570401191711426,
      "learning_rate": 0.0001987874216424094,
      "loss": 1.4191,
      "step": 4250
    },
    {
      "epoch": 0.12838503567909712,
      "grad_norm": 4.455029010772705,
      "learning_rate": 0.00019877313584649057,
      "loss": 1.3677,
      "step": 4300
    },
    {
      "epoch": 0.12987788493117966,
      "grad_norm": 3.0861828327178955,
      "learning_rate": 0.00019875885005057173,
      "loss": 1.3677,
      "step": 4350
    },
    {
      "epoch": 0.13137073418326217,
      "grad_norm": 4.419896602630615,
      "learning_rate": 0.0001987445642546529,
      "loss": 1.4524,
      "step": 4400
    },
    {
      "epoch": 0.1328635834353447,
      "grad_norm": 5.187576770782471,
      "learning_rate": 0.00019873027845873406,
      "loss": 1.3868,
      "step": 4450
    },
    {
      "epoch": 0.13435643268742722,
      "grad_norm": 5.111696243286133,
      "learning_rate": 0.00019871599266281523,
      "loss": 1.4458,
      "step": 4500
    },
    {
      "epoch": 0.13584928193950974,
      "grad_norm": 3.2652997970581055,
      "learning_rate": 0.0001987017068668964,
      "loss": 1.4529,
      "step": 4550
    },
    {
      "epoch": 0.13734213119159228,
      "grad_norm": 4.190273761749268,
      "learning_rate": 0.00019868742107097755,
      "loss": 1.3991,
      "step": 4600
    },
    {
      "epoch": 0.1388349804436748,
      "grad_norm": 4.85620641708374,
      "learning_rate": 0.00019867313527505872,
      "loss": 1.3916,
      "step": 4650
    },
    {
      "epoch": 0.14032782969575733,
      "grad_norm": 3.030954360961914,
      "learning_rate": 0.00019865884947913988,
      "loss": 1.3805,
      "step": 4700
    },
    {
      "epoch": 0.14182067894783984,
      "grad_norm": 3.264406681060791,
      "learning_rate": 0.00019864456368322108,
      "loss": 1.4048,
      "step": 4750
    },
    {
      "epoch": 0.14331352819992238,
      "grad_norm": 3.2138588428497314,
      "learning_rate": 0.0001986302778873022,
      "loss": 1.4092,
      "step": 4800
    },
    {
      "epoch": 0.1448063774520049,
      "grad_norm": 3.847222328186035,
      "learning_rate": 0.0001986159920913834,
      "loss": 1.3871,
      "step": 4850
    },
    {
      "epoch": 0.14629922670408743,
      "grad_norm": 4.004987716674805,
      "learning_rate": 0.00019860170629546454,
      "loss": 1.3845,
      "step": 4900
    },
    {
      "epoch": 0.14779207595616994,
      "grad_norm": 3.5088725090026855,
      "learning_rate": 0.00019858742049954573,
      "loss": 1.379,
      "step": 4950
    },
    {
      "epoch": 0.14928492520825248,
      "grad_norm": 3.275099277496338,
      "learning_rate": 0.00019857313470362687,
      "loss": 1.3628,
      "step": 5000
    },
    {
      "epoch": 0.150777774460335,
      "grad_norm": 3.7903060913085938,
      "learning_rate": 0.00019855884890770806,
      "loss": 1.3804,
      "step": 5050
    },
    {
      "epoch": 0.1522706237124175,
      "grad_norm": 4.294798374176025,
      "learning_rate": 0.00019854456311178923,
      "loss": 1.3988,
      "step": 5100
    },
    {
      "epoch": 0.15376347296450005,
      "grad_norm": 3.2719295024871826,
      "learning_rate": 0.0001985302773158704,
      "loss": 1.387,
      "step": 5150
    },
    {
      "epoch": 0.15525632221658256,
      "grad_norm": 4.143224239349365,
      "learning_rate": 0.00019851599151995156,
      "loss": 1.393,
      "step": 5200
    },
    {
      "epoch": 0.1567491714686651,
      "grad_norm": 3.404754638671875,
      "learning_rate": 0.00019850170572403272,
      "loss": 1.4205,
      "step": 5250
    },
    {
      "epoch": 0.1582420207207476,
      "grad_norm": 3.607126474380493,
      "learning_rate": 0.0001984874199281139,
      "loss": 1.4239,
      "step": 5300
    },
    {
      "epoch": 0.15973486997283015,
      "grad_norm": 4.140823841094971,
      "learning_rate": 0.00019847313413219505,
      "loss": 1.4204,
      "step": 5350
    },
    {
      "epoch": 0.16122771922491266,
      "grad_norm": 3.893251419067383,
      "learning_rate": 0.00019845884833627622,
      "loss": 1.392,
      "step": 5400
    },
    {
      "epoch": 0.1627205684769952,
      "grad_norm": 4.304211139678955,
      "learning_rate": 0.00019844456254035738,
      "loss": 1.44,
      "step": 5450
    },
    {
      "epoch": 0.16421341772907772,
      "grad_norm": 5.273501873016357,
      "learning_rate": 0.00019843027674443855,
      "loss": 1.445,
      "step": 5500
    },
    {
      "epoch": 0.16570626698116026,
      "grad_norm": 4.787700176239014,
      "learning_rate": 0.00019841599094851974,
      "loss": 1.3668,
      "step": 5550
    },
    {
      "epoch": 0.16719911623324277,
      "grad_norm": 3.7984108924865723,
      "learning_rate": 0.00019840170515260087,
      "loss": 1.3523,
      "step": 5600
    },
    {
      "epoch": 0.16869196548532528,
      "grad_norm": 3.885608673095703,
      "learning_rate": 0.00019838741935668207,
      "loss": 1.3917,
      "step": 5650
    },
    {
      "epoch": 0.17018481473740782,
      "grad_norm": 3.459803342819214,
      "learning_rate": 0.0001983731335607632,
      "loss": 1.3833,
      "step": 5700
    },
    {
      "epoch": 0.17167766398949033,
      "grad_norm": 3.7103006839752197,
      "learning_rate": 0.0001983588477648444,
      "loss": 1.4473,
      "step": 5750
    },
    {
      "epoch": 0.17317051324157287,
      "grad_norm": 6.645928382873535,
      "learning_rate": 0.00019834456196892553,
      "loss": 1.3706,
      "step": 5800
    },
    {
      "epoch": 0.17466336249365538,
      "grad_norm": 3.7201037406921387,
      "learning_rate": 0.0001983302761730067,
      "loss": 1.3733,
      "step": 5850
    },
    {
      "epoch": 0.17615621174573792,
      "grad_norm": 4.050106048583984,
      "learning_rate": 0.0001983159903770879,
      "loss": 1.4096,
      "step": 5900
    },
    {
      "epoch": 0.17764906099782043,
      "grad_norm": 4.190842628479004,
      "learning_rate": 0.00019830170458116903,
      "loss": 1.4404,
      "step": 5950
    },
    {
      "epoch": 0.17914191024990297,
      "grad_norm": 4.393162727355957,
      "learning_rate": 0.00019828741878525022,
      "loss": 1.4443,
      "step": 6000
    },
    {
      "epoch": 0.1806347595019855,
      "grad_norm": 3.597520351409912,
      "learning_rate": 0.00019827313298933136,
      "loss": 1.4063,
      "step": 6050
    },
    {
      "epoch": 0.18212760875406803,
      "grad_norm": 3.608085870742798,
      "learning_rate": 0.00019825884719341255,
      "loss": 1.3857,
      "step": 6100
    },
    {
      "epoch": 0.18362045800615054,
      "grad_norm": 3.7055492401123047,
      "learning_rate": 0.0001982445613974937,
      "loss": 1.3997,
      "step": 6150
    },
    {
      "epoch": 0.18511330725823305,
      "grad_norm": 3.875457763671875,
      "learning_rate": 0.00019823027560157488,
      "loss": 1.4296,
      "step": 6200
    },
    {
      "epoch": 0.1866061565103156,
      "grad_norm": 5.074592590332031,
      "learning_rate": 0.00019821598980565604,
      "loss": 1.3785,
      "step": 6250
    },
    {
      "epoch": 0.1880990057623981,
      "grad_norm": 6.013392448425293,
      "learning_rate": 0.0001982017040097372,
      "loss": 1.4391,
      "step": 6300
    },
    {
      "epoch": 0.18959185501448064,
      "grad_norm": 5.679958820343018,
      "learning_rate": 0.00019818741821381837,
      "loss": 1.367,
      "step": 6350
    },
    {
      "epoch": 0.19108470426656315,
      "grad_norm": 3.6182546615600586,
      "learning_rate": 0.00019817313241789954,
      "loss": 1.4508,
      "step": 6400
    },
    {
      "epoch": 0.1925775535186457,
      "grad_norm": 5.209213733673096,
      "learning_rate": 0.0001981588466219807,
      "loss": 1.3878,
      "step": 6450
    },
    {
      "epoch": 0.1940704027707282,
      "grad_norm": 3.0043230056762695,
      "learning_rate": 0.00019814456082606187,
      "loss": 1.4248,
      "step": 6500
    },
    {
      "epoch": 0.19556325202281075,
      "grad_norm": 3.157851219177246,
      "learning_rate": 0.00019813027503014303,
      "loss": 1.3725,
      "step": 6550
    },
    {
      "epoch": 0.19705610127489326,
      "grad_norm": 3.5292418003082275,
      "learning_rate": 0.0001981159892342242,
      "loss": 1.3932,
      "step": 6600
    },
    {
      "epoch": 0.1985489505269758,
      "grad_norm": 3.2819600105285645,
      "learning_rate": 0.00019810170343830536,
      "loss": 1.3495,
      "step": 6650
    },
    {
      "epoch": 0.2000417997790583,
      "grad_norm": 3.0243399143218994,
      "learning_rate": 0.00019808741764238655,
      "loss": 1.3689,
      "step": 6700
    },
    {
      "epoch": 0.20153464903114082,
      "grad_norm": 3.4495368003845215,
      "learning_rate": 0.0001980731318464677,
      "loss": 1.3725,
      "step": 6750
    },
    {
      "epoch": 0.20302749828322336,
      "grad_norm": 3.538259744644165,
      "learning_rate": 0.00019805884605054888,
      "loss": 1.3905,
      "step": 6800
    },
    {
      "epoch": 0.20452034753530587,
      "grad_norm": 4.162181377410889,
      "learning_rate": 0.00019804456025463002,
      "loss": 1.4129,
      "step": 6850
    },
    {
      "epoch": 0.20601319678738841,
      "grad_norm": 4.592432022094727,
      "learning_rate": 0.0001980302744587112,
      "loss": 1.3634,
      "step": 6900
    },
    {
      "epoch": 0.20750604603947093,
      "grad_norm": 3.45967960357666,
      "learning_rate": 0.00019801598866279237,
      "loss": 1.416,
      "step": 6950
    },
    {
      "epoch": 0.20899889529155347,
      "grad_norm": 4.221930503845215,
      "learning_rate": 0.00019800170286687354,
      "loss": 1.4051,
      "step": 7000
    },
    {
      "epoch": 0.21049174454363598,
      "grad_norm": 4.144239902496338,
      "learning_rate": 0.0001979874170709547,
      "loss": 1.4219,
      "step": 7050
    },
    {
      "epoch": 0.21198459379571852,
      "grad_norm": 4.7492570877075195,
      "learning_rate": 0.00019797313127503587,
      "loss": 1.4028,
      "step": 7100
    },
    {
      "epoch": 0.21347744304780103,
      "grad_norm": 3.5841355323791504,
      "learning_rate": 0.00019795884547911703,
      "loss": 1.4361,
      "step": 7150
    },
    {
      "epoch": 0.21497029229988357,
      "grad_norm": 4.662593364715576,
      "learning_rate": 0.0001979445596831982,
      "loss": 1.3816,
      "step": 7200
    },
    {
      "epoch": 0.21646314155196608,
      "grad_norm": 4.700701713562012,
      "learning_rate": 0.00019793027388727936,
      "loss": 1.4226,
      "step": 7250
    },
    {
      "epoch": 0.2179559908040486,
      "grad_norm": 4.025181293487549,
      "learning_rate": 0.00019791598809136053,
      "loss": 1.4291,
      "step": 7300
    },
    {
      "epoch": 0.21944884005613113,
      "grad_norm": 3.064573049545288,
      "learning_rate": 0.0001979017022954417,
      "loss": 1.4293,
      "step": 7350
    },
    {
      "epoch": 0.22094168930821365,
      "grad_norm": 6.342152118682861,
      "learning_rate": 0.00019788741649952288,
      "loss": 1.4173,
      "step": 7400
    },
    {
      "epoch": 0.22243453856029619,
      "grad_norm": 5.89996337890625,
      "learning_rate": 0.00019787313070360402,
      "loss": 1.396,
      "step": 7450
    },
    {
      "epoch": 0.2239273878123787,
      "grad_norm": 4.462945938110352,
      "learning_rate": 0.0001978588449076852,
      "loss": 1.3868,
      "step": 7500
    },
    {
      "epoch": 0.22542023706446124,
      "grad_norm": 3.6449055671691895,
      "learning_rate": 0.00019784455911176635,
      "loss": 1.396,
      "step": 7550
    },
    {
      "epoch": 0.22691308631654375,
      "grad_norm": 4.674243927001953,
      "learning_rate": 0.00019783027331584754,
      "loss": 1.395,
      "step": 7600
    },
    {
      "epoch": 0.2284059355686263,
      "grad_norm": 3.6160385608673096,
      "learning_rate": 0.00019781598751992868,
      "loss": 1.3918,
      "step": 7650
    },
    {
      "epoch": 0.2298987848207088,
      "grad_norm": 4.326193332672119,
      "learning_rate": 0.00019780170172400987,
      "loss": 1.3947,
      "step": 7700
    },
    {
      "epoch": 0.23139163407279134,
      "grad_norm": 5.4003777503967285,
      "learning_rate": 0.00019778741592809104,
      "loss": 1.344,
      "step": 7750
    },
    {
      "epoch": 0.23288448332487385,
      "grad_norm": 4.711580753326416,
      "learning_rate": 0.0001977731301321722,
      "loss": 1.3959,
      "step": 7800
    },
    {
      "epoch": 0.23437733257695637,
      "grad_norm": 3.4752814769744873,
      "learning_rate": 0.00019775884433625337,
      "loss": 1.3722,
      "step": 7850
    },
    {
      "epoch": 0.2358701818290389,
      "grad_norm": 4.028527736663818,
      "learning_rate": 0.00019774455854033453,
      "loss": 1.3683,
      "step": 7900
    },
    {
      "epoch": 0.23736303108112142,
      "grad_norm": 4.094334602355957,
      "learning_rate": 0.0001977302727444157,
      "loss": 1.3607,
      "step": 7950
    },
    {
      "epoch": 0.23885588033320396,
      "grad_norm": 5.232580661773682,
      "learning_rate": 0.00019771598694849686,
      "loss": 1.4354,
      "step": 8000
    },
    {
      "epoch": 0.24034872958528647,
      "grad_norm": 4.269852161407471,
      "learning_rate": 0.00019770170115257802,
      "loss": 1.4372,
      "step": 8050
    },
    {
      "epoch": 0.241841578837369,
      "grad_norm": 3.312541961669922,
      "learning_rate": 0.0001976874153566592,
      "loss": 1.3826,
      "step": 8100
    },
    {
      "epoch": 0.24333442808945152,
      "grad_norm": 3.8900692462921143,
      "learning_rate": 0.00019767312956074035,
      "loss": 1.4189,
      "step": 8150
    },
    {
      "epoch": 0.24482727734153406,
      "grad_norm": 3.894512414932251,
      "learning_rate": 0.00019765884376482155,
      "loss": 1.3365,
      "step": 8200
    },
    {
      "epoch": 0.24632012659361657,
      "grad_norm": 4.644411563873291,
      "learning_rate": 0.00019764455796890268,
      "loss": 1.4311,
      "step": 8250
    },
    {
      "epoch": 0.2478129758456991,
      "grad_norm": 8.174029350280762,
      "learning_rate": 0.00019763027217298387,
      "loss": 1.361,
      "step": 8300
    },
    {
      "epoch": 0.24930582509778162,
      "grad_norm": 4.615732192993164,
      "learning_rate": 0.000197615986377065,
      "loss": 1.4552,
      "step": 8350
    },
    {
      "epoch": 0.25079867434986414,
      "grad_norm": 4.421249866485596,
      "learning_rate": 0.0001976017005811462,
      "loss": 1.3463,
      "step": 8400
    },
    {
      "epoch": 0.2522915236019467,
      "grad_norm": 2.8386716842651367,
      "learning_rate": 0.00019758741478522734,
      "loss": 1.348,
      "step": 8450
    },
    {
      "epoch": 0.2537843728540292,
      "grad_norm": 4.3141703605651855,
      "learning_rate": 0.0001975731289893085,
      "loss": 1.4306,
      "step": 8500
    },
    {
      "epoch": 0.2552772221061117,
      "grad_norm": 3.947331428527832,
      "learning_rate": 0.0001975588431933897,
      "loss": 1.3823,
      "step": 8550
    },
    {
      "epoch": 0.25677007135819424,
      "grad_norm": 3.2268636226654053,
      "learning_rate": 0.00019754455739747084,
      "loss": 1.4199,
      "step": 8600
    },
    {
      "epoch": 0.2582629206102768,
      "grad_norm": 4.0353102684021,
      "learning_rate": 0.00019753027160155203,
      "loss": 1.3927,
      "step": 8650
    },
    {
      "epoch": 0.2597557698623593,
      "grad_norm": 3.490560293197632,
      "learning_rate": 0.00019751598580563316,
      "loss": 1.401,
      "step": 8700
    },
    {
      "epoch": 0.2612486191144418,
      "grad_norm": 5.577207088470459,
      "learning_rate": 0.00019750170000971436,
      "loss": 1.3586,
      "step": 8750
    },
    {
      "epoch": 0.26274146836652434,
      "grad_norm": 4.168467998504639,
      "learning_rate": 0.0001974874142137955,
      "loss": 1.3303,
      "step": 8800
    },
    {
      "epoch": 0.2642343176186069,
      "grad_norm": 3.812627077102661,
      "learning_rate": 0.00019747312841787669,
      "loss": 1.3717,
      "step": 8850
    },
    {
      "epoch": 0.2657271668706894,
      "grad_norm": 4.875237464904785,
      "learning_rate": 0.00019745884262195785,
      "loss": 1.3873,
      "step": 8900
    },
    {
      "epoch": 0.2672200161227719,
      "grad_norm": 4.048189163208008,
      "learning_rate": 0.00019744455682603902,
      "loss": 1.3775,
      "step": 8950
    },
    {
      "epoch": 0.26871286537485445,
      "grad_norm": 3.9090261459350586,
      "learning_rate": 0.00019743027103012018,
      "loss": 1.4296,
      "step": 9000
    },
    {
      "epoch": 0.270205714626937,
      "grad_norm": 2.8476953506469727,
      "learning_rate": 0.00019741598523420134,
      "loss": 1.4175,
      "step": 9050
    },
    {
      "epoch": 0.2716985638790195,
      "grad_norm": 5.782102584838867,
      "learning_rate": 0.0001974016994382825,
      "loss": 1.3835,
      "step": 9100
    },
    {
      "epoch": 0.273191413131102,
      "grad_norm": 4.640264987945557,
      "learning_rate": 0.00019738741364236367,
      "loss": 1.4524,
      "step": 9150
    },
    {
      "epoch": 0.27468426238318455,
      "grad_norm": 4.81790828704834,
      "learning_rate": 0.00019737312784644484,
      "loss": 1.3183,
      "step": 9200
    },
    {
      "epoch": 0.2761771116352671,
      "grad_norm": 2.685009717941284,
      "learning_rate": 0.000197358842050526,
      "loss": 1.3243,
      "step": 9250
    },
    {
      "epoch": 0.2776699608873496,
      "grad_norm": 5.321321487426758,
      "learning_rate": 0.00019734455625460717,
      "loss": 1.4086,
      "step": 9300
    },
    {
      "epoch": 0.2791628101394321,
      "grad_norm": 3.065791368484497,
      "learning_rate": 0.00019733027045868836,
      "loss": 1.337,
      "step": 9350
    },
    {
      "epoch": 0.28065565939151466,
      "grad_norm": 4.3569817543029785,
      "learning_rate": 0.0001973159846627695,
      "loss": 1.4082,
      "step": 9400
    },
    {
      "epoch": 0.2821485086435972,
      "grad_norm": 4.67582368850708,
      "learning_rate": 0.0001973016988668507,
      "loss": 1.3832,
      "step": 9450
    },
    {
      "epoch": 0.2836413578956797,
      "grad_norm": 4.942144870758057,
      "learning_rate": 0.00019728741307093183,
      "loss": 1.3734,
      "step": 9500
    },
    {
      "epoch": 0.2851342071477622,
      "grad_norm": 4.853246688842773,
      "learning_rate": 0.00019727312727501302,
      "loss": 1.4111,
      "step": 9550
    },
    {
      "epoch": 0.28662705639984476,
      "grad_norm": 3.071237325668335,
      "learning_rate": 0.00019725884147909418,
      "loss": 1.3746,
      "step": 9600
    },
    {
      "epoch": 0.28811990565192724,
      "grad_norm": 4.844615459442139,
      "learning_rate": 0.00019724455568317535,
      "loss": 1.3051,
      "step": 9650
    },
    {
      "epoch": 0.2896127549040098,
      "grad_norm": 5.954223155975342,
      "learning_rate": 0.0001972302698872565,
      "loss": 1.4131,
      "step": 9700
    },
    {
      "epoch": 0.2911056041560923,
      "grad_norm": 3.6717801094055176,
      "learning_rate": 0.00019721598409133768,
      "loss": 1.4166,
      "step": 9750
    },
    {
      "epoch": 0.29259845340817486,
      "grad_norm": 3.6257095336914062,
      "learning_rate": 0.00019720169829541884,
      "loss": 1.3679,
      "step": 9800
    },
    {
      "epoch": 0.29409130266025735,
      "grad_norm": 4.245635032653809,
      "learning_rate": 0.0001971874124995,
      "loss": 1.3171,
      "step": 9850
    },
    {
      "epoch": 0.2955841519123399,
      "grad_norm": 5.362602710723877,
      "learning_rate": 0.00019717312670358117,
      "loss": 1.3932,
      "step": 9900
    },
    {
      "epoch": 0.2970770011644224,
      "grad_norm": 4.6283721923828125,
      "learning_rate": 0.00019715884090766234,
      "loss": 1.3757,
      "step": 9950
    },
    {
      "epoch": 0.29856985041650497,
      "grad_norm": 4.299574851989746,
      "learning_rate": 0.0001971445551117435,
      "loss": 1.4018,
      "step": 10000
    }
  ],
  "logging_steps": 50,
  "max_steps": 700001,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 21,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.535695686786089e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}