diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.1549566891241578, + "epoch": 1.539942252165544, "eval_steps": 500, - "global_step": 30000, + "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -42007,6 +42007,14006 @@ "learning_rate": 7.598194135786166e-05, "loss": 0.8086, "step": 30000 + }, + { + "epoch": 1.1551491819056785, + "grad_norm": 1.5883291959762573, + "learning_rate": 7.59525880175407e-05, + "loss": 0.7994, + "step": 30005 + }, + { + "epoch": 1.1553416746871992, + "grad_norm": 0.8642548322677612, + "learning_rate": 7.59232368761963e-05, + "loss": 0.8237, + "step": 30010 + }, + { + "epoch": 1.1555341674687198, + "grad_norm": 1.0051319599151611, + "learning_rate": 7.589388793651239e-05, + "loss": 0.8595, + "step": 30015 + }, + { + "epoch": 1.1557266602502407, + "grad_norm": 0.859375, + "learning_rate": 7.586454120117271e-05, + "loss": 0.8389, + "step": 30020 + }, + { + "epoch": 1.1559191530317614, + "grad_norm": 1.0464012622833252, + "learning_rate": 7.583519667286088e-05, + "loss": 0.9032, + "step": 30025 + }, + { + "epoch": 1.156111645813282, + "grad_norm": 1.8586502075195312, + "learning_rate": 7.580585435426024e-05, + "loss": 0.8104, + "step": 30030 + }, + { + "epoch": 1.1563041385948027, + "grad_norm": 1.273821234703064, + "learning_rate": 7.577651424805392e-05, + "loss": 0.7211, + "step": 30035 + }, + { + "epoch": 1.1564966313763234, + "grad_norm": 1.5293164253234863, + "learning_rate": 7.574717635692492e-05, + "loss": 0.8558, + "step": 30040 + }, + { + "epoch": 1.156689124157844, + "grad_norm": 1.440649390220642, + "learning_rate": 7.5717840683556e-05, + "loss": 0.816, + "step": 30045 + }, + { + "epoch": 1.1568816169393648, + "grad_norm": 2.2141449451446533, + "learning_rate": 7.568850723062967e-05, + "loss": 0.8884, + "step": 30050 + }, + 
{ + "epoch": 1.1570741097208854, + "grad_norm": 1.564818024635315, + "learning_rate": 7.565917600082833e-05, + "loss": 1.0443, + "step": 30055 + }, + { + "epoch": 1.157266602502406, + "grad_norm": 0.9042505621910095, + "learning_rate": 7.562984699683408e-05, + "loss": 0.6834, + "step": 30060 + }, + { + "epoch": 1.1574590952839268, + "grad_norm": 1.246706485748291, + "learning_rate": 7.560052022132889e-05, + "loss": 0.9267, + "step": 30065 + }, + { + "epoch": 1.1576515880654474, + "grad_norm": 1.4773999452590942, + "learning_rate": 7.557119567699452e-05, + "loss": 0.7131, + "step": 30070 + }, + { + "epoch": 1.1578440808469683, + "grad_norm": 1.9604829549789429, + "learning_rate": 7.554187336651247e-05, + "loss": 0.8189, + "step": 30075 + }, + { + "epoch": 1.158036573628489, + "grad_norm": 1.0815962553024292, + "learning_rate": 7.551255329256402e-05, + "loss": 0.8375, + "step": 30080 + }, + { + "epoch": 1.1582290664100097, + "grad_norm": 1.2420703172683716, + "learning_rate": 7.548323545783042e-05, + "loss": 0.9146, + "step": 30085 + }, + { + "epoch": 1.1584215591915303, + "grad_norm": 1.9062912464141846, + "learning_rate": 7.545391986499252e-05, + "loss": 0.8919, + "step": 30090 + }, + { + "epoch": 1.158614051973051, + "grad_norm": 1.7222148180007935, + "learning_rate": 7.5424606516731e-05, + "loss": 0.7967, + "step": 30095 + }, + { + "epoch": 1.1588065447545717, + "grad_norm": 1.2561261653900146, + "learning_rate": 7.539529541572647e-05, + "loss": 0.837, + "step": 30100 + }, + { + "epoch": 1.1589990375360923, + "grad_norm": 1.5790022611618042, + "learning_rate": 7.536598656465918e-05, + "loss": 0.8983, + "step": 30105 + }, + { + "epoch": 1.159191530317613, + "grad_norm": 1.3885726928710938, + "learning_rate": 7.533667996620919e-05, + "loss": 0.8999, + "step": 30110 + }, + { + "epoch": 1.1593840230991337, + "grad_norm": 1.676992416381836, + "learning_rate": 7.530737562305649e-05, + "loss": 1.0169, + "step": 30115 + }, + { + "epoch": 1.1595765158806546, + 
"grad_norm": 1.5144734382629395, + "learning_rate": 7.52780735378807e-05, + "loss": 0.9365, + "step": 30120 + }, + { + "epoch": 1.1597690086621752, + "grad_norm": 1.8382542133331299, + "learning_rate": 7.524877371336129e-05, + "loss": 0.8854, + "step": 30125 + }, + { + "epoch": 1.159961501443696, + "grad_norm": 2.0379273891448975, + "learning_rate": 7.52194761521776e-05, + "loss": 0.8692, + "step": 30130 + }, + { + "epoch": 1.1601539942252166, + "grad_norm": 1.4084811210632324, + "learning_rate": 7.519018085700861e-05, + "loss": 0.9688, + "step": 30135 + }, + { + "epoch": 1.1603464870067373, + "grad_norm": 1.7691874504089355, + "learning_rate": 7.516088783053327e-05, + "loss": 1.031, + "step": 30140 + }, + { + "epoch": 1.160538979788258, + "grad_norm": 1.5990632772445679, + "learning_rate": 7.51315970754302e-05, + "loss": 0.9808, + "step": 30145 + }, + { + "epoch": 1.1607314725697786, + "grad_norm": 1.0263117551803589, + "learning_rate": 7.510230859437781e-05, + "loss": 0.85, + "step": 30150 + }, + { + "epoch": 1.1609239653512993, + "grad_norm": 0.9199804067611694, + "learning_rate": 7.50730223900544e-05, + "loss": 0.8131, + "step": 30155 + }, + { + "epoch": 1.16111645813282, + "grad_norm": 1.6857373714447021, + "learning_rate": 7.504373846513796e-05, + "loss": 0.9652, + "step": 30160 + }, + { + "epoch": 1.1613089509143406, + "grad_norm": 0.9786146283149719, + "learning_rate": 7.501445682230628e-05, + "loss": 0.8816, + "step": 30165 + }, + { + "epoch": 1.1615014436958615, + "grad_norm": 1.5863782167434692, + "learning_rate": 7.498517746423706e-05, + "loss": 0.8624, + "step": 30170 + }, + { + "epoch": 1.1616939364773822, + "grad_norm": 1.287121295928955, + "learning_rate": 7.495590039360763e-05, + "loss": 0.9224, + "step": 30175 + }, + { + "epoch": 1.1618864292589028, + "grad_norm": 2.6649892330169678, + "learning_rate": 7.492662561309518e-05, + "loss": 0.8616, + "step": 30180 + }, + { + "epoch": 1.1620789220404235, + "grad_norm": 1.1567127704620361, + 
"learning_rate": 7.489735312537676e-05, + "loss": 0.759, + "step": 30185 + }, + { + "epoch": 1.1622714148219442, + "grad_norm": 1.2174935340881348, + "learning_rate": 7.48680829331291e-05, + "loss": 0.7874, + "step": 30190 + }, + { + "epoch": 1.1624639076034649, + "grad_norm": 1.6268326044082642, + "learning_rate": 7.483881503902874e-05, + "loss": 0.8314, + "step": 30195 + }, + { + "epoch": 1.1626564003849855, + "grad_norm": 1.1430530548095703, + "learning_rate": 7.480954944575212e-05, + "loss": 0.8684, + "step": 30200 + }, + { + "epoch": 1.1628488931665062, + "grad_norm": 1.0805143117904663, + "learning_rate": 7.478028615597532e-05, + "loss": 0.9936, + "step": 30205 + }, + { + "epoch": 1.1630413859480269, + "grad_norm": 1.8118839263916016, + "learning_rate": 7.475102517237424e-05, + "loss": 0.7458, + "step": 30210 + }, + { + "epoch": 1.1632338787295478, + "grad_norm": 0.9031025171279907, + "learning_rate": 7.47217664976247e-05, + "loss": 0.7417, + "step": 30215 + }, + { + "epoch": 1.1634263715110684, + "grad_norm": 1.5473326444625854, + "learning_rate": 7.469251013440215e-05, + "loss": 0.8992, + "step": 30220 + }, + { + "epoch": 1.163618864292589, + "grad_norm": 1.164165735244751, + "learning_rate": 7.466325608538185e-05, + "loss": 0.9047, + "step": 30225 + }, + { + "epoch": 1.1638113570741098, + "grad_norm": 1.0037180185317993, + "learning_rate": 7.463400435323899e-05, + "loss": 0.7691, + "step": 30230 + }, + { + "epoch": 1.1640038498556304, + "grad_norm": 1.021618127822876, + "learning_rate": 7.460475494064841e-05, + "loss": 0.6932, + "step": 30235 + }, + { + "epoch": 1.164196342637151, + "grad_norm": 1.689186930656433, + "learning_rate": 7.457550785028472e-05, + "loss": 0.891, + "step": 30240 + }, + { + "epoch": 1.1643888354186718, + "grad_norm": 1.9816436767578125, + "learning_rate": 7.454626308482244e-05, + "loss": 0.8508, + "step": 30245 + }, + { + "epoch": 1.1645813282001924, + "grad_norm": 1.265624761581421, + "learning_rate": 7.45170206469358e-05, + 
"loss": 0.8929, + "step": 30250 + }, + { + "epoch": 1.1647738209817131, + "grad_norm": 1.2222243547439575, + "learning_rate": 7.44877805392988e-05, + "loss": 0.9135, + "step": 30255 + }, + { + "epoch": 1.1649663137632338, + "grad_norm": 2.1236441135406494, + "learning_rate": 7.445854276458527e-05, + "loss": 0.7737, + "step": 30260 + }, + { + "epoch": 1.1651588065447545, + "grad_norm": 0.9599865078926086, + "learning_rate": 7.44293073254688e-05, + "loss": 0.8891, + "step": 30265 + }, + { + "epoch": 1.1653512993262753, + "grad_norm": 1.1516355276107788, + "learning_rate": 7.440007422462276e-05, + "loss": 0.7167, + "step": 30270 + }, + { + "epoch": 1.165543792107796, + "grad_norm": 1.8902909755706787, + "learning_rate": 7.43708434647204e-05, + "loss": 0.8657, + "step": 30275 + }, + { + "epoch": 1.1657362848893167, + "grad_norm": 2.002458095550537, + "learning_rate": 7.434161504843461e-05, + "loss": 0.7881, + "step": 30280 + }, + { + "epoch": 1.1659287776708374, + "grad_norm": 1.3423494100570679, + "learning_rate": 7.431238897843811e-05, + "loss": 0.9164, + "step": 30285 + }, + { + "epoch": 1.166121270452358, + "grad_norm": 1.7833929061889648, + "learning_rate": 7.428316525740353e-05, + "loss": 0.9904, + "step": 30290 + }, + { + "epoch": 1.1663137632338787, + "grad_norm": 1.0303772687911987, + "learning_rate": 7.425394388800311e-05, + "loss": 0.7141, + "step": 30295 + }, + { + "epoch": 1.1665062560153994, + "grad_norm": 1.3508719205856323, + "learning_rate": 7.422472487290893e-05, + "loss": 0.792, + "step": 30300 + }, + { + "epoch": 1.16669874879692, + "grad_norm": 1.598431944847107, + "learning_rate": 7.419550821479298e-05, + "loss": 0.8109, + "step": 30305 + }, + { + "epoch": 1.166891241578441, + "grad_norm": 1.6909544467926025, + "learning_rate": 7.416629391632683e-05, + "loss": 0.9068, + "step": 30310 + }, + { + "epoch": 1.1670837343599616, + "grad_norm": 1.0492467880249023, + "learning_rate": 7.413708198018195e-05, + "loss": 0.9566, + "step": 30315 + }, + { + 
"epoch": 1.1672762271414823, + "grad_norm": 2.0640323162078857, + "learning_rate": 7.410787240902963e-05, + "loss": 0.818, + "step": 30320 + }, + { + "epoch": 1.167468719923003, + "grad_norm": 1.4553757905960083, + "learning_rate": 7.407866520554087e-05, + "loss": 0.8928, + "step": 30325 + }, + { + "epoch": 1.1676612127045236, + "grad_norm": 1.5272727012634277, + "learning_rate": 7.404946037238641e-05, + "loss": 0.8591, + "step": 30330 + }, + { + "epoch": 1.1678537054860443, + "grad_norm": 1.040766716003418, + "learning_rate": 7.402025791223694e-05, + "loss": 0.851, + "step": 30335 + }, + { + "epoch": 1.168046198267565, + "grad_norm": 0.9818170666694641, + "learning_rate": 7.399105782776276e-05, + "loss": 0.7963, + "step": 30340 + }, + { + "epoch": 1.1682386910490856, + "grad_norm": 1.3725941181182861, + "learning_rate": 7.396186012163404e-05, + "loss": 0.7677, + "step": 30345 + }, + { + "epoch": 1.1684311838306063, + "grad_norm": 1.8234846591949463, + "learning_rate": 7.393266479652075e-05, + "loss": 0.7203, + "step": 30350 + }, + { + "epoch": 1.168623676612127, + "grad_norm": 1.143553376197815, + "learning_rate": 7.390347185509258e-05, + "loss": 0.8207, + "step": 30355 + }, + { + "epoch": 1.1688161693936476, + "grad_norm": 1.3585904836654663, + "learning_rate": 7.387428130001904e-05, + "loss": 0.8421, + "step": 30360 + }, + { + "epoch": 1.1690086621751685, + "grad_norm": 2.451633930206299, + "learning_rate": 7.384509313396939e-05, + "loss": 0.8745, + "step": 30365 + }, + { + "epoch": 1.1692011549566892, + "grad_norm": 0.9068706035614014, + "learning_rate": 7.381590735961272e-05, + "loss": 1.0768, + "step": 30370 + }, + { + "epoch": 1.1693936477382099, + "grad_norm": 1.411170244216919, + "learning_rate": 7.378672397961788e-05, + "loss": 0.9386, + "step": 30375 + }, + { + "epoch": 1.1695861405197305, + "grad_norm": 2.274181842803955, + "learning_rate": 7.375754299665348e-05, + "loss": 0.9783, + "step": 30380 + }, + { + "epoch": 1.1697786333012512, + "grad_norm": 
2.0918445587158203, + "learning_rate": 7.372836441338789e-05, + "loss": 0.8331, + "step": 30385 + }, + { + "epoch": 1.1699711260827719, + "grad_norm": 0.9770554900169373, + "learning_rate": 7.36991882324894e-05, + "loss": 0.8223, + "step": 30390 + }, + { + "epoch": 1.1701636188642925, + "grad_norm": 1.2381200790405273, + "learning_rate": 7.367001445662591e-05, + "loss": 0.8515, + "step": 30395 + }, + { + "epoch": 1.1703561116458132, + "grad_norm": 1.2155394554138184, + "learning_rate": 7.364084308846512e-05, + "loss": 0.9158, + "step": 30400 + }, + { + "epoch": 1.1705486044273339, + "grad_norm": 1.4437912702560425, + "learning_rate": 7.361167413067469e-05, + "loss": 0.8522, + "step": 30405 + }, + { + "epoch": 1.1707410972088548, + "grad_norm": 1.226899266242981, + "learning_rate": 7.358250758592184e-05, + "loss": 0.9088, + "step": 30410 + }, + { + "epoch": 1.1709335899903754, + "grad_norm": 0.8239467144012451, + "learning_rate": 7.355334345687361e-05, + "loss": 0.8154, + "step": 30415 + }, + { + "epoch": 1.1711260827718961, + "grad_norm": 1.354842185974121, + "learning_rate": 7.3524181746197e-05, + "loss": 0.7883, + "step": 30420 + }, + { + "epoch": 1.1713185755534168, + "grad_norm": 0.8536742329597473, + "learning_rate": 7.349502245655857e-05, + "loss": 0.6467, + "step": 30425 + }, + { + "epoch": 1.1715110683349375, + "grad_norm": 1.2554560899734497, + "learning_rate": 7.346586559062472e-05, + "loss": 0.8991, + "step": 30430 + }, + { + "epoch": 1.1717035611164581, + "grad_norm": 1.260736107826233, + "learning_rate": 7.343671115106172e-05, + "loss": 0.7645, + "step": 30435 + }, + { + "epoch": 1.1718960538979788, + "grad_norm": 1.7142833471298218, + "learning_rate": 7.340755914053552e-05, + "loss": 0.8953, + "step": 30440 + }, + { + "epoch": 1.1720885466794995, + "grad_norm": 1.4862356185913086, + "learning_rate": 7.337840956171184e-05, + "loss": 0.8362, + "step": 30445 + }, + { + "epoch": 1.1722810394610201, + "grad_norm": 1.4640238285064697, + "learning_rate": 
7.33492624172563e-05, + "loss": 0.7173, + "step": 30450 + }, + { + "epoch": 1.1724735322425408, + "grad_norm": 1.3464069366455078, + "learning_rate": 7.332011770983417e-05, + "loss": 0.8988, + "step": 30455 + }, + { + "epoch": 1.1726660250240615, + "grad_norm": 1.3081461191177368, + "learning_rate": 7.32909754421105e-05, + "loss": 1.0021, + "step": 30460 + }, + { + "epoch": 1.1728585178055824, + "grad_norm": 0.9563549160957336, + "learning_rate": 7.326183561675022e-05, + "loss": 0.7904, + "step": 30465 + }, + { + "epoch": 1.173051010587103, + "grad_norm": 1.8011226654052734, + "learning_rate": 7.323269823641794e-05, + "loss": 0.9917, + "step": 30470 + }, + { + "epoch": 1.1732435033686237, + "grad_norm": 1.0634485483169556, + "learning_rate": 7.320356330377809e-05, + "loss": 0.8297, + "step": 30475 + }, + { + "epoch": 1.1734359961501444, + "grad_norm": 1.195955753326416, + "learning_rate": 7.317443082149488e-05, + "loss": 1.7867, + "step": 30480 + }, + { + "epoch": 1.173628488931665, + "grad_norm": 0.9291029572486877, + "learning_rate": 7.314530079223225e-05, + "loss": 0.7805, + "step": 30485 + }, + { + "epoch": 1.1738209817131857, + "grad_norm": 1.8289291858673096, + "learning_rate": 7.311617321865396e-05, + "loss": 0.8179, + "step": 30490 + }, + { + "epoch": 1.1740134744947064, + "grad_norm": 2.2788593769073486, + "learning_rate": 7.308704810342357e-05, + "loss": 1.0026, + "step": 30495 + }, + { + "epoch": 1.174205967276227, + "grad_norm": 0.9969417452812195, + "learning_rate": 7.305792544920433e-05, + "loss": 0.9294, + "step": 30500 + }, + { + "epoch": 1.174398460057748, + "grad_norm": 1.9698654413223267, + "learning_rate": 7.302880525865932e-05, + "loss": 1.031, + "step": 30505 + }, + { + "epoch": 1.1745909528392686, + "grad_norm": 2.6228976249694824, + "learning_rate": 7.299968753445142e-05, + "loss": 0.8685, + "step": 30510 + }, + { + "epoch": 1.1747834456207893, + "grad_norm": 1.7787238359451294, + "learning_rate": 7.297057227924324e-05, + "loss": 0.7516, + 
"step": 30515 + }, + { + "epoch": 1.17497593840231, + "grad_norm": 1.7885974645614624, + "learning_rate": 7.294145949569713e-05, + "loss": 0.9863, + "step": 30520 + }, + { + "epoch": 1.1751684311838306, + "grad_norm": 0.9006327986717224, + "learning_rate": 7.291234918647534e-05, + "loss": 0.8484, + "step": 30525 + }, + { + "epoch": 1.1753609239653513, + "grad_norm": 1.1137930154800415, + "learning_rate": 7.288324135423979e-05, + "loss": 0.8275, + "step": 30530 + }, + { + "epoch": 1.175553416746872, + "grad_norm": 0.8534238338470459, + "learning_rate": 7.285413600165214e-05, + "loss": 0.8755, + "step": 30535 + }, + { + "epoch": 1.1757459095283926, + "grad_norm": 1.3716295957565308, + "learning_rate": 7.282503313137397e-05, + "loss": 0.7926, + "step": 30540 + }, + { + "epoch": 1.1759384023099133, + "grad_norm": 2.3967831134796143, + "learning_rate": 7.27959327460665e-05, + "loss": 0.7856, + "step": 30545 + }, + { + "epoch": 1.176130895091434, + "grad_norm": 0.8649664521217346, + "learning_rate": 7.276683484839074e-05, + "loss": 0.6645, + "step": 30550 + }, + { + "epoch": 1.1763233878729547, + "grad_norm": 1.004795789718628, + "learning_rate": 7.273773944100755e-05, + "loss": 0.9662, + "step": 30555 + }, + { + "epoch": 1.1765158806544755, + "grad_norm": 1.4656823873519897, + "learning_rate": 7.27086465265775e-05, + "loss": 1.0357, + "step": 30560 + }, + { + "epoch": 1.1767083734359962, + "grad_norm": 1.2406914234161377, + "learning_rate": 7.267955610776089e-05, + "loss": 0.7643, + "step": 30565 + }, + { + "epoch": 1.1769008662175169, + "grad_norm": 0.761400043964386, + "learning_rate": 7.265046818721795e-05, + "loss": 0.7157, + "step": 30570 + }, + { + "epoch": 1.1770933589990376, + "grad_norm": 1.4222087860107422, + "learning_rate": 7.26213827676085e-05, + "loss": 0.7819, + "step": 30575 + }, + { + "epoch": 1.1772858517805582, + "grad_norm": 1.604976773262024, + "learning_rate": 7.259229985159223e-05, + "loss": 0.8072, + "step": 30580 + }, + { + "epoch": 
1.177478344562079, + "grad_norm": 1.5902528762817383, + "learning_rate": 7.256321944182856e-05, + "loss": 0.8419, + "step": 30585 + }, + { + "epoch": 1.1776708373435996, + "grad_norm": 1.3055870532989502, + "learning_rate": 7.253414154097675e-05, + "loss": 0.8395, + "step": 30590 + }, + { + "epoch": 1.1778633301251202, + "grad_norm": 1.5313003063201904, + "learning_rate": 7.250506615169573e-05, + "loss": 0.853, + "step": 30595 + }, + { + "epoch": 1.1780558229066411, + "grad_norm": 2.564984083175659, + "learning_rate": 7.24759932766443e-05, + "loss": 0.8832, + "step": 30600 + }, + { + "epoch": 1.1782483156881618, + "grad_norm": 1.0367460250854492, + "learning_rate": 7.244692291848091e-05, + "loss": 0.7639, + "step": 30605 + }, + { + "epoch": 1.1784408084696825, + "grad_norm": 1.0948158502578735, + "learning_rate": 7.241785507986392e-05, + "loss": 0.8525, + "step": 30610 + }, + { + "epoch": 1.1786333012512031, + "grad_norm": 1.3061574697494507, + "learning_rate": 7.23887897634514e-05, + "loss": 0.7501, + "step": 30615 + }, + { + "epoch": 1.1788257940327238, + "grad_norm": 2.5141634941101074, + "learning_rate": 7.235972697190112e-05, + "loss": 0.8892, + "step": 30620 + }, + { + "epoch": 1.1790182868142445, + "grad_norm": 1.4427481889724731, + "learning_rate": 7.233066670787068e-05, + "loss": 0.8994, + "step": 30625 + }, + { + "epoch": 1.1792107795957651, + "grad_norm": 0.6231327652931213, + "learning_rate": 7.230160897401752e-05, + "loss": 0.711, + "step": 30630 + }, + { + "epoch": 1.1794032723772858, + "grad_norm": 1.6313964128494263, + "learning_rate": 7.227255377299873e-05, + "loss": 0.9548, + "step": 30635 + }, + { + "epoch": 1.1795957651588065, + "grad_norm": 1.321081280708313, + "learning_rate": 7.224350110747118e-05, + "loss": 0.7634, + "step": 30640 + }, + { + "epoch": 1.1797882579403272, + "grad_norm": 1.1919338703155518, + "learning_rate": 7.221445098009163e-05, + "loss": 0.8379, + "step": 30645 + }, + { + "epoch": 1.1799807507218478, + "grad_norm": 
1.954206109046936, + "learning_rate": 7.218540339351643e-05, + "loss": 0.6995, + "step": 30650 + }, + { + "epoch": 1.1801732435033687, + "grad_norm": 1.4222872257232666, + "learning_rate": 7.215635835040187e-05, + "loss": 0.7785, + "step": 30655 + }, + { + "epoch": 1.1803657362848894, + "grad_norm": 1.8065252304077148, + "learning_rate": 7.21273158534039e-05, + "loss": 0.8699, + "step": 30660 + }, + { + "epoch": 1.18055822906641, + "grad_norm": 0.9798047542572021, + "learning_rate": 7.209827590517822e-05, + "loss": 0.9183, + "step": 30665 + }, + { + "epoch": 1.1807507218479307, + "grad_norm": 1.1454569101333618, + "learning_rate": 7.206923850838041e-05, + "loss": 0.8424, + "step": 30670 + }, + { + "epoch": 1.1809432146294514, + "grad_norm": 0.8481330871582031, + "learning_rate": 7.204020366566571e-05, + "loss": 0.8378, + "step": 30675 + }, + { + "epoch": 1.181135707410972, + "grad_norm": 1.2855151891708374, + "learning_rate": 7.201117137968915e-05, + "loss": 0.8317, + "step": 30680 + }, + { + "epoch": 1.1813282001924927, + "grad_norm": 2.0627200603485107, + "learning_rate": 7.198214165310555e-05, + "loss": 0.7981, + "step": 30685 + }, + { + "epoch": 1.1815206929740134, + "grad_norm": 0.8609490394592285, + "learning_rate": 7.195311448856952e-05, + "loss": 0.7964, + "step": 30690 + }, + { + "epoch": 1.181713185755534, + "grad_norm": 1.0228421688079834, + "learning_rate": 7.192408988873537e-05, + "loss": 0.914, + "step": 30695 + }, + { + "epoch": 1.181905678537055, + "grad_norm": 1.3062362670898438, + "learning_rate": 7.189506785625722e-05, + "loss": 0.8929, + "step": 30700 + }, + { + "epoch": 1.1820981713185756, + "grad_norm": 0.9503874182701111, + "learning_rate": 7.186604839378891e-05, + "loss": 0.8451, + "step": 30705 + }, + { + "epoch": 1.1822906641000963, + "grad_norm": 1.9427149295806885, + "learning_rate": 7.183703150398414e-05, + "loss": 1.0375, + "step": 30710 + }, + { + "epoch": 1.182483156881617, + "grad_norm": 1.4807151556015015, + "learning_rate": 
7.180801718949626e-05, + "loss": 0.9003, + "step": 30715 + }, + { + "epoch": 1.1826756496631377, + "grad_norm": 1.048420786857605, + "learning_rate": 7.177900545297846e-05, + "loss": 0.775, + "step": 30720 + }, + { + "epoch": 1.1828681424446583, + "grad_norm": 1.6969749927520752, + "learning_rate": 7.174999629708363e-05, + "loss": 0.9417, + "step": 30725 + }, + { + "epoch": 1.183060635226179, + "grad_norm": 1.2834903001785278, + "learning_rate": 7.172098972446453e-05, + "loss": 0.9584, + "step": 30730 + }, + { + "epoch": 1.1832531280076997, + "grad_norm": 2.063692331314087, + "learning_rate": 7.169198573777361e-05, + "loss": 0.8202, + "step": 30735 + }, + { + "epoch": 1.1834456207892203, + "grad_norm": 0.9396731853485107, + "learning_rate": 7.166298433966301e-05, + "loss": 0.9421, + "step": 30740 + }, + { + "epoch": 1.183638113570741, + "grad_norm": 1.4655632972717285, + "learning_rate": 7.163398553278483e-05, + "loss": 0.8273, + "step": 30745 + }, + { + "epoch": 1.1838306063522617, + "grad_norm": 2.0412590503692627, + "learning_rate": 7.160498931979076e-05, + "loss": 0.8433, + "step": 30750 + }, + { + "epoch": 1.1840230991337826, + "grad_norm": 1.9432737827301025, + "learning_rate": 7.157599570333226e-05, + "loss": 0.9141, + "step": 30755 + }, + { + "epoch": 1.1842155919153032, + "grad_norm": 0.863294243812561, + "learning_rate": 7.154700468606073e-05, + "loss": 0.9083, + "step": 30760 + }, + { + "epoch": 1.184408084696824, + "grad_norm": 1.1578996181488037, + "learning_rate": 7.151801627062713e-05, + "loss": 0.8211, + "step": 30765 + }, + { + "epoch": 1.1846005774783446, + "grad_norm": 1.4375747442245483, + "learning_rate": 7.148903045968221e-05, + "loss": 0.7579, + "step": 30770 + }, + { + "epoch": 1.1847930702598652, + "grad_norm": 1.566175937652588, + "learning_rate": 7.146004725587664e-05, + "loss": 0.7958, + "step": 30775 + }, + { + "epoch": 1.184985563041386, + "grad_norm": 2.4917242527008057, + "learning_rate": 7.143106666186068e-05, + "loss": 0.9255, + 
"step": 30780 + }, + { + "epoch": 1.1851780558229066, + "grad_norm": 1.2194336652755737, + "learning_rate": 7.14020886802844e-05, + "loss": 0.715, + "step": 30785 + }, + { + "epoch": 1.1853705486044273, + "grad_norm": 0.9946492314338684, + "learning_rate": 7.137311331379769e-05, + "loss": 0.8673, + "step": 30790 + }, + { + "epoch": 1.1855630413859481, + "grad_norm": 1.6159164905548096, + "learning_rate": 7.134414056505015e-05, + "loss": 0.7092, + "step": 30795 + }, + { + "epoch": 1.1857555341674688, + "grad_norm": 1.3039904832839966, + "learning_rate": 7.131517043669108e-05, + "loss": 0.9336, + "step": 30800 + }, + { + "epoch": 1.1859480269489895, + "grad_norm": 0.7158260345458984, + "learning_rate": 7.12862029313697e-05, + "loss": 0.9515, + "step": 30805 + }, + { + "epoch": 1.1861405197305102, + "grad_norm": 1.4139379262924194, + "learning_rate": 7.12572380517348e-05, + "loss": 0.8344, + "step": 30810 + }, + { + "epoch": 1.1863330125120308, + "grad_norm": 1.963030219078064, + "learning_rate": 7.122827580043509e-05, + "loss": 0.8987, + "step": 30815 + }, + { + "epoch": 1.1865255052935515, + "grad_norm": 1.060595989227295, + "learning_rate": 7.1199316180119e-05, + "loss": 0.7739, + "step": 30820 + }, + { + "epoch": 1.1867179980750722, + "grad_norm": 1.1344969272613525, + "learning_rate": 7.117035919343464e-05, + "loss": 0.9343, + "step": 30825 + }, + { + "epoch": 1.1869104908565928, + "grad_norm": 1.3690916299819946, + "learning_rate": 7.114140484302992e-05, + "loss": 0.6558, + "step": 30830 + }, + { + "epoch": 1.1871029836381135, + "grad_norm": 1.053550124168396, + "learning_rate": 7.11124531315526e-05, + "loss": 0.7993, + "step": 30835 + }, + { + "epoch": 1.1872954764196342, + "grad_norm": 1.509711503982544, + "learning_rate": 7.108350406165007e-05, + "loss": 1.0615, + "step": 30840 + }, + { + "epoch": 1.1874879692011548, + "grad_norm": 1.440314769744873, + "learning_rate": 7.10545576359695e-05, + "loss": 0.7368, + "step": 30845 + }, + { + "epoch": 
1.1876804619826757, + "grad_norm": 1.1829510927200317, + "learning_rate": 7.102561385715794e-05, + "loss": 0.7871, + "step": 30850 + }, + { + "epoch": 1.1878729547641964, + "grad_norm": 1.2442659139633179, + "learning_rate": 7.099667272786205e-05, + "loss": 0.7765, + "step": 30855 + }, + { + "epoch": 1.188065447545717, + "grad_norm": 1.2763118743896484, + "learning_rate": 7.096773425072827e-05, + "loss": 0.9741, + "step": 30860 + }, + { + "epoch": 1.1882579403272377, + "grad_norm": 1.0801767110824585, + "learning_rate": 7.093879842840289e-05, + "loss": 0.7907, + "step": 30865 + }, + { + "epoch": 1.1884504331087584, + "grad_norm": 1.863099217414856, + "learning_rate": 7.090986526353192e-05, + "loss": 0.7885, + "step": 30870 + }, + { + "epoch": 1.188642925890279, + "grad_norm": 2.064089298248291, + "learning_rate": 7.088093475876098e-05, + "loss": 0.9183, + "step": 30875 + }, + { + "epoch": 1.1888354186717998, + "grad_norm": 1.4984971284866333, + "learning_rate": 7.085200691673573e-05, + "loss": 0.9088, + "step": 30880 + }, + { + "epoch": 1.1890279114533204, + "grad_norm": 1.2373710870742798, + "learning_rate": 7.082308174010138e-05, + "loss": 0.8277, + "step": 30885 + }, + { + "epoch": 1.189220404234841, + "grad_norm": 0.864380955696106, + "learning_rate": 7.079415923150285e-05, + "loss": 0.7536, + "step": 30890 + }, + { + "epoch": 1.189412897016362, + "grad_norm": 1.1806195974349976, + "learning_rate": 7.076523939358504e-05, + "loss": 0.7892, + "step": 30895 + }, + { + "epoch": 1.1896053897978827, + "grad_norm": 1.486262559890747, + "learning_rate": 7.07363222289924e-05, + "loss": 0.7652, + "step": 30900 + }, + { + "epoch": 1.1897978825794033, + "grad_norm": 1.093450665473938, + "learning_rate": 7.070740774036926e-05, + "loss": 0.7741, + "step": 30905 + }, + { + "epoch": 1.189990375360924, + "grad_norm": 0.9283474683761597, + "learning_rate": 7.067849593035962e-05, + "loss": 0.7752, + "step": 30910 + }, + { + "epoch": 1.1901828681424447, + "grad_norm": 
1.1739362478256226, + "learning_rate": 7.064958680160729e-05, + "loss": 0.762, + "step": 30915 + }, + { + "epoch": 1.1903753609239653, + "grad_norm": 1.4015986919403076, + "learning_rate": 7.062068035675584e-05, + "loss": 0.866, + "step": 30920 + }, + { + "epoch": 1.190567853705486, + "grad_norm": 1.121469259262085, + "learning_rate": 7.059177659844853e-05, + "loss": 0.7242, + "step": 30925 + }, + { + "epoch": 1.1907603464870067, + "grad_norm": 1.6382770538330078, + "learning_rate": 7.056287552932842e-05, + "loss": 0.8558, + "step": 30930 + }, + { + "epoch": 1.1909528392685274, + "grad_norm": 1.2235866785049438, + "learning_rate": 7.053397715203837e-05, + "loss": 1.0149, + "step": 30935 + }, + { + "epoch": 1.191145332050048, + "grad_norm": 1.0109368562698364, + "learning_rate": 7.050508146922093e-05, + "loss": 0.8432, + "step": 30940 + }, + { + "epoch": 1.1913378248315687, + "grad_norm": 1.2103551626205444, + "learning_rate": 7.047618848351835e-05, + "loss": 0.8176, + "step": 30945 + }, + { + "epoch": 1.1915303176130896, + "grad_norm": 1.0425639152526855, + "learning_rate": 7.044729819757279e-05, + "loss": 0.7266, + "step": 30950 + }, + { + "epoch": 1.1917228103946103, + "grad_norm": 1.1277648210525513, + "learning_rate": 7.041841061402606e-05, + "loss": 0.743, + "step": 30955 + }, + { + "epoch": 1.191915303176131, + "grad_norm": 1.4034432172775269, + "learning_rate": 7.038952573551967e-05, + "loss": 0.9021, + "step": 30960 + }, + { + "epoch": 1.1921077959576516, + "grad_norm": 0.9935101270675659, + "learning_rate": 7.036064356469504e-05, + "loss": 0.7794, + "step": 30965 + }, + { + "epoch": 1.1923002887391723, + "grad_norm": 1.6952626705169678, + "learning_rate": 7.033176410419322e-05, + "loss": 0.8085, + "step": 30970 + }, + { + "epoch": 1.192492781520693, + "grad_norm": 1.1734607219696045, + "learning_rate": 7.030288735665498e-05, + "loss": 0.742, + "step": 30975 + }, + { + "epoch": 1.1926852743022136, + "grad_norm": 1.4427086114883423, + "learning_rate": 
7.027401332472102e-05, + "loss": 0.9636, + "step": 30980 + }, + { + "epoch": 1.1928777670837343, + "grad_norm": 1.4568814039230347, + "learning_rate": 7.024514201103163e-05, + "loss": 0.857, + "step": 30985 + }, + { + "epoch": 1.1930702598652552, + "grad_norm": 0.9864609241485596, + "learning_rate": 7.021627341822684e-05, + "loss": 0.7251, + "step": 30990 + }, + { + "epoch": 1.1932627526467758, + "grad_norm": 1.3467034101486206, + "learning_rate": 7.018740754894659e-05, + "loss": 0.8612, + "step": 30995 + }, + { + "epoch": 1.1934552454282965, + "grad_norm": 0.9315931797027588, + "learning_rate": 7.015854440583044e-05, + "loss": 0.7092, + "step": 31000 + }, + { + "epoch": 1.1936477382098172, + "grad_norm": 1.0659290552139282, + "learning_rate": 7.012968399151769e-05, + "loss": 0.8342, + "step": 31005 + }, + { + "epoch": 1.1938402309913378, + "grad_norm": 1.1278982162475586, + "learning_rate": 7.010082630864748e-05, + "loss": 0.8266, + "step": 31010 + }, + { + "epoch": 1.1940327237728585, + "grad_norm": 1.739660382270813, + "learning_rate": 7.007197135985865e-05, + "loss": 0.8522, + "step": 31015 + }, + { + "epoch": 1.1942252165543792, + "grad_norm": 1.020524263381958, + "learning_rate": 7.004311914778977e-05, + "loss": 0.7267, + "step": 31020 + }, + { + "epoch": 1.1944177093358999, + "grad_norm": 1.6146787405014038, + "learning_rate": 7.001426967507921e-05, + "loss": 0.8447, + "step": 31025 + }, + { + "epoch": 1.1946102021174205, + "grad_norm": 2.2342722415924072, + "learning_rate": 6.998542294436504e-05, + "loss": 0.9637, + "step": 31030 + }, + { + "epoch": 1.1948026948989412, + "grad_norm": 1.4456665515899658, + "learning_rate": 6.995657895828511e-05, + "loss": 0.8051, + "step": 31035 + }, + { + "epoch": 1.1949951876804619, + "grad_norm": 3.0097768306732178, + "learning_rate": 6.992773771947703e-05, + "loss": 0.9212, + "step": 31040 + }, + { + "epoch": 1.1951876804619828, + "grad_norm": 2.814483165740967, + "learning_rate": 6.989889923057813e-05, + "loss": 0.7442, 
+ "step": 31045 + }, + { + "epoch": 1.1953801732435034, + "grad_norm": 1.2800745964050293, + "learning_rate": 6.987006349422546e-05, + "loss": 0.8654, + "step": 31050 + }, + { + "epoch": 1.195572666025024, + "grad_norm": 1.355048418045044, + "learning_rate": 6.98412305130559e-05, + "loss": 0.7145, + "step": 31055 + }, + { + "epoch": 1.1957651588065448, + "grad_norm": 1.7989205121994019, + "learning_rate": 6.981240028970607e-05, + "loss": 1.0386, + "step": 31060 + }, + { + "epoch": 1.1959576515880654, + "grad_norm": 2.1455464363098145, + "learning_rate": 6.97835728268122e-05, + "loss": 1.0023, + "step": 31065 + }, + { + "epoch": 1.196150144369586, + "grad_norm": 1.548008918762207, + "learning_rate": 6.975474812701047e-05, + "loss": 0.834, + "step": 31070 + }, + { + "epoch": 1.1963426371511068, + "grad_norm": 1.2741403579711914, + "learning_rate": 6.972592619293665e-05, + "loss": 0.6641, + "step": 31075 + }, + { + "epoch": 1.1965351299326275, + "grad_norm": 1.5108489990234375, + "learning_rate": 6.969710702722632e-05, + "loss": 0.8214, + "step": 31080 + }, + { + "epoch": 1.1967276227141483, + "grad_norm": 1.490020513534546, + "learning_rate": 6.966829063251484e-05, + "loss": 0.8304, + "step": 31085 + }, + { + "epoch": 1.196920115495669, + "grad_norm": 0.9961896538734436, + "learning_rate": 6.963947701143724e-05, + "loss": 0.804, + "step": 31090 + }, + { + "epoch": 1.1971126082771897, + "grad_norm": 1.3393563032150269, + "learning_rate": 6.961066616662834e-05, + "loss": 0.882, + "step": 31095 + }, + { + "epoch": 1.1973051010587104, + "grad_norm": 0.9488278031349182, + "learning_rate": 6.958185810072273e-05, + "loss": 0.8017, + "step": 31100 + }, + { + "epoch": 1.197497593840231, + "grad_norm": 1.5525720119476318, + "learning_rate": 6.955305281635469e-05, + "loss": 0.8404, + "step": 31105 + }, + { + "epoch": 1.1976900866217517, + "grad_norm": 1.5819134712219238, + "learning_rate": 6.952425031615823e-05, + "loss": 0.8679, + "step": 31110 + }, + { + "epoch": 
1.1978825794032724, + "grad_norm": 1.090757966041565, + "learning_rate": 6.949545060276726e-05, + "loss": 0.6385, + "step": 31115 + }, + { + "epoch": 1.198075072184793, + "grad_norm": 1.378661870956421, + "learning_rate": 6.946665367881523e-05, + "loss": 0.8704, + "step": 31120 + }, + { + "epoch": 1.1982675649663137, + "grad_norm": 1.3583415746688843, + "learning_rate": 6.943785954693547e-05, + "loss": 0.992, + "step": 31125 + }, + { + "epoch": 1.1984600577478344, + "grad_norm": 1.195250153541565, + "learning_rate": 6.940906820976097e-05, + "loss": 0.8763, + "step": 31130 + }, + { + "epoch": 1.198652550529355, + "grad_norm": 1.0746605396270752, + "learning_rate": 6.938027966992458e-05, + "loss": 0.7867, + "step": 31135 + }, + { + "epoch": 1.198845043310876, + "grad_norm": 1.0139212608337402, + "learning_rate": 6.935149393005873e-05, + "loss": 0.8423, + "step": 31140 + }, + { + "epoch": 1.1990375360923966, + "grad_norm": 1.3249413967132568, + "learning_rate": 6.932271099279576e-05, + "loss": 0.8433, + "step": 31145 + }, + { + "epoch": 1.1992300288739173, + "grad_norm": 0.8639804124832153, + "learning_rate": 6.929393086076765e-05, + "loss": 0.9309, + "step": 31150 + }, + { + "epoch": 1.199422521655438, + "grad_norm": 1.7848960161209106, + "learning_rate": 6.926515353660614e-05, + "loss": 0.9963, + "step": 31155 + }, + { + "epoch": 1.1996150144369586, + "grad_norm": 1.1610658168792725, + "learning_rate": 6.923637902294275e-05, + "loss": 0.8316, + "step": 31160 + }, + { + "epoch": 1.1998075072184793, + "grad_norm": 1.335810899734497, + "learning_rate": 6.920760732240868e-05, + "loss": 0.9464, + "step": 31165 + }, + { + "epoch": 1.2, + "grad_norm": 1.3204543590545654, + "learning_rate": 6.917883843763497e-05, + "loss": 0.7098, + "step": 31170 + }, + { + "epoch": 1.2001924927815206, + "grad_norm": 1.3988627195358276, + "learning_rate": 6.915007237125233e-05, + "loss": 0.8002, + "step": 31175 + }, + { + "epoch": 1.2003849855630413, + "grad_norm": 1.4686270952224731, + 
"learning_rate": 6.912130912589116e-05, + "loss": 0.7664, + "step": 31180 + }, + { + "epoch": 1.2005774783445622, + "grad_norm": 1.4970102310180664, + "learning_rate": 6.909254870418176e-05, + "loss": 0.8932, + "step": 31185 + }, + { + "epoch": 1.2007699711260829, + "grad_norm": 1.4880247116088867, + "learning_rate": 6.906379110875403e-05, + "loss": 0.8988, + "step": 31190 + }, + { + "epoch": 1.2009624639076035, + "grad_norm": 1.4801645278930664, + "learning_rate": 6.903503634223764e-05, + "loss": 0.9222, + "step": 31195 + }, + { + "epoch": 1.2011549566891242, + "grad_norm": 1.630956768989563, + "learning_rate": 6.900628440726209e-05, + "loss": 0.8345, + "step": 31200 + }, + { + "epoch": 1.2013474494706449, + "grad_norm": 1.5143691301345825, + "learning_rate": 6.897753530645652e-05, + "loss": 0.7211, + "step": 31205 + }, + { + "epoch": 1.2015399422521655, + "grad_norm": 0.8206589818000793, + "learning_rate": 6.894878904244979e-05, + "loss": 0.8318, + "step": 31210 + }, + { + "epoch": 1.2017324350336862, + "grad_norm": 1.4505168199539185, + "learning_rate": 6.892004561787064e-05, + "loss": 0.8583, + "step": 31215 + }, + { + "epoch": 1.2019249278152069, + "grad_norm": 1.4092106819152832, + "learning_rate": 6.889130503534745e-05, + "loss": 0.9484, + "step": 31220 + }, + { + "epoch": 1.2021174205967275, + "grad_norm": 1.2476885318756104, + "learning_rate": 6.886256729750832e-05, + "loss": 0.7712, + "step": 31225 + }, + { + "epoch": 1.2023099133782482, + "grad_norm": 0.6989136338233948, + "learning_rate": 6.883383240698114e-05, + "loss": 0.7654, + "step": 31230 + }, + { + "epoch": 1.2025024061597689, + "grad_norm": 1.4014019966125488, + "learning_rate": 6.880510036639354e-05, + "loss": 0.9665, + "step": 31235 + }, + { + "epoch": 1.2026948989412898, + "grad_norm": 2.1451468467712402, + "learning_rate": 6.877637117837286e-05, + "loss": 0.9765, + "step": 31240 + }, + { + "epoch": 1.2028873917228105, + "grad_norm": 1.5151429176330566, + "learning_rate": 
6.87476448455462e-05, + "loss": 0.8647, + "step": 31245 + }, + { + "epoch": 1.2030798845043311, + "grad_norm": 0.9325594902038574, + "learning_rate": 6.871892137054038e-05, + "loss": 0.8579, + "step": 31250 + }, + { + "epoch": 1.2032723772858518, + "grad_norm": 1.5600671768188477, + "learning_rate": 6.869020075598198e-05, + "loss": 0.7753, + "step": 31255 + }, + { + "epoch": 1.2034648700673725, + "grad_norm": 1.1633321046829224, + "learning_rate": 6.866148300449733e-05, + "loss": 0.8108, + "step": 31260 + }, + { + "epoch": 1.2036573628488931, + "grad_norm": 1.3864375352859497, + "learning_rate": 6.863276811871246e-05, + "loss": 0.8359, + "step": 31265 + }, + { + "epoch": 1.2038498556304138, + "grad_norm": 1.111289381980896, + "learning_rate": 6.860405610125313e-05, + "loss": 0.8911, + "step": 31270 + }, + { + "epoch": 1.2040423484119345, + "grad_norm": 1.1889983415603638, + "learning_rate": 6.857534695474492e-05, + "loss": 0.9569, + "step": 31275 + }, + { + "epoch": 1.2042348411934554, + "grad_norm": 1.9969587326049805, + "learning_rate": 6.854664068181307e-05, + "loss": 0.9898, + "step": 31280 + }, + { + "epoch": 1.204427333974976, + "grad_norm": 1.4256417751312256, + "learning_rate": 6.851793728508251e-05, + "loss": 0.8671, + "step": 31285 + }, + { + "epoch": 1.2046198267564967, + "grad_norm": 1.4675942659378052, + "learning_rate": 6.84892367671781e-05, + "loss": 0.8998, + "step": 31290 + }, + { + "epoch": 1.2048123195380174, + "grad_norm": 1.571075201034546, + "learning_rate": 6.846053913072423e-05, + "loss": 0.8697, + "step": 31295 + }, + { + "epoch": 1.205004812319538, + "grad_norm": 1.4728553295135498, + "learning_rate": 6.84318443783451e-05, + "loss": 0.874, + "step": 31300 + }, + { + "epoch": 1.2051973051010587, + "grad_norm": 1.1161712408065796, + "learning_rate": 6.840315251266472e-05, + "loss": 0.7356, + "step": 31305 + }, + { + "epoch": 1.2053897978825794, + "grad_norm": 1.4653940200805664, + "learning_rate": 6.837446353630672e-05, + "loss": 0.8024, + 
"step": 31310 + }, + { + "epoch": 1.2055822906641, + "grad_norm": 1.2116683721542358, + "learning_rate": 6.83457774518945e-05, + "loss": 0.8066, + "step": 31315 + }, + { + "epoch": 1.2057747834456207, + "grad_norm": 1.4532692432403564, + "learning_rate": 6.831709426205128e-05, + "loss": 0.8092, + "step": 31320 + }, + { + "epoch": 1.2059672762271414, + "grad_norm": 1.5983264446258545, + "learning_rate": 6.82884139693999e-05, + "loss": 1.2661, + "step": 31325 + }, + { + "epoch": 1.206159769008662, + "grad_norm": 1.1187528371810913, + "learning_rate": 6.825973657656297e-05, + "loss": 0.8836, + "step": 31330 + }, + { + "epoch": 1.206352261790183, + "grad_norm": 1.9153698682785034, + "learning_rate": 6.82310620861629e-05, + "loss": 0.8676, + "step": 31335 + }, + { + "epoch": 1.2065447545717036, + "grad_norm": 1.0382484197616577, + "learning_rate": 6.820239050082175e-05, + "loss": 0.7845, + "step": 31340 + }, + { + "epoch": 1.2067372473532243, + "grad_norm": 1.228818416595459, + "learning_rate": 6.817372182316135e-05, + "loss": 0.9074, + "step": 31345 + }, + { + "epoch": 1.206929740134745, + "grad_norm": 1.1787536144256592, + "learning_rate": 6.814505605580323e-05, + "loss": 0.8794, + "step": 31350 + }, + { + "epoch": 1.2071222329162656, + "grad_norm": 1.3236230611801147, + "learning_rate": 6.811639320136876e-05, + "loss": 0.7517, + "step": 31355 + }, + { + "epoch": 1.2073147256977863, + "grad_norm": 1.2837519645690918, + "learning_rate": 6.808773326247888e-05, + "loss": 1.0796, + "step": 31360 + }, + { + "epoch": 1.207507218479307, + "grad_norm": 1.7796859741210938, + "learning_rate": 6.805907624175443e-05, + "loss": 0.8648, + "step": 31365 + }, + { + "epoch": 1.2076997112608276, + "grad_norm": 1.2428163290023804, + "learning_rate": 6.803042214181586e-05, + "loss": 0.9098, + "step": 31370 + }, + { + "epoch": 1.2078922040423483, + "grad_norm": 0.7685776352882385, + "learning_rate": 6.800177096528337e-05, + "loss": 0.935, + "step": 31375 + }, + { + "epoch": 
1.2080846968238692, + "grad_norm": 1.5274989604949951, + "learning_rate": 6.797312271477699e-05, + "loss": 0.8187, + "step": 31380 + }, + { + "epoch": 1.2082771896053899, + "grad_norm": 1.497298240661621, + "learning_rate": 6.794447739291639e-05, + "loss": 0.7576, + "step": 31385 + }, + { + "epoch": 1.2084696823869105, + "grad_norm": 1.8094556331634521, + "learning_rate": 6.791583500232092e-05, + "loss": 0.8676, + "step": 31390 + }, + { + "epoch": 1.2086621751684312, + "grad_norm": 1.844341516494751, + "learning_rate": 6.788719554560987e-05, + "loss": 0.8661, + "step": 31395 + }, + { + "epoch": 1.2088546679499519, + "grad_norm": 2.015148639678955, + "learning_rate": 6.785855902540205e-05, + "loss": 0.8985, + "step": 31400 + }, + { + "epoch": 1.2090471607314726, + "grad_norm": 1.5119047164916992, + "learning_rate": 6.782992544431603e-05, + "loss": 0.8247, + "step": 31405 + }, + { + "epoch": 1.2092396535129932, + "grad_norm": 1.3965736627578735, + "learning_rate": 6.780129480497028e-05, + "loss": 0.7286, + "step": 31410 + }, + { + "epoch": 1.209432146294514, + "grad_norm": 1.238343358039856, + "learning_rate": 6.777266710998283e-05, + "loss": 0.9662, + "step": 31415 + }, + { + "epoch": 1.2096246390760346, + "grad_norm": 0.8886631727218628, + "learning_rate": 6.774404236197144e-05, + "loss": 0.7578, + "step": 31420 + }, + { + "epoch": 1.2098171318575552, + "grad_norm": 1.1155580282211304, + "learning_rate": 6.771542056355373e-05, + "loss": 0.6954, + "step": 31425 + }, + { + "epoch": 1.2100096246390761, + "grad_norm": 1.5056712627410889, + "learning_rate": 6.768680171734692e-05, + "loss": 0.9816, + "step": 31430 + }, + { + "epoch": 1.2102021174205968, + "grad_norm": 1.3402817249298096, + "learning_rate": 6.765818582596805e-05, + "loss": 0.9098, + "step": 31435 + }, + { + "epoch": 1.2103946102021175, + "grad_norm": 1.0524393320083618, + "learning_rate": 6.762957289203386e-05, + "loss": 0.8848, + "step": 31440 + }, + { + "epoch": 1.2105871029836381, + "grad_norm": 
0.9701569676399231, + "learning_rate": 6.760096291816078e-05, + "loss": 0.9192, + "step": 31445 + }, + { + "epoch": 1.2107795957651588, + "grad_norm": 2.2335519790649414, + "learning_rate": 6.757235590696503e-05, + "loss": 0.8811, + "step": 31450 + }, + { + "epoch": 1.2109720885466795, + "grad_norm": 1.1316584348678589, + "learning_rate": 6.75437518610625e-05, + "loss": 0.796, + "step": 31455 + }, + { + "epoch": 1.2111645813282002, + "grad_norm": 1.6016689538955688, + "learning_rate": 6.751515078306887e-05, + "loss": 0.8402, + "step": 31460 + }, + { + "epoch": 1.2113570741097208, + "grad_norm": 1.7846043109893799, + "learning_rate": 6.748655267559951e-05, + "loss": 0.8634, + "step": 31465 + }, + { + "epoch": 1.2115495668912415, + "grad_norm": 1.7141379117965698, + "learning_rate": 6.745795754126953e-05, + "loss": 1.0208, + "step": 31470 + }, + { + "epoch": 1.2117420596727624, + "grad_norm": 1.225437879562378, + "learning_rate": 6.742936538269373e-05, + "loss": 0.9122, + "step": 31475 + }, + { + "epoch": 1.211934552454283, + "grad_norm": 1.7134424448013306, + "learning_rate": 6.740077620248675e-05, + "loss": 0.8984, + "step": 31480 + }, + { + "epoch": 1.2121270452358037, + "grad_norm": 1.0409084558486938, + "learning_rate": 6.737219000326283e-05, + "loss": 0.9374, + "step": 31485 + }, + { + "epoch": 1.2123195380173244, + "grad_norm": 1.527655005455017, + "learning_rate": 6.734360678763593e-05, + "loss": 0.9277, + "step": 31490 + }, + { + "epoch": 1.212512030798845, + "grad_norm": 1.9206295013427734, + "learning_rate": 6.731502655821993e-05, + "loss": 0.7926, + "step": 31495 + }, + { + "epoch": 1.2127045235803657, + "grad_norm": 1.035962462425232, + "learning_rate": 6.728644931762824e-05, + "loss": 0.8946, + "step": 31500 + }, + { + "epoch": 1.2128970163618864, + "grad_norm": 1.146831750869751, + "learning_rate": 6.725787506847399e-05, + "loss": 0.9099, + "step": 31505 + }, + { + "epoch": 1.213089509143407, + "grad_norm": 1.0527504682540894, + "learning_rate": 
6.723501782474154e-05, + "loss": 0.7984, + "step": 31510 + }, + { + "epoch": 1.2132820019249277, + "grad_norm": 1.390851378440857, + "learning_rate": 6.72064489667592e-05, + "loss": 0.9729, + "step": 31515 + }, + { + "epoch": 1.2134744947064484, + "grad_norm": 1.2551788091659546, + "learning_rate": 6.717788310752988e-05, + "loss": 0.945, + "step": 31520 + }, + { + "epoch": 1.213666987487969, + "grad_norm": 1.2362979650497437, + "learning_rate": 6.714932024966574e-05, + "loss": 1.068, + "step": 31525 + }, + { + "epoch": 1.21385948026949, + "grad_norm": 1.221686601638794, + "learning_rate": 6.712076039577862e-05, + "loss": 0.897, + "step": 31530 + }, + { + "epoch": 1.2140519730510106, + "grad_norm": 1.4022241830825806, + "learning_rate": 6.709220354848018e-05, + "loss": 0.8292, + "step": 31535 + }, + { + "epoch": 1.2142444658325313, + "grad_norm": 1.1567379236221313, + "learning_rate": 6.706364971038173e-05, + "loss": 0.838, + "step": 31540 + }, + { + "epoch": 1.214436958614052, + "grad_norm": 1.1204020977020264, + "learning_rate": 6.70350988840943e-05, + "loss": 0.9189, + "step": 31545 + }, + { + "epoch": 1.2146294513955727, + "grad_norm": 1.358654260635376, + "learning_rate": 6.700655107222874e-05, + "loss": 0.8147, + "step": 31550 + }, + { + "epoch": 1.2148219441770933, + "grad_norm": 1.1854889392852783, + "learning_rate": 6.697800627739552e-05, + "loss": 0.7513, + "step": 31555 + }, + { + "epoch": 1.215014436958614, + "grad_norm": 1.1493016481399536, + "learning_rate": 6.694946450220483e-05, + "loss": 1.0312, + "step": 31560 + }, + { + "epoch": 1.2152069297401347, + "grad_norm": 1.5674629211425781, + "learning_rate": 6.692092574926673e-05, + "loss": 0.7985, + "step": 31565 + }, + { + "epoch": 1.2153994225216556, + "grad_norm": 1.1671700477600098, + "learning_rate": 6.689239002119084e-05, + "loss": 0.9128, + "step": 31570 + }, + { + "epoch": 1.2155919153031762, + "grad_norm": 1.3727703094482422, + "learning_rate": 6.686385732058651e-05, + "loss": 0.8517, + "step": 
31575 + }, + { + "epoch": 1.215784408084697, + "grad_norm": 0.9869505763053894, + "learning_rate": 6.683532765006296e-05, + "loss": 0.8142, + "step": 31580 + }, + { + "epoch": 1.2159769008662176, + "grad_norm": 1.681252121925354, + "learning_rate": 6.680680101222901e-05, + "loss": 0.7783, + "step": 31585 + }, + { + "epoch": 1.2161693936477382, + "grad_norm": 1.7720850706100464, + "learning_rate": 6.677827740969317e-05, + "loss": 0.922, + "step": 31590 + }, + { + "epoch": 1.216361886429259, + "grad_norm": 1.2633975744247437, + "learning_rate": 6.674975684506384e-05, + "loss": 0.8357, + "step": 31595 + }, + { + "epoch": 1.2165543792107796, + "grad_norm": 1.2901790142059326, + "learning_rate": 6.672123932094898e-05, + "loss": 0.856, + "step": 31600 + }, + { + "epoch": 1.2167468719923002, + "grad_norm": 0.9209010004997253, + "learning_rate": 6.66927248399563e-05, + "loss": 0.8126, + "step": 31605 + }, + { + "epoch": 1.216939364773821, + "grad_norm": 1.8657188415527344, + "learning_rate": 6.666421340469331e-05, + "loss": 0.8026, + "step": 31610 + }, + { + "epoch": 1.2171318575553416, + "grad_norm": 0.8723480701446533, + "learning_rate": 6.66357050177672e-05, + "loss": 0.6801, + "step": 31615 + }, + { + "epoch": 1.2173243503368623, + "grad_norm": 1.3046056032180786, + "learning_rate": 6.660719968178479e-05, + "loss": 0.7513, + "step": 31620 + }, + { + "epoch": 1.2175168431183832, + "grad_norm": 1.1734675168991089, + "learning_rate": 6.657869739935282e-05, + "loss": 0.7972, + "step": 31625 + }, + { + "epoch": 1.2177093358999038, + "grad_norm": 1.3330581188201904, + "learning_rate": 6.655019817307758e-05, + "loss": 0.8381, + "step": 31630 + }, + { + "epoch": 1.2179018286814245, + "grad_norm": 1.2588646411895752, + "learning_rate": 6.652170200556508e-05, + "loss": 0.8309, + "step": 31635 + }, + { + "epoch": 1.2180943214629452, + "grad_norm": 1.5502039194107056, + "learning_rate": 6.649320889942121e-05, + "loss": 0.9067, + "step": 31640 + }, + { + "epoch": 
1.2182868142444658, + "grad_norm": 1.2267836332321167, + "learning_rate": 6.646471885725139e-05, + "loss": 0.9091, + "step": 31645 + }, + { + "epoch": 1.2184793070259865, + "grad_norm": 0.8720155954360962, + "learning_rate": 6.643623188166088e-05, + "loss": 0.9406, + "step": 31650 + }, + { + "epoch": 1.2186717998075072, + "grad_norm": 2.29374623298645, + "learning_rate": 6.640774797525464e-05, + "loss": 0.976, + "step": 31655 + }, + { + "epoch": 1.2188642925890278, + "grad_norm": 1.3115676641464233, + "learning_rate": 6.637926714063734e-05, + "loss": 0.7881, + "step": 31660 + }, + { + "epoch": 1.2190567853705485, + "grad_norm": 1.2413272857666016, + "learning_rate": 6.635078938041328e-05, + "loss": 0.8841, + "step": 31665 + }, + { + "epoch": 1.2192492781520694, + "grad_norm": 1.0983569622039795, + "learning_rate": 6.632231469718668e-05, + "loss": 0.8903, + "step": 31670 + }, + { + "epoch": 1.21944177093359, + "grad_norm": 1.3633029460906982, + "learning_rate": 6.62938430935613e-05, + "loss": 0.7965, + "step": 31675 + }, + { + "epoch": 1.2196342637151107, + "grad_norm": 1.1858634948730469, + "learning_rate": 6.626537457214064e-05, + "loss": 0.7937, + "step": 31680 + }, + { + "epoch": 1.2198267564966314, + "grad_norm": 1.2882673740386963, + "learning_rate": 6.623690913552804e-05, + "loss": 0.9946, + "step": 31685 + }, + { + "epoch": 1.220019249278152, + "grad_norm": 0.9842979311943054, + "learning_rate": 6.620844678632641e-05, + "loss": 0.7972, + "step": 31690 + }, + { + "epoch": 1.2202117420596728, + "grad_norm": 1.0719999074935913, + "learning_rate": 6.617998752713852e-05, + "loss": 0.823, + "step": 31695 + }, + { + "epoch": 1.2204042348411934, + "grad_norm": 1.3692537546157837, + "learning_rate": 6.615153136056674e-05, + "loss": 0.7786, + "step": 31700 + }, + { + "epoch": 1.220596727622714, + "grad_norm": 1.2221060991287231, + "learning_rate": 6.612307828921313e-05, + "loss": 0.7953, + "step": 31705 + }, + { + "epoch": 1.2207892204042348, + "grad_norm": 
1.3177229166030884, + "learning_rate": 6.609462831567964e-05, + "loss": 0.7488, + "step": 31710 + }, + { + "epoch": 1.2209817131857554, + "grad_norm": 0.9634941816329956, + "learning_rate": 6.60661814425678e-05, + "loss": 0.7675, + "step": 31715 + }, + { + "epoch": 1.221174205967276, + "grad_norm": 0.9454881548881531, + "learning_rate": 6.603773767247885e-05, + "loss": 0.7102, + "step": 31720 + }, + { + "epoch": 1.221366698748797, + "grad_norm": 1.579321026802063, + "learning_rate": 6.60092970080138e-05, + "loss": 0.9614, + "step": 31725 + }, + { + "epoch": 1.2215591915303177, + "grad_norm": 1.7888600826263428, + "learning_rate": 6.59808594517734e-05, + "loss": 0.96, + "step": 31730 + }, + { + "epoch": 1.2217516843118383, + "grad_norm": 1.8311741352081299, + "learning_rate": 6.595242500635804e-05, + "loss": 0.9041, + "step": 31735 + }, + { + "epoch": 1.221944177093359, + "grad_norm": 1.2068012952804565, + "learning_rate": 6.592399367436787e-05, + "loss": 0.9126, + "step": 31740 + }, + { + "epoch": 1.2221366698748797, + "grad_norm": 0.8985161781311035, + "learning_rate": 6.589556545840274e-05, + "loss": 0.9111, + "step": 31745 + }, + { + "epoch": 1.2223291626564003, + "grad_norm": 1.460936427116394, + "learning_rate": 6.586714036106221e-05, + "loss": 0.6821, + "step": 31750 + }, + { + "epoch": 1.222521655437921, + "grad_norm": 1.1933438777923584, + "learning_rate": 6.583871838494562e-05, + "loss": 0.7843, + "step": 31755 + }, + { + "epoch": 1.2227141482194417, + "grad_norm": 1.5719155073165894, + "learning_rate": 6.581029953265192e-05, + "loss": 0.8344, + "step": 31760 + }, + { + "epoch": 1.2229066410009626, + "grad_norm": 1.3877325057983398, + "learning_rate": 6.578188380677981e-05, + "loss": 0.7456, + "step": 31765 + }, + { + "epoch": 1.2230991337824832, + "grad_norm": 1.7668566703796387, + "learning_rate": 6.575347120992779e-05, + "loss": 1.0394, + "step": 31770 + }, + { + "epoch": 1.223291626564004, + "grad_norm": 1.813501000404358, + "learning_rate": 
6.572506174469398e-05, + "loss": 0.8027, + "step": 31775 + }, + { + "epoch": 1.2234841193455246, + "grad_norm": 0.9930652976036072, + "learning_rate": 6.569665541367616e-05, + "loss": 0.9634, + "step": 31780 + }, + { + "epoch": 1.2236766121270453, + "grad_norm": 1.6402193307876587, + "learning_rate": 6.566825221947201e-05, + "loss": 0.8719, + "step": 31785 + }, + { + "epoch": 1.223869104908566, + "grad_norm": 2.223097562789917, + "learning_rate": 6.563985216467879e-05, + "loss": 0.8185, + "step": 31790 + }, + { + "epoch": 1.2240615976900866, + "grad_norm": 1.260113000869751, + "learning_rate": 6.56114552518934e-05, + "loss": 0.835, + "step": 31795 + }, + { + "epoch": 1.2242540904716073, + "grad_norm": 2.260995864868164, + "learning_rate": 6.558306148371269e-05, + "loss": 0.8712, + "step": 31800 + }, + { + "epoch": 1.224446583253128, + "grad_norm": 1.3736824989318848, + "learning_rate": 6.5554670862733e-05, + "loss": 0.8764, + "step": 31805 + }, + { + "epoch": 1.2246390760346486, + "grad_norm": 1.0253816843032837, + "learning_rate": 6.552628339155044e-05, + "loss": 0.8936, + "step": 31810 + }, + { + "epoch": 1.2248315688161693, + "grad_norm": 1.0663061141967773, + "learning_rate": 6.549789907276094e-05, + "loss": 0.8789, + "step": 31815 + }, + { + "epoch": 1.2250240615976902, + "grad_norm": 1.781774640083313, + "learning_rate": 6.546951790896003e-05, + "loss": 0.8116, + "step": 31820 + }, + { + "epoch": 1.2252165543792108, + "grad_norm": 1.6696699857711792, + "learning_rate": 6.544113990274291e-05, + "loss": 0.9066, + "step": 31825 + }, + { + "epoch": 1.2254090471607315, + "grad_norm": 1.6726038455963135, + "learning_rate": 6.541276505670466e-05, + "loss": 0.7497, + "step": 31830 + }, + { + "epoch": 1.2256015399422522, + "grad_norm": 1.851981282234192, + "learning_rate": 6.538439337343991e-05, + "loss": 0.98, + "step": 31835 + }, + { + "epoch": 1.2257940327237729, + "grad_norm": 1.3425476551055908, + "learning_rate": 6.535602485554307e-05, + "loss": 0.8501, + 
"step": 31840 + }, + { + "epoch": 1.2259865255052935, + "grad_norm": 1.2194117307662964, + "learning_rate": 6.532765950560827e-05, + "loss": 0.8375, + "step": 31845 + }, + { + "epoch": 1.2261790182868142, + "grad_norm": 1.3500804901123047, + "learning_rate": 6.529929732622932e-05, + "loss": 0.9349, + "step": 31850 + }, + { + "epoch": 1.2263715110683349, + "grad_norm": 1.5260719060897827, + "learning_rate": 6.527093831999977e-05, + "loss": 0.8235, + "step": 31855 + }, + { + "epoch": 1.2265640038498558, + "grad_norm": 1.9417953491210938, + "learning_rate": 6.524258248951285e-05, + "loss": 0.8308, + "step": 31860 + }, + { + "epoch": 1.2267564966313764, + "grad_norm": 1.3070498704910278, + "learning_rate": 6.521422983736151e-05, + "loss": 0.8632, + "step": 31865 + }, + { + "epoch": 1.226948989412897, + "grad_norm": 0.9319446086883545, + "learning_rate": 6.51858803661384e-05, + "loss": 0.8022, + "step": 31870 + }, + { + "epoch": 1.2271414821944178, + "grad_norm": 1.5047639608383179, + "learning_rate": 6.515753407843595e-05, + "loss": 0.9273, + "step": 31875 + }, + { + "epoch": 1.2273339749759384, + "grad_norm": 1.0930204391479492, + "learning_rate": 6.512919097684617e-05, + "loss": 0.8992, + "step": 31880 + }, + { + "epoch": 1.227526467757459, + "grad_norm": 1.2178794145584106, + "learning_rate": 6.510085106396085e-05, + "loss": 0.8333, + "step": 31885 + }, + { + "epoch": 1.2277189605389798, + "grad_norm": 1.62590491771698, + "learning_rate": 6.507251434237155e-05, + "loss": 0.7445, + "step": 31890 + }, + { + "epoch": 1.2279114533205004, + "grad_norm": 3.1164746284484863, + "learning_rate": 6.504418081466945e-05, + "loss": 1.0109, + "step": 31895 + }, + { + "epoch": 1.2281039461020211, + "grad_norm": 1.2229804992675781, + "learning_rate": 6.50158504834454e-05, + "loss": 0.8863, + "step": 31900 + }, + { + "epoch": 1.2282964388835418, + "grad_norm": 1.2148635387420654, + "learning_rate": 6.498752335129014e-05, + "loss": 0.817, + "step": 31905 + }, + { + "epoch": 
1.2284889316650625, + "grad_norm": 0.9945757389068604, + "learning_rate": 6.495919942079391e-05, + "loss": 0.819, + "step": 31910 + }, + { + "epoch": 1.2286814244465833, + "grad_norm": 2.1877007484436035, + "learning_rate": 6.493087869454673e-05, + "loss": 0.8977, + "step": 31915 + }, + { + "epoch": 1.228873917228104, + "grad_norm": 2.055363178253174, + "learning_rate": 6.490256117513845e-05, + "loss": 0.9108, + "step": 31920 + }, + { + "epoch": 1.2290664100096247, + "grad_norm": 1.7472233772277832, + "learning_rate": 6.487424686515842e-05, + "loss": 1.0029, + "step": 31925 + }, + { + "epoch": 1.2292589027911454, + "grad_norm": 0.9974002838134766, + "learning_rate": 6.48459357671958e-05, + "loss": 0.8006, + "step": 31930 + }, + { + "epoch": 1.229451395572666, + "grad_norm": 1.9604872465133667, + "learning_rate": 6.481762788383951e-05, + "loss": 0.9948, + "step": 31935 + }, + { + "epoch": 1.2296438883541867, + "grad_norm": 0.9676084518432617, + "learning_rate": 6.478932321767808e-05, + "loss": 0.7514, + "step": 31940 + }, + { + "epoch": 1.2298363811357074, + "grad_norm": 2.2556610107421875, + "learning_rate": 6.476102177129978e-05, + "loss": 0.8251, + "step": 31945 + }, + { + "epoch": 1.230028873917228, + "grad_norm": 0.8798750042915344, + "learning_rate": 6.473272354729263e-05, + "loss": 0.772, + "step": 31950 + }, + { + "epoch": 1.2302213666987487, + "grad_norm": 1.4595062732696533, + "learning_rate": 6.470442854824425e-05, + "loss": 0.9378, + "step": 31955 + }, + { + "epoch": 1.2304138594802696, + "grad_norm": 0.9831540584564209, + "learning_rate": 6.467613677674212e-05, + "loss": 0.752, + "step": 31960 + }, + { + "epoch": 1.2306063522617903, + "grad_norm": 2.3248252868652344, + "learning_rate": 6.464784823537324e-05, + "loss": 0.8944, + "step": 31965 + }, + { + "epoch": 1.230798845043311, + "grad_norm": 1.3205488920211792, + "learning_rate": 6.461956292672447e-05, + "loss": 0.7635, + "step": 31970 + }, + { + "epoch": 1.2309913378248316, + "grad_norm": 
2.114743232727051, + "learning_rate": 6.459128085338229e-05, + "loss": 0.8587, + "step": 31975 + }, + { + "epoch": 1.2311838306063523, + "grad_norm": 1.4038870334625244, + "learning_rate": 6.456300201793292e-05, + "loss": 0.8453, + "step": 31980 + }, + { + "epoch": 1.231376323387873, + "grad_norm": 1.8678901195526123, + "learning_rate": 6.453472642296226e-05, + "loss": 0.9314, + "step": 31985 + }, + { + "epoch": 1.2315688161693936, + "grad_norm": 1.9399640560150146, + "learning_rate": 6.450645407105594e-05, + "loss": 0.9189, + "step": 31990 + }, + { + "epoch": 1.2317613089509143, + "grad_norm": 1.849877953529358, + "learning_rate": 6.447818496479927e-05, + "loss": 0.8623, + "step": 31995 + }, + { + "epoch": 1.231953801732435, + "grad_norm": 0.9687787890434265, + "learning_rate": 6.444991910677725e-05, + "loss": 0.7928, + "step": 32000 + }, + { + "epoch": 1.2321462945139556, + "grad_norm": 1.2318834066390991, + "learning_rate": 6.442165649957467e-05, + "loss": 0.9324, + "step": 32005 + }, + { + "epoch": 1.2323387872954763, + "grad_norm": 1.2354921102523804, + "learning_rate": 6.439339714577592e-05, + "loss": 0.9398, + "step": 32010 + }, + { + "epoch": 1.2325312800769972, + "grad_norm": 1.4701021909713745, + "learning_rate": 6.436514104796507e-05, + "loss": 0.8559, + "step": 32015 + }, + { + "epoch": 1.2327237728585179, + "grad_norm": 0.9806448221206665, + "learning_rate": 6.433688820872607e-05, + "loss": 0.8319, + "step": 32020 + }, + { + "epoch": 1.2329162656400385, + "grad_norm": 1.2079403400421143, + "learning_rate": 6.430863863064238e-05, + "loss": 0.8578, + "step": 32025 + }, + { + "epoch": 1.2331087584215592, + "grad_norm": 1.240198016166687, + "learning_rate": 6.428039231629723e-05, + "loss": 0.8681, + "step": 32030 + }, + { + "epoch": 1.2333012512030799, + "grad_norm": 1.0847597122192383, + "learning_rate": 6.425214926827361e-05, + "loss": 0.8595, + "step": 32035 + }, + { + "epoch": 1.2334937439846005, + "grad_norm": 1.1608355045318604, + "learning_rate": 
6.422390948915414e-05, + "loss": 0.7778, + "step": 32040 + }, + { + "epoch": 1.2336862367661212, + "grad_norm": 0.8766851425170898, + "learning_rate": 6.419567298152111e-05, + "loss": 0.8284, + "step": 32045 + }, + { + "epoch": 1.2338787295476419, + "grad_norm": 1.4966709613800049, + "learning_rate": 6.416743974795665e-05, + "loss": 0.7442, + "step": 32050 + }, + { + "epoch": 1.2340712223291628, + "grad_norm": 1.342774748802185, + "learning_rate": 6.413920979104244e-05, + "loss": 0.875, + "step": 32055 + }, + { + "epoch": 1.2342637151106834, + "grad_norm": 1.4193564653396606, + "learning_rate": 6.411098311335993e-05, + "loss": 0.7753, + "step": 32060 + }, + { + "epoch": 1.2344562078922041, + "grad_norm": 1.7735865116119385, + "learning_rate": 6.408275971749027e-05, + "loss": 0.7883, + "step": 32065 + }, + { + "epoch": 1.2346487006737248, + "grad_norm": 1.5808089971542358, + "learning_rate": 6.405453960601432e-05, + "loss": 0.7633, + "step": 32070 + }, + { + "epoch": 1.2348411934552455, + "grad_norm": 1.0840727090835571, + "learning_rate": 6.402632278151259e-05, + "loss": 0.8083, + "step": 32075 + }, + { + "epoch": 1.2350336862367661, + "grad_norm": 1.475110650062561, + "learning_rate": 6.399810924656537e-05, + "loss": 0.6657, + "step": 32080 + }, + { + "epoch": 1.2352261790182868, + "grad_norm": 0.9028719663619995, + "learning_rate": 6.396989900375256e-05, + "loss": 0.9991, + "step": 32085 + }, + { + "epoch": 1.2354186717998075, + "grad_norm": 1.3939464092254639, + "learning_rate": 6.394169205565377e-05, + "loss": 0.8523, + "step": 32090 + }, + { + "epoch": 1.2356111645813281, + "grad_norm": 1.7557820081710815, + "learning_rate": 6.391348840484841e-05, + "loss": 0.7734, + "step": 32095 + }, + { + "epoch": 1.2358036573628488, + "grad_norm": 1.2897764444351196, + "learning_rate": 6.388528805391548e-05, + "loss": 0.7278, + "step": 32100 + }, + { + "epoch": 1.2359961501443695, + "grad_norm": 1.3506861925125122, + "learning_rate": 6.38570910054337e-05, + "loss": 0.795, 
+ "step": 32105 + }, + { + "epoch": 1.2361886429258904, + "grad_norm": 2.072033405303955, + "learning_rate": 6.382889726198154e-05, + "loss": 0.8527, + "step": 32110 + }, + { + "epoch": 1.236381135707411, + "grad_norm": 1.8020820617675781, + "learning_rate": 6.380070682613711e-05, + "loss": 0.9461, + "step": 32115 + }, + { + "epoch": 1.2365736284889317, + "grad_norm": 2.3181891441345215, + "learning_rate": 6.377251970047822e-05, + "loss": 0.8269, + "step": 32120 + }, + { + "epoch": 1.2367661212704524, + "grad_norm": 1.5341979265213013, + "learning_rate": 6.374433588758246e-05, + "loss": 0.9718, + "step": 32125 + }, + { + "epoch": 1.236958614051973, + "grad_norm": 1.0951958894729614, + "learning_rate": 6.3716155390027e-05, + "loss": 0.8376, + "step": 32130 + }, + { + "epoch": 1.2371511068334937, + "grad_norm": 1.2054345607757568, + "learning_rate": 6.368797821038874e-05, + "loss": 0.7169, + "step": 32135 + }, + { + "epoch": 1.2373435996150144, + "grad_norm": 1.4558501243591309, + "learning_rate": 6.365980435124435e-05, + "loss": 0.8068, + "step": 32140 + }, + { + "epoch": 1.237536092396535, + "grad_norm": 1.592523217201233, + "learning_rate": 6.363163381517015e-05, + "loss": 0.8709, + "step": 32145 + }, + { + "epoch": 1.2377285851780557, + "grad_norm": 1.191009759902954, + "learning_rate": 6.360346660474206e-05, + "loss": 0.9404, + "step": 32150 + }, + { + "epoch": 1.2379210779595766, + "grad_norm": 1.2547787427902222, + "learning_rate": 6.357530272253587e-05, + "loss": 0.7077, + "step": 32155 + }, + { + "epoch": 1.2381135707410973, + "grad_norm": 1.442322850227356, + "learning_rate": 6.354714217112698e-05, + "loss": 0.8027, + "step": 32160 + }, + { + "epoch": 1.238306063522618, + "grad_norm": 1.3657112121582031, + "learning_rate": 6.351898495309041e-05, + "loss": 0.8016, + "step": 32165 + }, + { + "epoch": 1.2384985563041386, + "grad_norm": 1.7618088722229004, + "learning_rate": 6.349083107100104e-05, + "loss": 0.8553, + "step": 32170 + }, + { + "epoch": 
1.2386910490856593, + "grad_norm": 1.7059003114700317, + "learning_rate": 6.346268052743331e-05, + "loss": 0.5883, + "step": 32175 + }, + { + "epoch": 1.23888354186718, + "grad_norm": 1.33328115940094, + "learning_rate": 6.343453332496141e-05, + "loss": 0.7425, + "step": 32180 + }, + { + "epoch": 1.2390760346487006, + "grad_norm": 1.3579884767532349, + "learning_rate": 6.340638946615922e-05, + "loss": 0.7521, + "step": 32185 + }, + { + "epoch": 1.2392685274302213, + "grad_norm": 1.7783567905426025, + "learning_rate": 6.337824895360028e-05, + "loss": 0.8292, + "step": 32190 + }, + { + "epoch": 1.239461020211742, + "grad_norm": 1.4413247108459473, + "learning_rate": 6.335011178985788e-05, + "loss": 0.781, + "step": 32195 + }, + { + "epoch": 1.2396535129932627, + "grad_norm": 1.817164421081543, + "learning_rate": 6.3321977977505e-05, + "loss": 0.8307, + "step": 32200 + }, + { + "epoch": 1.2398460057747833, + "grad_norm": 0.7775023579597473, + "learning_rate": 6.329384751911422e-05, + "loss": 0.8496, + "step": 32205 + }, + { + "epoch": 1.2400384985563042, + "grad_norm": 1.8241838216781616, + "learning_rate": 6.326572041725795e-05, + "loss": 0.8688, + "step": 32210 + }, + { + "epoch": 1.2402309913378249, + "grad_norm": 0.9852082133293152, + "learning_rate": 6.323759667450824e-05, + "loss": 0.8371, + "step": 32215 + }, + { + "epoch": 1.2404234841193456, + "grad_norm": 1.4170035123825073, + "learning_rate": 6.320947629343672e-05, + "loss": 1.0588, + "step": 32220 + }, + { + "epoch": 1.2406159769008662, + "grad_norm": 1.4265270233154297, + "learning_rate": 6.318135927661493e-05, + "loss": 0.8352, + "step": 32225 + }, + { + "epoch": 1.240808469682387, + "grad_norm": 0.9820341467857361, + "learning_rate": 6.315324562661393e-05, + "loss": 0.9075, + "step": 32230 + }, + { + "epoch": 1.2410009624639076, + "grad_norm": 1.0748177766799927, + "learning_rate": 6.312513534600448e-05, + "loss": 0.9917, + "step": 32235 + }, + { + "epoch": 1.2411934552454282, + "grad_norm": 
1.1158158779144287, + "learning_rate": 6.309702843735719e-05, + "loss": 0.851, + "step": 32240 + }, + { + "epoch": 1.241385948026949, + "grad_norm": 1.3649555444717407, + "learning_rate": 6.306892490324217e-05, + "loss": 0.9673, + "step": 32245 + }, + { + "epoch": 1.2415784408084698, + "grad_norm": 1.7820628881454468, + "learning_rate": 6.30408247462293e-05, + "loss": 0.8305, + "step": 32250 + }, + { + "epoch": 1.2417709335899905, + "grad_norm": 1.373089075088501, + "learning_rate": 6.30127279688882e-05, + "loss": 0.881, + "step": 32255 + }, + { + "epoch": 1.2419634263715111, + "grad_norm": 1.3298135995864868, + "learning_rate": 6.298463457378812e-05, + "loss": 0.9557, + "step": 32260 + }, + { + "epoch": 1.2421559191530318, + "grad_norm": 1.0167731046676636, + "learning_rate": 6.295654456349794e-05, + "loss": 0.9262, + "step": 32265 + }, + { + "epoch": 1.2423484119345525, + "grad_norm": 0.9300015568733215, + "learning_rate": 6.292845794058644e-05, + "loss": 0.8913, + "step": 32270 + }, + { + "epoch": 1.2425409047160731, + "grad_norm": 2.170581102371216, + "learning_rate": 6.290037470762186e-05, + "loss": 0.8422, + "step": 32275 + }, + { + "epoch": 1.2427333974975938, + "grad_norm": 2.3894753456115723, + "learning_rate": 6.287229486717224e-05, + "loss": 0.8744, + "step": 32280 + }, + { + "epoch": 1.2429258902791145, + "grad_norm": 1.2343543767929077, + "learning_rate": 6.284421842180534e-05, + "loss": 0.757, + "step": 32285 + }, + { + "epoch": 1.2431183830606352, + "grad_norm": 1.0302842855453491, + "learning_rate": 6.281614537408849e-05, + "loss": 0.8185, + "step": 32290 + }, + { + "epoch": 1.2433108758421558, + "grad_norm": 1.0366548299789429, + "learning_rate": 6.278807572658883e-05, + "loss": 0.9143, + "step": 32295 + }, + { + "epoch": 1.2435033686236765, + "grad_norm": 1.3391799926757812, + "learning_rate": 6.276000948187317e-05, + "loss": 0.8389, + "step": 32300 + }, + { + "epoch": 1.2436958614051974, + "grad_norm": 1.0500547885894775, + "learning_rate": 
6.273194664250795e-05, + "loss": 0.7855, + "step": 32305 + }, + { + "epoch": 1.243888354186718, + "grad_norm": 1.3002054691314697, + "learning_rate": 6.270388721105927e-05, + "loss": 0.8311, + "step": 32310 + }, + { + "epoch": 1.2440808469682387, + "grad_norm": 1.2565590143203735, + "learning_rate": 6.267583119009309e-05, + "loss": 0.7857, + "step": 32315 + }, + { + "epoch": 1.2442733397497594, + "grad_norm": 1.8590493202209473, + "learning_rate": 6.26477785821749e-05, + "loss": 0.9076, + "step": 32320 + }, + { + "epoch": 1.24446583253128, + "grad_norm": 2.736550807952881, + "learning_rate": 6.261972938986989e-05, + "loss": 0.8985, + "step": 32325 + }, + { + "epoch": 1.2446583253128007, + "grad_norm": 1.4446461200714111, + "learning_rate": 6.259168361574303e-05, + "loss": 0.848, + "step": 32330 + }, + { + "epoch": 1.2448508180943214, + "grad_norm": 2.1344213485717773, + "learning_rate": 6.256364126235892e-05, + "loss": 0.9543, + "step": 32335 + }, + { + "epoch": 1.245043310875842, + "grad_norm": 1.068670392036438, + "learning_rate": 6.253560233228176e-05, + "loss": 0.8341, + "step": 32340 + }, + { + "epoch": 1.245235803657363, + "grad_norm": 0.9258108735084534, + "learning_rate": 6.250756682807564e-05, + "loss": 0.7102, + "step": 32345 + }, + { + "epoch": 1.2454282964388836, + "grad_norm": 1.4921131134033203, + "learning_rate": 6.247953475230417e-05, + "loss": 0.854, + "step": 32350 + }, + { + "epoch": 1.2456207892204043, + "grad_norm": 1.102432131767273, + "learning_rate": 6.245150610753066e-05, + "loss": 0.9141, + "step": 32355 + }, + { + "epoch": 1.245813282001925, + "grad_norm": 1.5257620811462402, + "learning_rate": 6.24234808963182e-05, + "loss": 0.8949, + "step": 32360 + }, + { + "epoch": 1.2460057747834457, + "grad_norm": 2.053269624710083, + "learning_rate": 6.239545912122951e-05, + "loss": 0.9681, + "step": 32365 + }, + { + "epoch": 1.2461982675649663, + "grad_norm": 2.117687702178955, + "learning_rate": 6.236744078482693e-05, + "loss": 0.8536, + "step": 
32370 + }, + { + "epoch": 1.246390760346487, + "grad_norm": 1.0575151443481445, + "learning_rate": 6.233942588967264e-05, + "loss": 0.6996, + "step": 32375 + }, + { + "epoch": 1.2465832531280077, + "grad_norm": 1.6774235963821411, + "learning_rate": 6.231141443832835e-05, + "loss": 0.7842, + "step": 32380 + }, + { + "epoch": 1.2467757459095283, + "grad_norm": 0.9772216081619263, + "learning_rate": 6.228340643335554e-05, + "loss": 0.825, + "step": 32385 + }, + { + "epoch": 1.246968238691049, + "grad_norm": 1.221349835395813, + "learning_rate": 6.225540187731538e-05, + "loss": 0.8491, + "step": 32390 + }, + { + "epoch": 1.2471607314725697, + "grad_norm": 1.2611428499221802, + "learning_rate": 6.222740077276869e-05, + "loss": 0.8939, + "step": 32395 + }, + { + "epoch": 1.2473532242540906, + "grad_norm": 1.5732476711273193, + "learning_rate": 6.219940312227596e-05, + "loss": 0.7606, + "step": 32400 + }, + { + "epoch": 1.2475457170356112, + "grad_norm": 1.725197196006775, + "learning_rate": 6.217140892839744e-05, + "loss": 0.8467, + "step": 32405 + }, + { + "epoch": 1.247738209817132, + "grad_norm": 1.427253246307373, + "learning_rate": 6.214341819369294e-05, + "loss": 0.7289, + "step": 32410 + }, + { + "epoch": 1.2479307025986526, + "grad_norm": 1.0685710906982422, + "learning_rate": 6.211543092072209e-05, + "loss": 0.9277, + "step": 32415 + }, + { + "epoch": 1.2481231953801732, + "grad_norm": 1.519148826599121, + "learning_rate": 6.208744711204413e-05, + "loss": 0.9213, + "step": 32420 + }, + { + "epoch": 1.248315688161694, + "grad_norm": 0.941789448261261, + "learning_rate": 6.205946677021797e-05, + "loss": 0.8961, + "step": 32425 + }, + { + "epoch": 1.2485081809432146, + "grad_norm": 1.4877368211746216, + "learning_rate": 6.203148989780223e-05, + "loss": 0.8506, + "step": 32430 + }, + { + "epoch": 1.2487006737247353, + "grad_norm": 1.1104483604431152, + "learning_rate": 6.200351649735524e-05, + "loss": 0.859, + "step": 32435 + }, + { + "epoch": 1.248893166506256, + 
"grad_norm": 1.6220526695251465, + "learning_rate": 6.197554657143496e-05, + "loss": 0.978, + "step": 32440 + }, + { + "epoch": 1.2490856592877768, + "grad_norm": 1.7555328607559204, + "learning_rate": 6.194758012259903e-05, + "loss": 1.0804, + "step": 32445 + }, + { + "epoch": 1.2492781520692975, + "grad_norm": 1.939076542854309, + "learning_rate": 6.191961715340487e-05, + "loss": 0.8503, + "step": 32450 + }, + { + "epoch": 1.2494706448508182, + "grad_norm": 1.9745993614196777, + "learning_rate": 6.18916576664094e-05, + "loss": 0.9226, + "step": 32455 + }, + { + "epoch": 1.2496631376323388, + "grad_norm": 0.7877691984176636, + "learning_rate": 6.186370166416943e-05, + "loss": 0.8174, + "step": 32460 + }, + { + "epoch": 1.2498556304138595, + "grad_norm": 1.0937036275863647, + "learning_rate": 6.183574914924135e-05, + "loss": 0.9914, + "step": 32465 + }, + { + "epoch": 1.2500481231953802, + "grad_norm": 1.2272846698760986, + "learning_rate": 6.180780012418112e-05, + "loss": 0.8287, + "step": 32470 + }, + { + "epoch": 1.2502406159769008, + "grad_norm": 1.2231870889663696, + "learning_rate": 6.177985459154462e-05, + "loss": 0.889, + "step": 32475 + }, + { + "epoch": 1.2504331087584215, + "grad_norm": 1.420955777168274, + "learning_rate": 6.175191255388723e-05, + "loss": 0.9343, + "step": 32480 + }, + { + "epoch": 1.2506256015399422, + "grad_norm": 0.7461833953857422, + "learning_rate": 6.172397401376404e-05, + "loss": 0.7098, + "step": 32485 + }, + { + "epoch": 1.2508180943214628, + "grad_norm": 0.9062157869338989, + "learning_rate": 6.169603897372988e-05, + "loss": 0.8073, + "step": 32490 + }, + { + "epoch": 1.2510105871029835, + "grad_norm": 0.5169772505760193, + "learning_rate": 6.166810743633924e-05, + "loss": 0.9059, + "step": 32495 + }, + { + "epoch": 1.2512030798845044, + "grad_norm": 1.1018903255462646, + "learning_rate": 6.164017940414621e-05, + "loss": 0.9242, + "step": 32500 + }, + { + "epoch": 1.251395572666025, + "grad_norm": 1.6499050855636597, + 
"learning_rate": 6.16122548797047e-05, + "loss": 0.7794, + "step": 32505 + }, + { + "epoch": 1.2515880654475458, + "grad_norm": 1.915618658065796, + "learning_rate": 6.158433386556817e-05, + "loss": 0.8513, + "step": 32510 + }, + { + "epoch": 1.2517805582290664, + "grad_norm": 1.2275880575180054, + "learning_rate": 6.155641636428981e-05, + "loss": 0.8407, + "step": 32515 + }, + { + "epoch": 1.251973051010587, + "grad_norm": 1.4273251295089722, + "learning_rate": 6.152850237842255e-05, + "loss": 0.7142, + "step": 32520 + }, + { + "epoch": 1.2521655437921078, + "grad_norm": 1.4185856580734253, + "learning_rate": 6.15005919105189e-05, + "loss": 0.7348, + "step": 32525 + }, + { + "epoch": 1.2523580365736284, + "grad_norm": 1.3497728109359741, + "learning_rate": 6.147268496313101e-05, + "loss": 0.9843, + "step": 32530 + }, + { + "epoch": 1.252550529355149, + "grad_norm": 1.5742275714874268, + "learning_rate": 6.144478153881093e-05, + "loss": 0.9799, + "step": 32535 + }, + { + "epoch": 1.25274302213667, + "grad_norm": 1.4493976831436157, + "learning_rate": 6.141688164011014e-05, + "loss": 0.8815, + "step": 32540 + }, + { + "epoch": 1.2529355149181907, + "grad_norm": 1.0895041227340698, + "learning_rate": 6.138898526957993e-05, + "loss": 0.8332, + "step": 32545 + }, + { + "epoch": 1.2531280076997113, + "grad_norm": 2.0370841026306152, + "learning_rate": 6.136109242977126e-05, + "loss": 1.0292, + "step": 32550 + }, + { + "epoch": 1.253320500481232, + "grad_norm": 1.0882296562194824, + "learning_rate": 6.133320312323473e-05, + "loss": 0.7347, + "step": 32555 + }, + { + "epoch": 1.2535129932627527, + "grad_norm": 1.2545933723449707, + "learning_rate": 6.13053173525206e-05, + "loss": 0.7906, + "step": 32560 + }, + { + "epoch": 1.2537054860442733, + "grad_norm": 1.7179350852966309, + "learning_rate": 6.127743512017892e-05, + "loss": 0.9072, + "step": 32565 + }, + { + "epoch": 1.253897978825794, + "grad_norm": 1.0224990844726562, + "learning_rate": 6.124955642875927e-05, + 
"loss": 0.8066, + "step": 32570 + }, + { + "epoch": 1.2540904716073147, + "grad_norm": 1.681639313697815, + "learning_rate": 6.122168128081096e-05, + "loss": 0.9829, + "step": 32575 + }, + { + "epoch": 1.2542829643888354, + "grad_norm": 2.0990216732025146, + "learning_rate": 6.119380967888304e-05, + "loss": 1.0276, + "step": 32580 + }, + { + "epoch": 1.254475457170356, + "grad_norm": 1.4491196870803833, + "learning_rate": 6.116594162552416e-05, + "loss": 0.8186, + "step": 32585 + }, + { + "epoch": 1.2546679499518767, + "grad_norm": 1.1606724262237549, + "learning_rate": 6.113807712328261e-05, + "loss": 0.857, + "step": 32590 + }, + { + "epoch": 1.2548604427333974, + "grad_norm": 1.064584493637085, + "learning_rate": 6.111021617470654e-05, + "loss": 0.7753, + "step": 32595 + }, + { + "epoch": 1.2550529355149183, + "grad_norm": 1.3536344766616821, + "learning_rate": 6.108235878234357e-05, + "loss": 0.8481, + "step": 32600 + }, + { + "epoch": 1.255245428296439, + "grad_norm": 1.1210103034973145, + "learning_rate": 6.105450494874105e-05, + "loss": 0.8809, + "step": 32605 + }, + { + "epoch": 1.2554379210779596, + "grad_norm": 1.9485646486282349, + "learning_rate": 6.102665467644607e-05, + "loss": 0.909, + "step": 32610 + }, + { + "epoch": 1.2556304138594803, + "grad_norm": 0.9706501960754395, + "learning_rate": 6.099880796800537e-05, + "loss": 0.9546, + "step": 32615 + }, + { + "epoch": 1.255822906641001, + "grad_norm": 0.9644052386283875, + "learning_rate": 6.097096482596529e-05, + "loss": 0.7045, + "step": 32620 + }, + { + "epoch": 1.2560153994225216, + "grad_norm": 1.0382925271987915, + "learning_rate": 6.094312525287197e-05, + "loss": 0.7066, + "step": 32625 + }, + { + "epoch": 1.2562078922040423, + "grad_norm": 1.0624562501907349, + "learning_rate": 6.0915289251271076e-05, + "loss": 0.7864, + "step": 32630 + }, + { + "epoch": 1.2564003849855632, + "grad_norm": 0.8789560794830322, + "learning_rate": 6.088745682370809e-05, + "loss": 0.9018, + "step": 32635 + }, + { + 
"epoch": 1.2565928777670838, + "grad_norm": 1.3996667861938477, + "learning_rate": 6.0859627972728086e-05, + "loss": 0.8411, + "step": 32640 + }, + { + "epoch": 1.2567853705486045, + "grad_norm": 1.1187728643417358, + "learning_rate": 6.083180270087583e-05, + "loss": 0.7984, + "step": 32645 + }, + { + "epoch": 1.2569778633301252, + "grad_norm": 1.481068730354309, + "learning_rate": 6.080398101069571e-05, + "loss": 0.7356, + "step": 32650 + }, + { + "epoch": 1.2571703561116458, + "grad_norm": 1.1698410511016846, + "learning_rate": 6.0776162904731915e-05, + "loss": 0.9059, + "step": 32655 + }, + { + "epoch": 1.2573628488931665, + "grad_norm": 1.0608761310577393, + "learning_rate": 6.0748348385528185e-05, + "loss": 0.8176, + "step": 32660 + }, + { + "epoch": 1.2575553416746872, + "grad_norm": 1.4496049880981445, + "learning_rate": 6.0720537455627944e-05, + "loss": 1.139, + "step": 32665 + }, + { + "epoch": 1.2577478344562079, + "grad_norm": 1.7745342254638672, + "learning_rate": 6.069273011757439e-05, + "loss": 0.924, + "step": 32670 + }, + { + "epoch": 1.2579403272377285, + "grad_norm": 1.3103641271591187, + "learning_rate": 6.066492637391028e-05, + "loss": 0.9955, + "step": 32675 + }, + { + "epoch": 1.2581328200192492, + "grad_norm": 1.4893795251846313, + "learning_rate": 6.063712622717803e-05, + "loss": 0.7122, + "step": 32680 + }, + { + "epoch": 1.2583253128007699, + "grad_norm": 1.8530791997909546, + "learning_rate": 6.060932967991988e-05, + "loss": 0.7563, + "step": 32685 + }, + { + "epoch": 1.2585178055822905, + "grad_norm": 0.8076843619346619, + "learning_rate": 6.058153673467759e-05, + "loss": 0.7344, + "step": 32690 + }, + { + "epoch": 1.2587102983638114, + "grad_norm": 1.2442243099212646, + "learning_rate": 6.055374739399261e-05, + "loss": 0.8517, + "step": 32695 + }, + { + "epoch": 1.258902791145332, + "grad_norm": 1.8538851737976074, + "learning_rate": 6.052596166040616e-05, + "loss": 0.799, + "step": 32700 + }, + { + "epoch": 1.2590952839268528, + 
"grad_norm": 1.2174726724624634, + "learning_rate": 6.049817953645902e-05, + "loss": 0.734, + "step": 32705 + }, + { + "epoch": 1.2592877767083734, + "grad_norm": 1.1436805725097656, + "learning_rate": 6.0470401024691646e-05, + "loss": 0.8512, + "step": 32710 + }, + { + "epoch": 1.259480269489894, + "grad_norm": 0.9243611097335815, + "learning_rate": 6.044262612764429e-05, + "loss": 0.7716, + "step": 32715 + }, + { + "epoch": 1.2596727622714148, + "grad_norm": 1.3061074018478394, + "learning_rate": 6.0414854847856694e-05, + "loss": 0.7524, + "step": 32720 + }, + { + "epoch": 1.2598652550529355, + "grad_norm": 1.4081029891967773, + "learning_rate": 6.038708718786843e-05, + "loss": 0.8902, + "step": 32725 + }, + { + "epoch": 1.2600577478344563, + "grad_norm": 1.0270228385925293, + "learning_rate": 6.0359323150218616e-05, + "loss": 1.0107, + "step": 32730 + }, + { + "epoch": 1.260250240615977, + "grad_norm": 1.0451138019561768, + "learning_rate": 6.033156273744607e-05, + "loss": 0.9819, + "step": 32735 + }, + { + "epoch": 1.2604427333974977, + "grad_norm": 0.9528657793998718, + "learning_rate": 6.030380595208938e-05, + "loss": 0.8202, + "step": 32740 + }, + { + "epoch": 1.2606352261790184, + "grad_norm": 2.1417109966278076, + "learning_rate": 6.027605279668666e-05, + "loss": 0.8817, + "step": 32745 + }, + { + "epoch": 1.260827718960539, + "grad_norm": 1.5614452362060547, + "learning_rate": 6.024830327377573e-05, + "loss": 0.8378, + "step": 32750 + }, + { + "epoch": 1.2610202117420597, + "grad_norm": 1.5928235054016113, + "learning_rate": 6.0220557385894184e-05, + "loss": 0.9096, + "step": 32755 + }, + { + "epoch": 1.2612127045235804, + "grad_norm": 0.8466683030128479, + "learning_rate": 6.019281513557913e-05, + "loss": 0.9457, + "step": 32760 + }, + { + "epoch": 1.261405197305101, + "grad_norm": 1.4196317195892334, + "learning_rate": 6.01650765253674e-05, + "loss": 0.7927, + "step": 32765 + }, + { + "epoch": 1.2615976900866217, + "grad_norm": 0.9929688572883606, + 
"learning_rate": 6.01373415577956e-05, + "loss": 0.8208, + "step": 32770 + }, + { + "epoch": 1.2617901828681424, + "grad_norm": 1.321770429611206, + "learning_rate": 6.0109610235399826e-05, + "loss": 0.9258, + "step": 32775 + }, + { + "epoch": 1.261982675649663, + "grad_norm": 1.1568975448608398, + "learning_rate": 6.008188256071592e-05, + "loss": 0.9201, + "step": 32780 + }, + { + "epoch": 1.2621751684311837, + "grad_norm": 1.2595579624176025, + "learning_rate": 6.0054158536279446e-05, + "loss": 0.8037, + "step": 32785 + }, + { + "epoch": 1.2623676612127046, + "grad_norm": 1.0337523221969604, + "learning_rate": 6.0026438164625565e-05, + "loss": 0.7372, + "step": 32790 + }, + { + "epoch": 1.2625601539942253, + "grad_norm": 2.097609758377075, + "learning_rate": 5.999872144828907e-05, + "loss": 0.9208, + "step": 32795 + }, + { + "epoch": 1.262752646775746, + "grad_norm": 0.9220959544181824, + "learning_rate": 5.997100838980456e-05, + "loss": 0.7774, + "step": 32800 + }, + { + "epoch": 1.2629451395572666, + "grad_norm": 0.9968023896217346, + "learning_rate": 5.994329899170617e-05, + "loss": 0.8029, + "step": 32805 + }, + { + "epoch": 1.2631376323387873, + "grad_norm": 2.015770196914673, + "learning_rate": 5.991559325652769e-05, + "loss": 0.8083, + "step": 32810 + }, + { + "epoch": 1.263330125120308, + "grad_norm": 1.5899488925933838, + "learning_rate": 5.988789118680272e-05, + "loss": 0.7113, + "step": 32815 + }, + { + "epoch": 1.2635226179018286, + "grad_norm": 1.1025646924972534, + "learning_rate": 5.986019278506436e-05, + "loss": 0.8005, + "step": 32820 + }, + { + "epoch": 1.2637151106833493, + "grad_norm": 1.4543554782867432, + "learning_rate": 5.983249805384545e-05, + "loss": 0.7893, + "step": 32825 + }, + { + "epoch": 1.2639076034648702, + "grad_norm": 1.1230753660202026, + "learning_rate": 5.980480699567854e-05, + "loss": 0.9324, + "step": 32830 + }, + { + "epoch": 1.2641000962463909, + "grad_norm": 1.583644986152649, + "learning_rate": 5.9777119613095755e-05, 
+ "loss": 0.7477, + "step": 32835 + }, + { + "epoch": 1.2642925890279115, + "grad_norm": 1.9501726627349854, + "learning_rate": 5.9749435908628915e-05, + "loss": 0.8344, + "step": 32840 + }, + { + "epoch": 1.2644850818094322, + "grad_norm": 0.6011807918548584, + "learning_rate": 5.972175588480954e-05, + "loss": 0.7556, + "step": 32845 + }, + { + "epoch": 1.2646775745909529, + "grad_norm": 1.0539261102676392, + "learning_rate": 5.9694079544168766e-05, + "loss": 0.8686, + "step": 32850 + }, + { + "epoch": 1.2648700673724735, + "grad_norm": 1.184226393699646, + "learning_rate": 5.966640688923739e-05, + "loss": 0.908, + "step": 32855 + }, + { + "epoch": 1.2650625601539942, + "grad_norm": 0.9844930768013, + "learning_rate": 5.963873792254595e-05, + "loss": 0.724, + "step": 32860 + }, + { + "epoch": 1.2652550529355149, + "grad_norm": 1.4224261045455933, + "learning_rate": 5.9611072646624565e-05, + "loss": 0.8247, + "step": 32865 + }, + { + "epoch": 1.2654475457170355, + "grad_norm": 1.5510427951812744, + "learning_rate": 5.9583411064002995e-05, + "loss": 0.8445, + "step": 32870 + }, + { + "epoch": 1.2656400384985562, + "grad_norm": 1.473929524421692, + "learning_rate": 5.955575317721078e-05, + "loss": 0.7831, + "step": 32875 + }, + { + "epoch": 1.265832531280077, + "grad_norm": 1.2767577171325684, + "learning_rate": 5.9528098988777024e-05, + "loss": 0.7781, + "step": 32880 + }, + { + "epoch": 1.2660250240615976, + "grad_norm": 1.8396121263504028, + "learning_rate": 5.950044850123047e-05, + "loss": 0.856, + "step": 32885 + }, + { + "epoch": 1.2662175168431185, + "grad_norm": 0.8659001588821411, + "learning_rate": 5.947280171709966e-05, + "loss": 0.9269, + "step": 32890 + }, + { + "epoch": 1.2664100096246391, + "grad_norm": 2.1501402854919434, + "learning_rate": 5.9445158638912646e-05, + "loss": 0.8831, + "step": 32895 + }, + { + "epoch": 1.2666025024061598, + "grad_norm": 0.7777537703514099, + "learning_rate": 5.941751926919721e-05, + "loss": 0.7503, + "step": 32900 + }, 
+ { + "epoch": 1.2667949951876805, + "grad_norm": 1.1741917133331299, + "learning_rate": 5.9389883610480835e-05, + "loss": 0.9508, + "step": 32905 + }, + { + "epoch": 1.2669874879692011, + "grad_norm": 1.2577232122421265, + "learning_rate": 5.936225166529057e-05, + "loss": 0.7789, + "step": 32910 + }, + { + "epoch": 1.2671799807507218, + "grad_norm": 1.0245388746261597, + "learning_rate": 5.933462343615317e-05, + "loss": 0.8104, + "step": 32915 + }, + { + "epoch": 1.2673724735322425, + "grad_norm": 1.5200345516204834, + "learning_rate": 5.9306998925595105e-05, + "loss": 0.8895, + "step": 32920 + }, + { + "epoch": 1.2675649663137634, + "grad_norm": 1.0815972089767456, + "learning_rate": 5.927937813614241e-05, + "loss": 0.746, + "step": 32925 + }, + { + "epoch": 1.267757459095284, + "grad_norm": 1.1192761659622192, + "learning_rate": 5.9251761070320845e-05, + "loss": 0.8543, + "step": 32930 + }, + { + "epoch": 1.2679499518768047, + "grad_norm": 1.0540046691894531, + "learning_rate": 5.9224147730655766e-05, + "loss": 0.9522, + "step": 32935 + }, + { + "epoch": 1.2681424446583254, + "grad_norm": 1.5147613286972046, + "learning_rate": 5.9196538119672297e-05, + "loss": 0.868, + "step": 32940 + }, + { + "epoch": 1.268334937439846, + "grad_norm": 2.390244960784912, + "learning_rate": 5.91689322398951e-05, + "loss": 0.7859, + "step": 32945 + }, + { + "epoch": 1.2685274302213667, + "grad_norm": 1.3347294330596924, + "learning_rate": 5.914133009384859e-05, + "loss": 0.8369, + "step": 32950 + }, + { + "epoch": 1.2687199230028874, + "grad_norm": 1.1025590896606445, + "learning_rate": 5.911373168405676e-05, + "loss": 0.7856, + "step": 32955 + }, + { + "epoch": 1.268912415784408, + "grad_norm": 1.335947871208191, + "learning_rate": 5.908613701304331e-05, + "loss": 0.8423, + "step": 32960 + }, + { + "epoch": 1.2691049085659287, + "grad_norm": 1.9916443824768066, + "learning_rate": 5.905854608333163e-05, + "loss": 0.7697, + "step": 32965 + }, + { + "epoch": 1.2692974013474494, + 
"grad_norm": 1.1374703645706177, + "learning_rate": 5.903095889744466e-05, + "loss": 0.6431, + "step": 32970 + }, + { + "epoch": 1.26948989412897, + "grad_norm": 1.484373688697815, + "learning_rate": 5.900337545790513e-05, + "loss": 0.8287, + "step": 32975 + }, + { + "epoch": 1.2696823869104907, + "grad_norm": 1.3706483840942383, + "learning_rate": 5.8975795767235354e-05, + "loss": 0.9413, + "step": 32980 + }, + { + "epoch": 1.2698748796920116, + "grad_norm": 1.4376710653305054, + "learning_rate": 5.894821982795724e-05, + "loss": 0.7843, + "step": 32985 + }, + { + "epoch": 1.2700673724735323, + "grad_norm": 1.0955665111541748, + "learning_rate": 5.892064764259253e-05, + "loss": 0.8953, + "step": 32990 + }, + { + "epoch": 1.270259865255053, + "grad_norm": 1.9772429466247559, + "learning_rate": 5.889307921366245e-05, + "loss": 0.7771, + "step": 32995 + }, + { + "epoch": 1.2704523580365736, + "grad_norm": 0.92624431848526, + "learning_rate": 5.886551454368794e-05, + "loss": 0.8009, + "step": 33000 + }, + { + "epoch": 1.2706448508180943, + "grad_norm": 1.8379896879196167, + "learning_rate": 5.883795363518968e-05, + "loss": 1.0357, + "step": 33005 + }, + { + "epoch": 1.270837343599615, + "grad_norm": 0.9164409041404724, + "learning_rate": 5.881039649068788e-05, + "loss": 0.8232, + "step": 33010 + }, + { + "epoch": 1.2710298363811356, + "grad_norm": 1.9721314907073975, + "learning_rate": 5.8782843112702436e-05, + "loss": 0.6249, + "step": 33015 + }, + { + "epoch": 1.2712223291626565, + "grad_norm": 1.4756965637207031, + "learning_rate": 5.8755293503752975e-05, + "loss": 0.7346, + "step": 33020 + }, + { + "epoch": 1.2714148219441772, + "grad_norm": 1.0704786777496338, + "learning_rate": 5.8727747666358704e-05, + "loss": 0.8372, + "step": 33025 + }, + { + "epoch": 1.2716073147256979, + "grad_norm": 0.9195441007614136, + "learning_rate": 5.8700205603038484e-05, + "loss": 0.8975, + "step": 33030 + }, + { + "epoch": 1.2717998075072185, + "grad_norm": 0.9442548155784607, + 
"learning_rate": 5.867266731631088e-05, + "loss": 0.8154, + "step": 33035 + }, + { + "epoch": 1.2719923002887392, + "grad_norm": 1.0243034362792969, + "learning_rate": 5.86451328086941e-05, + "loss": 0.9787, + "step": 33040 + }, + { + "epoch": 1.27218479307026, + "grad_norm": 1.9684053659439087, + "learning_rate": 5.8617602082705955e-05, + "loss": 0.8663, + "step": 33045 + }, + { + "epoch": 1.2723772858517806, + "grad_norm": 2.0619711875915527, + "learning_rate": 5.859007514086399e-05, + "loss": 0.8495, + "step": 33050 + }, + { + "epoch": 1.2725697786333012, + "grad_norm": 1.4054616689682007, + "learning_rate": 5.856255198568532e-05, + "loss": 0.8247, + "step": 33055 + }, + { + "epoch": 1.272762271414822, + "grad_norm": 1.5287714004516602, + "learning_rate": 5.853503261968676e-05, + "loss": 0.9497, + "step": 33060 + }, + { + "epoch": 1.2729547641963426, + "grad_norm": 2.1714093685150146, + "learning_rate": 5.850751704538482e-05, + "loss": 0.8473, + "step": 33065 + }, + { + "epoch": 1.2731472569778632, + "grad_norm": 2.1262662410736084, + "learning_rate": 5.848000526529558e-05, + "loss": 0.9971, + "step": 33070 + }, + { + "epoch": 1.273339749759384, + "grad_norm": 0.8755722641944885, + "learning_rate": 5.845249728193476e-05, + "loss": 0.8989, + "step": 33075 + }, + { + "epoch": 1.2735322425409046, + "grad_norm": 1.1650525331497192, + "learning_rate": 5.842499309781789e-05, + "loss": 0.8994, + "step": 33080 + }, + { + "epoch": 1.2737247353224255, + "grad_norm": 1.2620556354522705, + "learning_rate": 5.839749271545999e-05, + "loss": 0.8267, + "step": 33085 + }, + { + "epoch": 1.2739172281039461, + "grad_norm": 1.1734013557434082, + "learning_rate": 5.836999613737577e-05, + "loss": 0.9163, + "step": 33090 + }, + { + "epoch": 1.2741097208854668, + "grad_norm": 2.227886199951172, + "learning_rate": 5.8342503366079624e-05, + "loss": 0.878, + "step": 33095 + }, + { + "epoch": 1.2743022136669875, + "grad_norm": 0.975045382976532, + "learning_rate": 5.83150144040856e-05, + 
"loss": 0.873, + "step": 33100 + }, + { + "epoch": 1.2744947064485082, + "grad_norm": 0.9172561764717102, + "learning_rate": 5.828752925390737e-05, + "loss": 0.7909, + "step": 33105 + }, + { + "epoch": 1.2746871992300288, + "grad_norm": 1.7539204359054565, + "learning_rate": 5.82600479180583e-05, + "loss": 0.915, + "step": 33110 + }, + { + "epoch": 1.2748796920115495, + "grad_norm": 2.9440081119537354, + "learning_rate": 5.823257039905131e-05, + "loss": 0.8617, + "step": 33115 + }, + { + "epoch": 1.2750721847930704, + "grad_norm": 1.3572540283203125, + "learning_rate": 5.8205096699399074e-05, + "loss": 0.8082, + "step": 33120 + }, + { + "epoch": 1.275264677574591, + "grad_norm": 1.449960470199585, + "learning_rate": 5.8177626821613876e-05, + "loss": 0.8901, + "step": 33125 + }, + { + "epoch": 1.2754571703561117, + "grad_norm": 1.7100377082824707, + "learning_rate": 5.8150160768207704e-05, + "loss": 0.9157, + "step": 33130 + }, + { + "epoch": 1.2756496631376324, + "grad_norm": 0.9856832027435303, + "learning_rate": 5.812269854169201e-05, + "loss": 0.7319, + "step": 33135 + }, + { + "epoch": 1.275842155919153, + "grad_norm": 2.1571691036224365, + "learning_rate": 5.809524014457821e-05, + "loss": 0.8611, + "step": 33140 + }, + { + "epoch": 1.2760346487006737, + "grad_norm": 1.537744402885437, + "learning_rate": 5.806778557937707e-05, + "loss": 0.967, + "step": 33145 + }, + { + "epoch": 1.2762271414821944, + "grad_norm": 2.2577595710754395, + "learning_rate": 5.8040334848599166e-05, + "loss": 0.8516, + "step": 33150 + }, + { + "epoch": 1.276419634263715, + "grad_norm": 1.4101483821868896, + "learning_rate": 5.801288795475466e-05, + "loss": 0.8992, + "step": 33155 + }, + { + "epoch": 1.2766121270452357, + "grad_norm": 1.332582950592041, + "learning_rate": 5.798544490035347e-05, + "loss": 0.8865, + "step": 33160 + }, + { + "epoch": 1.2768046198267564, + "grad_norm": 1.6516834497451782, + "learning_rate": 5.795800568790495e-05, + "loss": 0.8555, + "step": 33165 + }, + { + 
"epoch": 1.276997112608277, + "grad_norm": 1.0843029022216797, + "learning_rate": 5.793057031991836e-05, + "loss": 0.7221, + "step": 33170 + }, + { + "epoch": 1.2771896053897978, + "grad_norm": 0.715171754360199, + "learning_rate": 5.790313879890241e-05, + "loss": 0.8228, + "step": 33175 + }, + { + "epoch": 1.2773820981713186, + "grad_norm": 1.1554157733917236, + "learning_rate": 5.787571112736554e-05, + "loss": 0.8982, + "step": 33180 + }, + { + "epoch": 1.2775745909528393, + "grad_norm": 1.4646668434143066, + "learning_rate": 5.784828730781585e-05, + "loss": 0.9118, + "step": 33185 + }, + { + "epoch": 1.27776708373436, + "grad_norm": 1.4582480192184448, + "learning_rate": 5.782086734276109e-05, + "loss": 0.6899, + "step": 33190 + }, + { + "epoch": 1.2779595765158807, + "grad_norm": 0.8803904056549072, + "learning_rate": 5.779345123470853e-05, + "loss": 0.918, + "step": 33195 + }, + { + "epoch": 1.2781520692974013, + "grad_norm": 1.423751711845398, + "learning_rate": 5.7766038986165325e-05, + "loss": 0.7573, + "step": 33200 + }, + { + "epoch": 1.278344562078922, + "grad_norm": 1.1513595581054688, + "learning_rate": 5.7738630599638054e-05, + "loss": 0.8907, + "step": 33205 + }, + { + "epoch": 1.2785370548604427, + "grad_norm": 0.7591530084609985, + "learning_rate": 5.7711226077633065e-05, + "loss": 0.7822, + "step": 33210 + }, + { + "epoch": 1.2787295476419636, + "grad_norm": 1.3467390537261963, + "learning_rate": 5.768382542265629e-05, + "loss": 0.97, + "step": 33215 + }, + { + "epoch": 1.2789220404234842, + "grad_norm": 0.9523210525512695, + "learning_rate": 5.765642863721341e-05, + "loss": 0.861, + "step": 33220 + }, + { + "epoch": 1.279114533205005, + "grad_norm": 1.2797752618789673, + "learning_rate": 5.76290357238096e-05, + "loss": 0.8446, + "step": 33225 + }, + { + "epoch": 1.2793070259865256, + "grad_norm": 2.0043842792510986, + "learning_rate": 5.7601646684949784e-05, + "loss": 0.8505, + "step": 33230 + }, + { + "epoch": 1.2794995187680462, + "grad_norm": 
1.1926679611206055, + "learning_rate": 5.7574261523138514e-05, + "loss": 1.039, + "step": 33235 + }, + { + "epoch": 1.279692011549567, + "grad_norm": 1.4371801614761353, + "learning_rate": 5.754688024088e-05, + "loss": 0.975, + "step": 33240 + }, + { + "epoch": 1.2798845043310876, + "grad_norm": 0.7704469561576843, + "learning_rate": 5.751950284067809e-05, + "loss": 0.6496, + "step": 33245 + }, + { + "epoch": 1.2800769971126083, + "grad_norm": 1.707034707069397, + "learning_rate": 5.749212932503614e-05, + "loss": 0.8091, + "step": 33250 + }, + { + "epoch": 1.280269489894129, + "grad_norm": 1.1753286123275757, + "learning_rate": 5.746475969645747e-05, + "loss": 0.7916, + "step": 33255 + }, + { + "epoch": 1.2804619826756496, + "grad_norm": 1.4952518939971924, + "learning_rate": 5.7437393957444716e-05, + "loss": 0.8581, + "step": 33260 + }, + { + "epoch": 1.2806544754571703, + "grad_norm": 1.7860945463180542, + "learning_rate": 5.7410032110500335e-05, + "loss": 0.6714, + "step": 33265 + }, + { + "epoch": 1.280846968238691, + "grad_norm": 1.6719321012496948, + "learning_rate": 5.7382674158126385e-05, + "loss": 0.9253, + "step": 33270 + }, + { + "epoch": 1.2810394610202118, + "grad_norm": 1.7569420337677002, + "learning_rate": 5.735532010282461e-05, + "loss": 0.8774, + "step": 33275 + }, + { + "epoch": 1.2812319538017325, + "grad_norm": 0.898158073425293, + "learning_rate": 5.732796994709623e-05, + "loss": 0.8034, + "step": 33280 + }, + { + "epoch": 1.2814244465832532, + "grad_norm": 1.5631740093231201, + "learning_rate": 5.73006236934424e-05, + "loss": 0.8423, + "step": 33285 + }, + { + "epoch": 1.2816169393647738, + "grad_norm": 1.7265678644180298, + "learning_rate": 5.727328134436364e-05, + "loss": 0.8724, + "step": 33290 + }, + { + "epoch": 1.2818094321462945, + "grad_norm": 0.971207320690155, + "learning_rate": 5.7245942902360274e-05, + "loss": 0.7923, + "step": 33295 + }, + { + "epoch": 1.2820019249278152, + "grad_norm": 1.0335525274276733, + "learning_rate": 
5.721860836993218e-05, + "loss": 0.6989, + "step": 33300 + }, + { + "epoch": 1.2821944177093358, + "grad_norm": 0.9981820583343506, + "learning_rate": 5.719127774957902e-05, + "loss": 0.9665, + "step": 33305 + }, + { + "epoch": 1.2823869104908565, + "grad_norm": 2.178272247314453, + "learning_rate": 5.716395104379988e-05, + "loss": 0.8747, + "step": 33310 + }, + { + "epoch": 1.2825794032723774, + "grad_norm": 1.31416654586792, + "learning_rate": 5.713662825509365e-05, + "loss": 0.8943, + "step": 33315 + }, + { + "epoch": 1.282771896053898, + "grad_norm": 1.1709874868392944, + "learning_rate": 5.710930938595882e-05, + "loss": 0.8149, + "step": 33320 + }, + { + "epoch": 1.2829643888354187, + "grad_norm": 1.7127902507781982, + "learning_rate": 5.708199443889353e-05, + "loss": 0.8275, + "step": 33325 + }, + { + "epoch": 1.2831568816169394, + "grad_norm": 0.7726919054985046, + "learning_rate": 5.705468341639557e-05, + "loss": 0.7617, + "step": 33330 + }, + { + "epoch": 1.28334937439846, + "grad_norm": 1.722057819366455, + "learning_rate": 5.702737632096229e-05, + "loss": 0.8446, + "step": 33335 + }, + { + "epoch": 1.2835418671799808, + "grad_norm": 1.3741867542266846, + "learning_rate": 5.700007315509078e-05, + "loss": 0.9602, + "step": 33340 + }, + { + "epoch": 1.2837343599615014, + "grad_norm": 2.013511896133423, + "learning_rate": 5.6972773921277734e-05, + "loss": 0.8998, + "step": 33345 + }, + { + "epoch": 1.283926852743022, + "grad_norm": 2.3441731929779053, + "learning_rate": 5.6945478622019524e-05, + "loss": 0.857, + "step": 33350 + }, + { + "epoch": 1.2841193455245428, + "grad_norm": 1.1220638751983643, + "learning_rate": 5.691818725981199e-05, + "loss": 0.9617, + "step": 33355 + }, + { + "epoch": 1.2843118383060634, + "grad_norm": 1.3120940923690796, + "learning_rate": 5.6890899837150944e-05, + "loss": 0.8752, + "step": 33360 + }, + { + "epoch": 1.284504331087584, + "grad_norm": 1.2576730251312256, + "learning_rate": 5.686361635653148e-05, + "loss": 0.8307, + 
"step": 33365 + }, + { + "epoch": 1.2846968238691048, + "grad_norm": 1.0636831521987915, + "learning_rate": 5.6836336820448556e-05, + "loss": 0.7714, + "step": 33370 + }, + { + "epoch": 1.2848893166506257, + "grad_norm": 0.9599676728248596, + "learning_rate": 5.680906123139669e-05, + "loss": 0.8351, + "step": 33375 + }, + { + "epoch": 1.2850818094321463, + "grad_norm": 1.8764914274215698, + "learning_rate": 5.67817895918701e-05, + "loss": 0.9712, + "step": 33380 + }, + { + "epoch": 1.285274302213667, + "grad_norm": 1.3599872589111328, + "learning_rate": 5.675452190436248e-05, + "loss": 0.9119, + "step": 33385 + }, + { + "epoch": 1.2854667949951877, + "grad_norm": 0.9683537483215332, + "learning_rate": 5.672725817136744e-05, + "loss": 0.8115, + "step": 33390 + }, + { + "epoch": 1.2856592877767083, + "grad_norm": 1.1662638187408447, + "learning_rate": 5.669999839537794e-05, + "loss": 0.9159, + "step": 33395 + }, + { + "epoch": 1.285851780558229, + "grad_norm": 2.2945556640625, + "learning_rate": 5.667274257888675e-05, + "loss": 0.8131, + "step": 33400 + }, + { + "epoch": 1.2860442733397497, + "grad_norm": 0.9132623076438904, + "learning_rate": 5.664549072438624e-05, + "loss": 0.799, + "step": 33405 + }, + { + "epoch": 1.2862367661212706, + "grad_norm": 0.9698240756988525, + "learning_rate": 5.661824283436844e-05, + "loss": 0.9786, + "step": 33410 + }, + { + "epoch": 1.2864292589027913, + "grad_norm": 0.7265766263008118, + "learning_rate": 5.659099891132488e-05, + "loss": 0.7017, + "step": 33415 + }, + { + "epoch": 1.286621751684312, + "grad_norm": 0.8965588212013245, + "learning_rate": 5.656375895774699e-05, + "loss": 0.8478, + "step": 33420 + }, + { + "epoch": 1.2868142444658326, + "grad_norm": 1.5300185680389404, + "learning_rate": 5.653652297612556e-05, + "loss": 0.851, + "step": 33425 + }, + { + "epoch": 1.2870067372473533, + "grad_norm": 1.6665253639221191, + "learning_rate": 5.650929096895119e-05, + "loss": 0.8494, + "step": 33430 + }, + { + "epoch": 
1.287199230028874, + "grad_norm": 1.2362685203552246, + "learning_rate": 5.6482062938714095e-05, + "loss": 0.7511, + "step": 33435 + }, + { + "epoch": 1.2873917228103946, + "grad_norm": 1.0025197267532349, + "learning_rate": 5.645483888790404e-05, + "loss": 0.8445, + "step": 33440 + }, + { + "epoch": 1.2875842155919153, + "grad_norm": 3.0761525630950928, + "learning_rate": 5.6427618819010486e-05, + "loss": 0.9236, + "step": 33445 + }, + { + "epoch": 1.287776708373436, + "grad_norm": 1.8657230138778687, + "learning_rate": 5.640040273452256e-05, + "loss": 0.925, + "step": 33450 + }, + { + "epoch": 1.2879692011549566, + "grad_norm": 1.3410662412643433, + "learning_rate": 5.637319063692903e-05, + "loss": 0.7961, + "step": 33455 + }, + { + "epoch": 1.2881616939364773, + "grad_norm": 1.056356430053711, + "learning_rate": 5.6345982528718125e-05, + "loss": 0.8616, + "step": 33460 + }, + { + "epoch": 1.288354186717998, + "grad_norm": 2.1360607147216797, + "learning_rate": 5.6318778412378024e-05, + "loss": 0.8894, + "step": 33465 + }, + { + "epoch": 1.2885466794995188, + "grad_norm": 1.3970694541931152, + "learning_rate": 5.629157829039623e-05, + "loss": 1.0804, + "step": 33470 + }, + { + "epoch": 1.2887391722810395, + "grad_norm": 1.7947211265563965, + "learning_rate": 5.6264382165260065e-05, + "loss": 0.6816, + "step": 33475 + }, + { + "epoch": 1.2889316650625602, + "grad_norm": 1.3382033109664917, + "learning_rate": 5.6237190039456425e-05, + "loss": 0.9228, + "step": 33480 + }, + { + "epoch": 1.2891241578440809, + "grad_norm": 1.0116480588912964, + "learning_rate": 5.6210001915471896e-05, + "loss": 0.9483, + "step": 33485 + }, + { + "epoch": 1.2893166506256015, + "grad_norm": 1.6364978551864624, + "learning_rate": 5.618281779579253e-05, + "loss": 1.0702, + "step": 33490 + }, + { + "epoch": 1.2895091434071222, + "grad_norm": 1.1366705894470215, + "learning_rate": 5.6155637682904284e-05, + "loss": 0.8214, + "step": 33495 + }, + { + "epoch": 1.2897016361886429, + 
"grad_norm": 1.1931660175323486, + "learning_rate": 5.6128461579292456e-05, + "loss": 0.6575, + "step": 33500 + }, + { + "epoch": 1.2898941289701638, + "grad_norm": 1.7498856782913208, + "learning_rate": 5.610128948744229e-05, + "loss": 0.843, + "step": 33505 + }, + { + "epoch": 1.2900866217516844, + "grad_norm": 1.446738839149475, + "learning_rate": 5.6074121409838345e-05, + "loss": 0.7902, + "step": 33510 + }, + { + "epoch": 1.290279114533205, + "grad_norm": 1.2919175624847412, + "learning_rate": 5.6046957348965014e-05, + "loss": 0.8396, + "step": 33515 + }, + { + "epoch": 1.2904716073147258, + "grad_norm": 1.1369450092315674, + "learning_rate": 5.6019797307306264e-05, + "loss": 0.9185, + "step": 33520 + }, + { + "epoch": 1.2906641000962464, + "grad_norm": 1.1856272220611572, + "learning_rate": 5.599264128734576e-05, + "loss": 0.8774, + "step": 33525 + }, + { + "epoch": 1.290856592877767, + "grad_norm": 2.4234628677368164, + "learning_rate": 5.5965489291566645e-05, + "loss": 1.0206, + "step": 33530 + }, + { + "epoch": 1.2910490856592878, + "grad_norm": 1.700831651687622, + "learning_rate": 5.593834132245183e-05, + "loss": 0.8308, + "step": 33535 + }, + { + "epoch": 1.2912415784408084, + "grad_norm": 0.7165193557739258, + "learning_rate": 5.591119738248382e-05, + "loss": 0.7948, + "step": 33540 + }, + { + "epoch": 1.2914340712223291, + "grad_norm": 1.512609601020813, + "learning_rate": 5.5884057474144736e-05, + "loss": 0.8691, + "step": 33545 + }, + { + "epoch": 1.2916265640038498, + "grad_norm": 1.0967670679092407, + "learning_rate": 5.585692159991641e-05, + "loss": 0.7975, + "step": 33550 + }, + { + "epoch": 1.2918190567853705, + "grad_norm": 1.0149413347244263, + "learning_rate": 5.582978976228015e-05, + "loss": 0.6579, + "step": 33555 + }, + { + "epoch": 1.2920115495668911, + "grad_norm": 1.2751359939575195, + "learning_rate": 5.5802661963716994e-05, + "loss": 0.8688, + "step": 33560 + }, + { + "epoch": 1.2922040423484118, + "grad_norm": 0.7386071681976318, + 
"learning_rate": 5.5775538206707636e-05, + "loss": 0.819, + "step": 33565 + }, + { + "epoch": 1.2923965351299327, + "grad_norm": 1.0043171644210815, + "learning_rate": 5.574841849373238e-05, + "loss": 0.7907, + "step": 33570 + }, + { + "epoch": 1.2925890279114534, + "grad_norm": 0.9456561803817749, + "learning_rate": 5.5721302827271035e-05, + "loss": 1.0571, + "step": 33575 + }, + { + "epoch": 1.292781520692974, + "grad_norm": 2.149979591369629, + "learning_rate": 5.5694191209803313e-05, + "loss": 0.8367, + "step": 33580 + }, + { + "epoch": 1.2929740134744947, + "grad_norm": 1.2153743505477905, + "learning_rate": 5.566708364380826e-05, + "loss": 0.7986, + "step": 33585 + }, + { + "epoch": 1.2931665062560154, + "grad_norm": 1.090076208114624, + "learning_rate": 5.563998013176474e-05, + "loss": 0.7111, + "step": 33590 + }, + { + "epoch": 1.293358999037536, + "grad_norm": 1.1182845830917358, + "learning_rate": 5.5612880676151154e-05, + "loss": 0.6748, + "step": 33595 + }, + { + "epoch": 1.2935514918190567, + "grad_norm": 1.2427207231521606, + "learning_rate": 5.5585785279445654e-05, + "loss": 0.8749, + "step": 33600 + }, + { + "epoch": 1.2937439846005776, + "grad_norm": 1.2770304679870605, + "learning_rate": 5.555869394412578e-05, + "loss": 0.9485, + "step": 33605 + }, + { + "epoch": 1.2939364773820983, + "grad_norm": 1.2712188959121704, + "learning_rate": 5.5531606672669045e-05, + "loss": 0.7984, + "step": 33610 + }, + { + "epoch": 1.294128970163619, + "grad_norm": 1.0220839977264404, + "learning_rate": 5.550452346755225e-05, + "loss": 0.8363, + "step": 33615 + }, + { + "epoch": 1.2943214629451396, + "grad_norm": 2.0478289127349854, + "learning_rate": 5.547744433125204e-05, + "loss": 0.8417, + "step": 33620 + }, + { + "epoch": 1.2945139557266603, + "grad_norm": 0.9348976612091064, + "learning_rate": 5.5450369266244595e-05, + "loss": 0.8596, + "step": 33625 + }, + { + "epoch": 1.294706448508181, + "grad_norm": 1.4484078884124756, + "learning_rate": 
5.542329827500581e-05, + "loss": 0.8122, + "step": 33630 + }, + { + "epoch": 1.2948989412897016, + "grad_norm": 1.7183796167373657, + "learning_rate": 5.5396231360011074e-05, + "loss": 0.857, + "step": 33635 + }, + { + "epoch": 1.2950914340712223, + "grad_norm": 0.8953412175178528, + "learning_rate": 5.5369168523735505e-05, + "loss": 1.0084, + "step": 33640 + }, + { + "epoch": 1.295283926852743, + "grad_norm": 0.8453150987625122, + "learning_rate": 5.5342109768653815e-05, + "loss": 0.8156, + "step": 33645 + }, + { + "epoch": 1.2954764196342636, + "grad_norm": 1.296000599861145, + "learning_rate": 5.531505509724036e-05, + "loss": 0.808, + "step": 33650 + }, + { + "epoch": 1.2956689124157843, + "grad_norm": null, + "learning_rate": 5.529341430201327e-05, + "loss": 0.8415, + "step": 33655 + }, + { + "epoch": 1.295861405197305, + "grad_norm": 1.4675425291061401, + "learning_rate": 5.526636698743678e-05, + "loss": 0.8757, + "step": 33660 + }, + { + "epoch": 1.2960538979788259, + "grad_norm": 1.3060146570205688, + "learning_rate": 5.5239323763454696e-05, + "loss": 0.8328, + "step": 33665 + }, + { + "epoch": 1.2962463907603465, + "grad_norm": 1.034527063369751, + "learning_rate": 5.5212284632539976e-05, + "loss": 0.9086, + "step": 33670 + }, + { + "epoch": 1.2964388835418672, + "grad_norm": 1.3816114664077759, + "learning_rate": 5.518524959716507e-05, + "loss": 0.8794, + "step": 33675 + }, + { + "epoch": 1.2966313763233879, + "grad_norm": 1.4892889261245728, + "learning_rate": 5.515821865980228e-05, + "loss": 0.8859, + "step": 33680 + }, + { + "epoch": 1.2968238691049085, + "grad_norm": 1.8635764122009277, + "learning_rate": 5.513119182292332e-05, + "loss": 0.8079, + "step": 33685 + }, + { + "epoch": 1.2970163618864292, + "grad_norm": 1.203260898590088, + "learning_rate": 5.5104169088999644e-05, + "loss": 0.7971, + "step": 33690 + }, + { + "epoch": 1.2972088546679499, + "grad_norm": 1.3224972486495972, + "learning_rate": 5.507715046050228e-05, + "loss": 0.916, +
"step": 33695 + }, + { + "epoch": 1.2974013474494708, + "grad_norm": 1.774021029472351, + "learning_rate": 5.505013593990197e-05, + "loss": 0.9424, + "step": 33700 + }, + { + "epoch": 1.2975938402309914, + "grad_norm": 0.8528000712394714, + "learning_rate": 5.502312552966892e-05, + "loss": 0.7671, + "step": 33705 + }, + { + "epoch": 1.2977863330125121, + "grad_norm": 2.2666587829589844, + "learning_rate": 5.49961192322731e-05, + "loss": 0.8172, + "step": 33710 + }, + { + "epoch": 1.2979788257940328, + "grad_norm": 1.4332960844039917, + "learning_rate": 5.496911705018404e-05, + "loss": 0.9298, + "step": 33715 + }, + { + "epoch": 1.2981713185755535, + "grad_norm": 2.359644889831543, + "learning_rate": 5.494211898587094e-05, + "loss": 0.9154, + "step": 33720 + }, + { + "epoch": 1.2983638113570741, + "grad_norm": 1.5822657346725464, + "learning_rate": 5.491512504180261e-05, + "loss": 0.8183, + "step": 33725 + }, + { + "epoch": 1.2985563041385948, + "grad_norm": 1.935848593711853, + "learning_rate": 5.488813522044739e-05, + "loss": 0.813, + "step": 33730 + }, + { + "epoch": 1.2987487969201155, + "grad_norm": 2.000776529312134, + "learning_rate": 5.486114952427337e-05, + "loss": 0.7426, + "step": 33735 + }, + { + "epoch": 1.2989412897016361, + "grad_norm": 1.2055097818374634, + "learning_rate": 5.4834167955748204e-05, + "loss": 0.82, + "step": 33740 + }, + { + "epoch": 1.2991337824831568, + "grad_norm": 1.9020949602127075, + "learning_rate": 5.480719051733918e-05, + "loss": 0.788, + "step": 33745 + }, + { + "epoch": 1.2993262752646775, + "grad_norm": 1.5985333919525146, + "learning_rate": 5.478021721151323e-05, + "loss": 0.7221, + "step": 33750 + }, + { + "epoch": 1.2995187680461981, + "grad_norm": 1.5660638809204102, + "learning_rate": 5.475324804073687e-05, + "loss": 0.8852, + "step": 33755 + }, + { + "epoch": 1.299711260827719, + "grad_norm": 1.3215242624282837, + "learning_rate": 5.4726283007476196e-05, + "loss": 0.7963, + "step": 33760 + }, + { + "epoch": 
1.2999037536092397, + "grad_norm": 1.6176220178604126, + "learning_rate": 5.4699322114197084e-05, + "loss": 0.8704, + "step": 33765 + }, + { + "epoch": 1.3000962463907604, + "grad_norm": 1.5978866815567017, + "learning_rate": 5.4672365363364855e-05, + "loss": 0.8481, + "step": 33770 + }, + { + "epoch": 1.300288739172281, + "grad_norm": 0.9393962621688843, + "learning_rate": 5.4645412757444525e-05, + "loss": 0.7539, + "step": 33775 + }, + { + "epoch": 1.3004812319538017, + "grad_norm": 1.5960298776626587, + "learning_rate": 5.461846429890077e-05, + "loss": 0.8225, + "step": 33780 + }, + { + "epoch": 1.3006737247353224, + "grad_norm": 1.2632452249526978, + "learning_rate": 5.459151999019787e-05, + "loss": 0.8851, + "step": 33785 + }, + { + "epoch": 1.300866217516843, + "grad_norm": 0.8271371722221375, + "learning_rate": 5.456457983379957e-05, + "loss": 0.8399, + "step": 33790 + }, + { + "epoch": 1.3010587102983637, + "grad_norm": 1.2971397638320923, + "learning_rate": 5.453764383216955e-05, + "loss": 0.7467, + "step": 33795 + }, + { + "epoch": 1.3012512030798846, + "grad_norm": 1.0965392589569092, + "learning_rate": 5.45107119877708e-05, + "loss": 0.7249, + "step": 33800 + }, + { + "epoch": 1.3014436958614053, + "grad_norm": 1.258906364440918, + "learning_rate": 5.4483784303066096e-05, + "loss": 0.8473, + "step": 33805 + }, + { + "epoch": 1.301636188642926, + "grad_norm": 1.6706708669662476, + "learning_rate": 5.44568607805178e-05, + "loss": 0.8842, + "step": 33810 + }, + { + "epoch": 1.3018286814244466, + "grad_norm": 1.3727566003799438, + "learning_rate": 5.442994142258794e-05, + "loss": 0.8538, + "step": 33815 + }, + { + "epoch": 1.3020211742059673, + "grad_norm": 0.9679449796676636, + "learning_rate": 5.440302623173801e-05, + "loss": 0.739, + "step": 33820 + }, + { + "epoch": 1.302213666987488, + "grad_norm": 1.0687127113342285, + "learning_rate": 5.437611521042929e-05, + "loss": 0.665, + "step": 33825 + }, + { + "epoch": 1.3024061597690086, + "grad_norm": 
1.3849055767059326, + "learning_rate": 5.4349208361122604e-05, + "loss": 0.9492, + "step": 33830 + }, + { + "epoch": 1.3025986525505293, + "grad_norm": 2.082472562789917, + "learning_rate": 5.4322305686278386e-05, + "loss": 0.8852, + "step": 33835 + }, + { + "epoch": 1.30279114533205, + "grad_norm": 1.0276436805725098, + "learning_rate": 5.4295407188356784e-05, + "loss": 0.7791, + "step": 33840 + }, + { + "epoch": 1.3029836381135707, + "grad_norm": 1.8117910623550415, + "learning_rate": 5.426851286981738e-05, + "loss": 1.0149, + "step": 33845 + }, + { + "epoch": 1.3031761308950913, + "grad_norm": 1.362642526626587, + "learning_rate": 5.4241622733119545e-05, + "loss": 0.8697, + "step": 33850 + }, + { + "epoch": 1.303368623676612, + "grad_norm": 1.1220524311065674, + "learning_rate": 5.421473678072217e-05, + "loss": 0.7154, + "step": 33855 + }, + { + "epoch": 1.3035611164581329, + "grad_norm": 1.0112050771713257, + "learning_rate": 5.4187855015083875e-05, + "loss": 0.7251, + "step": 33860 + }, + { + "epoch": 1.3037536092396536, + "grad_norm": 1.8056666851043701, + "learning_rate": 5.4160977438662665e-05, + "loss": 1.0671, + "step": 33865 + }, + { + "epoch": 1.3039461020211742, + "grad_norm": 1.237334966659546, + "learning_rate": 5.41341040539165e-05, + "loss": 0.7798, + "step": 33870 + }, + { + "epoch": 1.304138594802695, + "grad_norm": 1.337863802909851, + "learning_rate": 5.410723486330265e-05, + "loss": 0.8027, + "step": 33875 + }, + { + "epoch": 1.3043310875842156, + "grad_norm": 1.7887320518493652, + "learning_rate": 5.408036986927816e-05, + "loss": 0.8194, + "step": 33880 + }, + { + "epoch": 1.3045235803657362, + "grad_norm": 1.657067894935608, + "learning_rate": 5.405350907429965e-05, + "loss": 0.9871, + "step": 33885 + }, + { + "epoch": 1.304716073147257, + "grad_norm": 1.5351759195327759, + "learning_rate": 5.40266524808234e-05, + "loss": 0.9261, + "step": 33890 + }, + { + "epoch": 1.3049085659287778, + "grad_norm": 1.6100406646728516, + "learning_rate": 
5.399980009130516e-05, + "loss": 0.8397, + "step": 33895 + }, + { + "epoch": 1.3051010587102985, + "grad_norm": 1.196283221244812, + "learning_rate": 5.397295190820058e-05, + "loss": 0.832, + "step": 33900 + }, + { + "epoch": 1.3052935514918191, + "grad_norm": 1.709205150604248, + "learning_rate": 5.3946107933964576e-05, + "loss": 0.9753, + "step": 33905 + }, + { + "epoch": 1.3054860442733398, + "grad_norm": 1.0625784397125244, + "learning_rate": 5.391926817105194e-05, + "loss": 0.7821, + "step": 33910 + }, + { + "epoch": 1.3056785370548605, + "grad_norm": 1.3273873329162598, + "learning_rate": 5.389243262191697e-05, + "loss": 0.9206, + "step": 33915 + }, + { + "epoch": 1.3058710298363811, + "grad_norm": 2.023766040802002, + "learning_rate": 5.3865601289013636e-05, + "loss": 0.8539, + "step": 33920 + }, + { + "epoch": 1.3060635226179018, + "grad_norm": 1.1720694303512573, + "learning_rate": 5.383877417479541e-05, + "loss": 0.8685, + "step": 33925 + }, + { + "epoch": 1.3062560153994225, + "grad_norm": 1.6969107389450073, + "learning_rate": 5.38119512817155e-05, + "loss": 0.8275, + "step": 33930 + }, + { + "epoch": 1.3064485081809432, + "grad_norm": 1.802259922027588, + "learning_rate": 5.378513261222669e-05, + "loss": 0.8782, + "step": 33935 + }, + { + "epoch": 1.3066410009624638, + "grad_norm": 0.5480902791023254, + "learning_rate": 5.375831816878134e-05, + "loss": 0.8165, + "step": 33940 + }, + { + "epoch": 1.3068334937439845, + "grad_norm": 2.5565297603607178, + "learning_rate": 5.3731507953831514e-05, + "loss": 0.8427, + "step": 33945 + }, + { + "epoch": 1.3070259865255052, + "grad_norm": 1.1750694513320923, + "learning_rate": 5.3704701969828754e-05, + "loss": 0.9518, + "step": 33950 + }, + { + "epoch": 1.307218479307026, + "grad_norm": 0.9000421762466431, + "learning_rate": 5.367790021922431e-05, + "loss": 0.7622, + "step": 33955 + }, + { + "epoch": 1.3074109720885467, + "grad_norm": 1.3734017610549927, + "learning_rate": 5.365110270446904e-05, + "loss": 0.835, 
+ "step": 33960 + }, + { + "epoch": 1.3076034648700674, + "grad_norm": 1.1932064294815063, + "learning_rate": 5.362430942801343e-05, + "loss": 0.8661, + "step": 33965 + }, + { + "epoch": 1.307795957651588, + "grad_norm": 1.3349897861480713, + "learning_rate": 5.3597520392307434e-05, + "loss": 0.7571, + "step": 33970 + }, + { + "epoch": 1.3079884504331087, + "grad_norm": 1.5428650379180908, + "learning_rate": 5.357073559980089e-05, + "loss": 0.7853, + "step": 33975 + }, + { + "epoch": 1.3081809432146294, + "grad_norm": 1.3346575498580933, + "learning_rate": 5.354395505294297e-05, + "loss": 0.7985, + "step": 33980 + }, + { + "epoch": 1.30837343599615, + "grad_norm": 1.840759038925171, + "learning_rate": 5.351717875418263e-05, + "loss": 0.9832, + "step": 33985 + }, + { + "epoch": 1.308565928777671, + "grad_norm": 1.7884091138839722, + "learning_rate": 5.349040670596835e-05, + "loss": 0.8081, + "step": 33990 + }, + { + "epoch": 1.3087584215591916, + "grad_norm": 1.082604169845581, + "learning_rate": 5.346363891074833e-05, + "loss": 0.7615, + "step": 33995 + }, + { + "epoch": 1.3089509143407123, + "grad_norm": 1.071516990661621, + "learning_rate": 5.3436875370970176e-05, + "loss": 0.9025, + "step": 34000 + }, + { + "epoch": 1.309143407122233, + "grad_norm": 0.9769271612167358, + "learning_rate": 5.3410116089081394e-05, + "loss": 0.7979, + "step": 34005 + }, + { + "epoch": 1.3093358999037537, + "grad_norm": 1.50300133228302, + "learning_rate": 5.3383361067528795e-05, + "loss": 0.9068, + "step": 34010 + }, + { + "epoch": 1.3095283926852743, + "grad_norm": 1.6212974786758423, + "learning_rate": 5.335661030875909e-05, + "loss": 0.8189, + "step": 34015 + }, + { + "epoch": 1.309720885466795, + "grad_norm": 1.2834120988845825, + "learning_rate": 5.3329863815218354e-05, + "loss": 0.9268, + "step": 34020 + }, + { + "epoch": 1.3099133782483157, + "grad_norm": 1.2864872217178345, + "learning_rate": 5.33031215893524e-05, + "loss": 0.8804, + "step": 34025 + }, + { + "epoch": 
1.3101058710298363, + "grad_norm": 1.1591218709945679, + "learning_rate": 5.3276383633606656e-05, + "loss": 0.9498, + "step": 34030 + }, + { + "epoch": 1.310298363811357, + "grad_norm": 1.4199464321136475, + "learning_rate": 5.324964995042614e-05, + "loss": 0.7704, + "step": 34035 + }, + { + "epoch": 1.3104908565928777, + "grad_norm": 1.1240346431732178, + "learning_rate": 5.322292054225539e-05, + "loss": 0.846, + "step": 34040 + }, + { + "epoch": 1.3106833493743983, + "grad_norm": 1.1645770072937012, + "learning_rate": 5.3196195411538706e-05, + "loss": 0.8291, + "step": 34045 + }, + { + "epoch": 1.3108758421559192, + "grad_norm": 1.1007463932037354, + "learning_rate": 5.316947456071994e-05, + "loss": 0.7795, + "step": 34050 + }, + { + "epoch": 1.31106833493744, + "grad_norm": 1.4447979927062988, + "learning_rate": 5.314275799224243e-05, + "loss": 0.8513, + "step": 34055 + }, + { + "epoch": 1.3112608277189606, + "grad_norm": 1.0073038339614868, + "learning_rate": 5.311604570854938e-05, + "loss": 0.7007, + "step": 34060 + }, + { + "epoch": 1.3114533205004812, + "grad_norm": 1.1897684335708618, + "learning_rate": 5.308933771208332e-05, + "loss": 0.8345, + "step": 34065 + }, + { + "epoch": 1.311645813282002, + "grad_norm": 0.9960253834724426, + "learning_rate": 5.3062634005286586e-05, + "loss": 0.7782, + "step": 34070 + }, + { + "epoch": 1.3118383060635226, + "grad_norm": 1.1287857294082642, + "learning_rate": 5.303593459060103e-05, + "loss": 0.8248, + "step": 34075 + }, + { + "epoch": 1.3120307988450433, + "grad_norm": 1.133621096611023, + "learning_rate": 5.300923947046819e-05, + "loss": 0.7083, + "step": 34080 + }, + { + "epoch": 1.312223291626564, + "grad_norm": 0.9852873086929321, + "learning_rate": 5.2982548647329034e-05, + "loss": 0.8222, + "step": 34085 + }, + { + "epoch": 1.3124157844080848, + "grad_norm": 1.1880803108215332, + "learning_rate": 5.295586212362442e-05, + "loss": 1.0247, + "step": 34090 + }, + { + "epoch": 1.3126082771896055, + "grad_norm": 
2.150893449783325, + "learning_rate": 5.292917990179453e-05, + "loss": 0.8991, + "step": 34095 + }, + { + "epoch": 1.3128007699711262, + "grad_norm": 1.5206340551376343, + "learning_rate": 5.290250198427934e-05, + "loss": 0.8353, + "step": 34100 + }, + { + "epoch": 1.3129932627526468, + "grad_norm": 1.733988642692566, + "learning_rate": 5.2875828373518344e-05, + "loss": 0.8553, + "step": 34105 + }, + { + "epoch": 1.3131857555341675, + "grad_norm": 2.8545048236846924, + "learning_rate": 5.2849159071950716e-05, + "loss": 0.835, + "step": 34110 + }, + { + "epoch": 1.3133782483156882, + "grad_norm": 1.2156729698181152, + "learning_rate": 5.282249408201505e-05, + "loss": 0.8605, + "step": 34115 + }, + { + "epoch": 1.3135707410972088, + "grad_norm": 1.5454976558685303, + "learning_rate": 5.2795833406149876e-05, + "loss": 0.7758, + "step": 34120 + }, + { + "epoch": 1.3137632338787295, + "grad_norm": 1.7499946355819702, + "learning_rate": 5.276917704679299e-05, + "loss": 0.8126, + "step": 34125 + }, + { + "epoch": 1.3139557266602502, + "grad_norm": 1.7527188062667847, + "learning_rate": 5.2742525006381994e-05, + "loss": 0.8095, + "step": 34130 + }, + { + "epoch": 1.3141482194417708, + "grad_norm": 1.1449981927871704, + "learning_rate": 5.271587728735402e-05, + "loss": 0.9137, + "step": 34135 + }, + { + "epoch": 1.3143407122232915, + "grad_norm": 1.3999851942062378, + "learning_rate": 5.268923389214588e-05, + "loss": 0.8175, + "step": 34140 + }, + { + "epoch": 1.3145332050048122, + "grad_norm": 1.119730830192566, + "learning_rate": 5.2662594823193865e-05, + "loss": 0.9228, + "step": 34145 + }, + { + "epoch": 1.314725697786333, + "grad_norm": 0.8965536952018738, + "learning_rate": 5.263596008293398e-05, + "loss": 0.8623, + "step": 34150 + }, + { + "epoch": 1.3149181905678538, + "grad_norm": 1.6485599279403687, + "learning_rate": 5.260932967380178e-05, + "loss": 0.7832, + "step": 34155 + }, + { + "epoch": 1.3151106833493744, + "grad_norm": 1.0954639911651611, + 
"learning_rate": 5.2582703598232444e-05, + "loss": 0.7878, + "step": 34160 + }, + { + "epoch": 1.315303176130895, + "grad_norm": 1.3316006660461426, + "learning_rate": 5.255608185866079e-05, + "loss": 0.8128, + "step": 34165 + }, + { + "epoch": 1.3154956689124158, + "grad_norm": 0.8699563145637512, + "learning_rate": 5.252946445752113e-05, + "loss": 0.8295, + "step": 34170 + }, + { + "epoch": 1.3156881616939364, + "grad_norm": 1.7148654460906982, + "learning_rate": 5.2502851397247476e-05, + "loss": 0.9354, + "step": 34175 + }, + { + "epoch": 1.315880654475457, + "grad_norm": 1.602002501487732, + "learning_rate": 5.247624268027342e-05, + "loss": 0.6457, + "step": 34180 + }, + { + "epoch": 1.316073147256978, + "grad_norm": 0.8863468170166016, + "learning_rate": 5.24496383090322e-05, + "loss": 0.8983, + "step": 34185 + }, + { + "epoch": 1.3162656400384987, + "grad_norm": 1.1766276359558105, + "learning_rate": 5.242303828595649e-05, + "loss": 0.7426, + "step": 34190 + }, + { + "epoch": 1.3164581328200193, + "grad_norm": 1.0617400407791138, + "learning_rate": 5.2396442613478825e-05, + "loss": 0.8074, + "step": 34195 + }, + { + "epoch": 1.31665062560154, + "grad_norm": 1.5441138744354248, + "learning_rate": 5.236985129403112e-05, + "loss": 0.7365, + "step": 34200 + }, + { + "epoch": 1.3168431183830607, + "grad_norm": 1.1525336503982544, + "learning_rate": 5.234326433004497e-05, + "loss": 0.8009, + "step": 34205 + }, + { + "epoch": 1.3170356111645813, + "grad_norm": 1.6178624629974365, + "learning_rate": 5.231668172395161e-05, + "loss": 0.8423, + "step": 34210 + }, + { + "epoch": 1.317228103946102, + "grad_norm": 0.7218144536018372, + "learning_rate": 5.229010347818187e-05, + "loss": 0.5935, + "step": 34215 + }, + { + "epoch": 1.3174205967276227, + "grad_norm": 1.203621506690979, + "learning_rate": 5.226352959516605e-05, + "loss": 0.812, + "step": 34220 + }, + { + "epoch": 1.3176130895091434, + "grad_norm": 1.475656509399414, + "learning_rate": 5.2236960077334296e-05, + 
"loss": 1.0383, + "step": 34225 + }, + { + "epoch": 1.317805582290664, + "grad_norm": 1.2111153602600098, + "learning_rate": 5.2210394927116105e-05, + "loss": 0.8406, + "step": 34230 + }, + { + "epoch": 1.3179980750721847, + "grad_norm": 2.3121533393859863, + "learning_rate": 5.21838341469407e-05, + "loss": 0.8353, + "step": 34235 + }, + { + "epoch": 1.3181905678537054, + "grad_norm": 1.2453131675720215, + "learning_rate": 5.215727773923693e-05, + "loss": 0.8723, + "step": 34240 + }, + { + "epoch": 1.3183830606352263, + "grad_norm": 1.351366639137268, + "learning_rate": 5.2130725706433224e-05, + "loss": 0.905, + "step": 34245 + }, + { + "epoch": 1.318575553416747, + "grad_norm": 1.8964182138442993, + "learning_rate": 5.21041780509575e-05, + "loss": 0.9963, + "step": 34250 + }, + { + "epoch": 1.3187680461982676, + "grad_norm": 1.1896933317184448, + "learning_rate": 5.2077634775237415e-05, + "loss": 0.8957, + "step": 34255 + }, + { + "epoch": 1.3189605389797883, + "grad_norm": 2.013298511505127, + "learning_rate": 5.205109588170016e-05, + "loss": 0.8777, + "step": 34260 + }, + { + "epoch": 1.319153031761309, + "grad_norm": 1.3011314868927002, + "learning_rate": 5.202456137277256e-05, + "loss": 1.0396, + "step": 34265 + }, + { + "epoch": 1.3193455245428296, + "grad_norm": 1.5230119228363037, + "learning_rate": 5.199803125088107e-05, + "loss": 1.0502, + "step": 34270 + }, + { + "epoch": 1.3195380173243503, + "grad_norm": 1.011749505996704, + "learning_rate": 5.197150551845155e-05, + "loss": 0.7362, + "step": 34275 + }, + { + "epoch": 1.3197305101058712, + "grad_norm": 1.0205798149108887, + "learning_rate": 5.1944984177909765e-05, + "loss": 0.8891, + "step": 34280 + }, + { + "epoch": 1.3199230028873918, + "grad_norm": 1.5326029062271118, + "learning_rate": 5.1918467231680815e-05, + "loss": 0.7624, + "step": 34285 + }, + { + "epoch": 1.3201154956689125, + "grad_norm": 1.5170354843139648, + "learning_rate": 5.1891954682189505e-05, + "loss": 0.8149, + "step": 34290 + }, + 
{ + "epoch": 1.3203079884504332, + "grad_norm": 0.9941301345825195, + "learning_rate": 5.186544653186026e-05, + "loss": 0.9011, + "step": 34295 + }, + { + "epoch": 1.3205004812319538, + "grad_norm": 3.1168370246887207, + "learning_rate": 5.183894278311712e-05, + "loss": 0.9253, + "step": 34300 + }, + { + "epoch": 1.3206929740134745, + "grad_norm": 0.9092430472373962, + "learning_rate": 5.181244343838353e-05, + "loss": 0.8621, + "step": 34305 + }, + { + "epoch": 1.3208854667949952, + "grad_norm": 0.8857255578041077, + "learning_rate": 5.178594850008286e-05, + "loss": 0.7869, + "step": 34310 + }, + { + "epoch": 1.3210779595765159, + "grad_norm": 0.8203316330909729, + "learning_rate": 5.175945797063777e-05, + "loss": 0.9274, + "step": 34315 + }, + { + "epoch": 1.3212704523580365, + "grad_norm": 1.180347204208374, + "learning_rate": 5.173297185247068e-05, + "loss": 0.7436, + "step": 34320 + }, + { + "epoch": 1.3214629451395572, + "grad_norm": 2.1619889736175537, + "learning_rate": 5.170649014800358e-05, + "loss": 0.7501, + "step": 34325 + }, + { + "epoch": 1.3216554379210779, + "grad_norm": 1.5437921285629272, + "learning_rate": 5.168001285965808e-05, + "loss": 0.8124, + "step": 34330 + }, + { + "epoch": 1.3218479307025985, + "grad_norm": 1.1066104173660278, + "learning_rate": 5.165353998985523e-05, + "loss": 0.8066, + "step": 34335 + }, + { + "epoch": 1.3220404234841192, + "grad_norm": 0.8616192936897278, + "learning_rate": 5.162707154101597e-05, + "loss": 0.7576, + "step": 34340 + }, + { + "epoch": 1.32223291626564, + "grad_norm": 0.889975368976593, + "learning_rate": 5.160060751556053e-05, + "loss": 0.7436, + "step": 34345 + }, + { + "epoch": 1.3224254090471608, + "grad_norm": 2.0296168327331543, + "learning_rate": 5.157414791590891e-05, + "loss": 0.6742, + "step": 34350 + }, + { + "epoch": 1.3226179018286814, + "grad_norm": 1.0081722736358643, + "learning_rate": 5.154769274448068e-05, + "loss": 0.7889, + "step": 34355 + }, + { + "epoch": 1.3228103946102021, + 
"grad_norm": 1.0939741134643555, + "learning_rate": 5.152124200369503e-05, + "loss": 0.8367, + "step": 34360 + }, + { + "epoch": 1.3230028873917228, + "grad_norm": 1.5666990280151367, + "learning_rate": 5.14947956959706e-05, + "loss": 0.8282, + "step": 34365 + }, + { + "epoch": 1.3231953801732435, + "grad_norm": 1.164833903312683, + "learning_rate": 5.146835382372579e-05, + "loss": 0.8148, + "step": 34370 + }, + { + "epoch": 1.3233878729547641, + "grad_norm": 1.489606499671936, + "learning_rate": 5.144191638937854e-05, + "loss": 0.8357, + "step": 34375 + }, + { + "epoch": 1.323580365736285, + "grad_norm": 1.4213351011276245, + "learning_rate": 5.1415483395346356e-05, + "loss": 0.9297, + "step": 34380 + }, + { + "epoch": 1.3237728585178057, + "grad_norm": 1.414014220237732, + "learning_rate": 5.138905484404641e-05, + "loss": 0.9808, + "step": 34385 + }, + { + "epoch": 1.3239653512993264, + "grad_norm": 1.255434274673462, + "learning_rate": 5.136263073789536e-05, + "loss": 0.8107, + "step": 34390 + }, + { + "epoch": 1.324157844080847, + "grad_norm": 1.3263583183288574, + "learning_rate": 5.133621107930951e-05, + "loss": 0.837, + "step": 34395 + }, + { + "epoch": 1.3243503368623677, + "grad_norm": 1.525387167930603, + "learning_rate": 5.1309795870704815e-05, + "loss": 0.8036, + "step": 34400 + }, + { + "epoch": 1.3245428296438884, + "grad_norm": 0.9029211401939392, + "learning_rate": 5.128338511449676e-05, + "loss": 0.9465, + "step": 34405 + }, + { + "epoch": 1.324735322425409, + "grad_norm": 1.5629209280014038, + "learning_rate": 5.1256978813100354e-05, + "loss": 0.8984, + "step": 34410 + }, + { + "epoch": 1.3249278152069297, + "grad_norm": 1.4381479024887085, + "learning_rate": 5.123057696893042e-05, + "loss": 0.7762, + "step": 34415 + }, + { + "epoch": 1.3251203079884504, + "grad_norm": 1.8477442264556885, + "learning_rate": 5.1204179584401115e-05, + "loss": 0.8943, + "step": 34420 + }, + { + "epoch": 1.325312800769971, + "grad_norm": 1.091605305671692, + 
"learning_rate": 5.117778666192634e-05, + "loss": 0.7989, + "step": 34425 + }, + { + "epoch": 1.3255052935514917, + "grad_norm": 1.6211384534835815, + "learning_rate": 5.1151398203919564e-05, + "loss": 0.7188, + "step": 34430 + }, + { + "epoch": 1.3256977863330124, + "grad_norm": 1.434945821762085, + "learning_rate": 5.1125014212793854e-05, + "loss": 0.8346, + "step": 34435 + }, + { + "epoch": 1.3258902791145333, + "grad_norm": 1.272206425666809, + "learning_rate": 5.1098634690961765e-05, + "loss": 0.8576, + "step": 34440 + }, + { + "epoch": 1.326082771896054, + "grad_norm": 0.9219827055931091, + "learning_rate": 5.107225964083566e-05, + "loss": 0.7725, + "step": 34445 + }, + { + "epoch": 1.3262752646775746, + "grad_norm": 1.1530919075012207, + "learning_rate": 5.1045889064827255e-05, + "loss": 0.9481, + "step": 34450 + }, + { + "epoch": 1.3264677574590953, + "grad_norm": 0.9978432655334473, + "learning_rate": 5.101952296534802e-05, + "loss": 0.8516, + "step": 34455 + }, + { + "epoch": 1.326660250240616, + "grad_norm": 1.018656611442566, + "learning_rate": 5.0993161344808924e-05, + "loss": 0.9756, + "step": 34460 + }, + { + "epoch": 1.3268527430221366, + "grad_norm": 1.7046258449554443, + "learning_rate": 5.0966804205620635e-05, + "loss": 1.018, + "step": 34465 + }, + { + "epoch": 1.3270452358036573, + "grad_norm": 1.1494272947311401, + "learning_rate": 5.094045155019325e-05, + "loss": 0.9536, + "step": 34470 + }, + { + "epoch": 1.3272377285851782, + "grad_norm": 2.197441339492798, + "learning_rate": 5.0914103380936564e-05, + "loss": 0.7673, + "step": 34475 + }, + { + "epoch": 1.3274302213666989, + "grad_norm": 1.758847713470459, + "learning_rate": 5.0887759700259965e-05, + "loss": 0.8132, + "step": 34480 + }, + { + "epoch": 1.3276227141482195, + "grad_norm": 1.2169286012649536, + "learning_rate": 5.086142051057241e-05, + "loss": 0.7716, + "step": 34485 + }, + { + "epoch": 1.3278152069297402, + "grad_norm": 1.7917555570602417, + "learning_rate": 
5.083508581428247e-05, + "loss": 0.9653, + "step": 34490 + }, + { + "epoch": 1.3280076997112609, + "grad_norm": 1.4146066904067993, + "learning_rate": 5.080875561379821e-05, + "loss": 0.8737, + "step": 34495 + }, + { + "epoch": 1.3282001924927815, + "grad_norm": 1.2591365575790405, + "learning_rate": 5.0782429911527374e-05, + "loss": 0.7709, + "step": 34500 + }, + { + "epoch": 1.3283926852743022, + "grad_norm": 1.0664514303207397, + "learning_rate": 5.07561087098773e-05, + "loss": 0.8534, + "step": 34505 + }, + { + "epoch": 1.3285851780558229, + "grad_norm": 2.1911308765411377, + "learning_rate": 5.072979201125491e-05, + "loss": 0.944, + "step": 34510 + }, + { + "epoch": 1.3287776708373435, + "grad_norm": 0.8972740769386292, + "learning_rate": 5.070347981806657e-05, + "loss": 0.7574, + "step": 34515 + }, + { + "epoch": 1.3289701636188642, + "grad_norm": 1.0241605043411255, + "learning_rate": 5.067717213271852e-05, + "loss": 0.9743, + "step": 34520 + }, + { + "epoch": 1.329162656400385, + "grad_norm": 1.0732407569885254, + "learning_rate": 5.065086895761628e-05, + "loss": 0.7564, + "step": 34525 + }, + { + "epoch": 1.3293551491819056, + "grad_norm": 1.0235189199447632, + "learning_rate": 5.062457029516523e-05, + "loss": 0.9152, + "step": 34530 + }, + { + "epoch": 1.3295476419634265, + "grad_norm": 2.0122148990631104, + "learning_rate": 5.059827614777011e-05, + "loss": 1.0821, + "step": 34535 + }, + { + "epoch": 1.3297401347449471, + "grad_norm": 0.8569071292877197, + "learning_rate": 5.057198651783538e-05, + "loss": 0.7275, + "step": 34540 + }, + { + "epoch": 1.3299326275264678, + "grad_norm": 1.707134485244751, + "learning_rate": 5.0545701407765045e-05, + "loss": 0.9075, + "step": 34545 + }, + { + "epoch": 1.3301251203079885, + "grad_norm": 1.5125768184661865, + "learning_rate": 5.051942081996276e-05, + "loss": 0.6996, + "step": 34550 + }, + { + "epoch": 1.3303176130895091, + "grad_norm": 0.8091332316398621, + "learning_rate": 5.049314475683158e-05, + "loss": 
0.6558, + "step": 34555 + }, + { + "epoch": 1.3305101058710298, + "grad_norm": 1.32486093044281, + "learning_rate": 5.046687322077444e-05, + "loss": 0.8852, + "step": 34560 + }, + { + "epoch": 1.3307025986525505, + "grad_norm": 1.4415032863616943, + "learning_rate": 5.0440606214193574e-05, + "loss": 0.8404, + "step": 34565 + }, + { + "epoch": 1.3308950914340711, + "grad_norm": 1.403547763824463, + "learning_rate": 5.0414343739490975e-05, + "loss": 0.7527, + "step": 34570 + }, + { + "epoch": 1.331087584215592, + "grad_norm": 2.1569442749023438, + "learning_rate": 5.038808579906816e-05, + "loss": 0.7985, + "step": 34575 + }, + { + "epoch": 1.3312800769971127, + "grad_norm": 1.6453166007995605, + "learning_rate": 5.036183239532629e-05, + "loss": 1.0465, + "step": 34580 + }, + { + "epoch": 1.3314725697786334, + "grad_norm": 1.2388137578964233, + "learning_rate": 5.0335583530665985e-05, + "loss": 0.8442, + "step": 34585 + }, + { + "epoch": 1.331665062560154, + "grad_norm": 1.0725327730178833, + "learning_rate": 5.0309339207487574e-05, + "loss": 0.7288, + "step": 34590 + }, + { + "epoch": 1.3318575553416747, + "grad_norm": 2.725146532058716, + "learning_rate": 5.028309942819091e-05, + "loss": 0.89, + "step": 34595 + }, + { + "epoch": 1.3320500481231954, + "grad_norm": 0.9890113472938538, + "learning_rate": 5.025686419517548e-05, + "loss": 0.7072, + "step": 34600 + }, + { + "epoch": 1.332242540904716, + "grad_norm": 1.8415369987487793, + "learning_rate": 5.023063351084033e-05, + "loss": 0.8123, + "step": 34605 + }, + { + "epoch": 1.3324350336862367, + "grad_norm": 0.9944572448730469, + "learning_rate": 5.020440737758401e-05, + "loss": 0.9136, + "step": 34610 + }, + { + "epoch": 1.3326275264677574, + "grad_norm": 1.131773591041565, + "learning_rate": 5.017818579780478e-05, + "loss": 0.8766, + "step": 34615 + }, + { + "epoch": 1.332820019249278, + "grad_norm": 1.1717694997787476, + "learning_rate": 5.01519687739004e-05, + "loss": 0.8695, + "step": 34620 + }, + { + "epoch": 
1.3330125120307987, + "grad_norm": 2.2559797763824463, + "learning_rate": 5.0125756308268324e-05, + "loss": 0.7504, + "step": 34625 + }, + { + "epoch": 1.3332050048123194, + "grad_norm": 1.2070955038070679, + "learning_rate": 5.0099548403305354e-05, + "loss": 0.7427, + "step": 34630 + }, + { + "epoch": 1.3333974975938403, + "grad_norm": 1.3688490390777588, + "learning_rate": 5.0073345061408205e-05, + "loss": 0.8539, + "step": 34635 + }, + { + "epoch": 1.333589990375361, + "grad_norm": 0.8127152919769287, + "learning_rate": 5.004714628497288e-05, + "loss": 0.8662, + "step": 34640 + }, + { + "epoch": 1.3337824831568816, + "grad_norm": 1.9359740018844604, + "learning_rate": 5.0020952076395124e-05, + "loss": 0.9428, + "step": 34645 + }, + { + "epoch": 1.3339749759384023, + "grad_norm": 2.281846523284912, + "learning_rate": 4.999476243807021e-05, + "loss": 0.9462, + "step": 34650 + }, + { + "epoch": 1.334167468719923, + "grad_norm": 0.9672248959541321, + "learning_rate": 4.996857737239305e-05, + "loss": 0.6946, + "step": 34655 + }, + { + "epoch": 1.3343599615014436, + "grad_norm": 1.2799361944198608, + "learning_rate": 4.994239688175799e-05, + "loss": 0.8645, + "step": 34660 + }, + { + "epoch": 1.3345524542829643, + "grad_norm": 1.4629347324371338, + "learning_rate": 4.991622096855923e-05, + "loss": 0.7476, + "step": 34665 + }, + { + "epoch": 1.3347449470644852, + "grad_norm": 1.2526726722717285, + "learning_rate": 4.9890049635190216e-05, + "loss": 0.6983, + "step": 34670 + }, + { + "epoch": 1.3349374398460059, + "grad_norm": 1.9672549962997437, + "learning_rate": 4.9863882884044234e-05, + "loss": 0.9105, + "step": 34675 + }, + { + "epoch": 1.3351299326275265, + "grad_norm": 1.3764963150024414, + "learning_rate": 4.983772071751405e-05, + "loss": 0.8268, + "step": 34680 + }, + { + "epoch": 1.3353224254090472, + "grad_norm": 1.1114428043365479, + "learning_rate": 4.9811563137992036e-05, + "loss": 0.6683, + "step": 34685 + }, + { + "epoch": 1.335514918190568, + 
"grad_norm": 1.1706123352050781, + "learning_rate": 4.978541014787006e-05, + "loss": 0.8398, + "step": 34690 + }, + { + "epoch": 1.3357074109720886, + "grad_norm": 1.0030032396316528, + "learning_rate": 4.9759261749539695e-05, + "loss": 0.7736, + "step": 34695 + }, + { + "epoch": 1.3358999037536092, + "grad_norm": 2.052016258239746, + "learning_rate": 4.9733117945392026e-05, + "loss": 0.9667, + "step": 34700 + }, + { + "epoch": 1.33609239653513, + "grad_norm": 2.7888071537017822, + "learning_rate": 4.970697873781774e-05, + "loss": 1.0169, + "step": 34705 + }, + { + "epoch": 1.3362848893166506, + "grad_norm": 1.691266655921936, + "learning_rate": 4.968084412920712e-05, + "loss": 0.6926, + "step": 34710 + }, + { + "epoch": 1.3364773820981712, + "grad_norm": 1.3733407258987427, + "learning_rate": 4.965471412194993e-05, + "loss": 0.7997, + "step": 34715 + }, + { + "epoch": 1.336669874879692, + "grad_norm": 2.068976879119873, + "learning_rate": 4.9628588718435634e-05, + "loss": 0.9425, + "step": 34720 + }, + { + "epoch": 1.3368623676612126, + "grad_norm": 1.9062495231628418, + "learning_rate": 4.960246792105322e-05, + "loss": 0.8659, + "step": 34725 + }, + { + "epoch": 1.3370548604427335, + "grad_norm": 1.3141814470291138, + "learning_rate": 4.957635173219129e-05, + "loss": 0.7289, + "step": 34730 + }, + { + "epoch": 1.3372473532242541, + "grad_norm": 0.9774174094200134, + "learning_rate": 4.955024015423789e-05, + "loss": 0.7244, + "step": 34735 + }, + { + "epoch": 1.3374398460057748, + "grad_norm": 1.086158037185669, + "learning_rate": 4.952413318958092e-05, + "loss": 0.9397, + "step": 34740 + }, + { + "epoch": 1.3376323387872955, + "grad_norm": 1.092448353767395, + "learning_rate": 4.9498030840607547e-05, + "loss": 0.7571, + "step": 34745 + }, + { + "epoch": 1.3378248315688162, + "grad_norm": 1.8962229490280151, + "learning_rate": 4.947193310970471e-05, + "loss": 0.8264, + "step": 34750 + }, + { + "epoch": 1.3380173243503368, + "grad_norm": 1.5247009992599487, + 
"learning_rate": 4.944583999925888e-05, + "loss": 0.7856, + "step": 34755 + }, + { + "epoch": 1.3382098171318575, + "grad_norm": 1.6550134420394897, + "learning_rate": 4.941975151165613e-05, + "loss": 1.024, + "step": 34760 + }, + { + "epoch": 1.3384023099133784, + "grad_norm": 1.2675317525863647, + "learning_rate": 4.939366764928196e-05, + "loss": 0.8908, + "step": 34765 + }, + { + "epoch": 1.338594802694899, + "grad_norm": 1.3318907022476196, + "learning_rate": 4.9367588414521714e-05, + "loss": 0.7813, + "step": 34770 + }, + { + "epoch": 1.3387872954764197, + "grad_norm": 1.605064034461975, + "learning_rate": 4.934151380976007e-05, + "loss": 0.9145, + "step": 34775 + }, + { + "epoch": 1.3389797882579404, + "grad_norm": 1.6478382349014282, + "learning_rate": 4.9315443837381417e-05, + "loss": 1.0189, + "step": 34780 + }, + { + "epoch": 1.339172281039461, + "grad_norm": 1.617727279663086, + "learning_rate": 4.9289378499769655e-05, + "loss": 0.7663, + "step": 34785 + }, + { + "epoch": 1.3393647738209817, + "grad_norm": 2.157989025115967, + "learning_rate": 4.9263317799308305e-05, + "loss": 0.9308, + "step": 34790 + }, + { + "epoch": 1.3395572666025024, + "grad_norm": 1.5868369340896606, + "learning_rate": 4.923726173838048e-05, + "loss": 1.0234, + "step": 34795 + }, + { + "epoch": 1.339749759384023, + "grad_norm": 1.0689306259155273, + "learning_rate": 4.921121031936876e-05, + "loss": 0.9376, + "step": 34800 + }, + { + "epoch": 1.3399422521655437, + "grad_norm": 1.227166771888733, + "learning_rate": 4.918516354465541e-05, + "loss": 0.9385, + "step": 34805 + }, + { + "epoch": 1.3401347449470644, + "grad_norm": 0.9989410638809204, + "learning_rate": 4.915912141662225e-05, + "loss": 0.865, + "step": 34810 + }, + { + "epoch": 1.340327237728585, + "grad_norm": 1.1852025985717773, + "learning_rate": 4.913308393765066e-05, + "loss": 0.9824, + "step": 34815 + }, + { + "epoch": 1.3405197305101058, + "grad_norm": 1.3979310989379883, + "learning_rate": 4.910705111012153e-05, + 
"loss": 0.9297, + "step": 34820 + }, + { + "epoch": 1.3407122232916264, + "grad_norm": 1.6673941612243652, + "learning_rate": 4.90810229364155e-05, + "loss": 0.777, + "step": 34825 + }, + { + "epoch": 1.3409047160731473, + "grad_norm": 1.139458417892456, + "learning_rate": 4.9054999418912586e-05, + "loss": 0.7229, + "step": 34830 + }, + { + "epoch": 1.341097208854668, + "grad_norm": 0.8161736726760864, + "learning_rate": 4.902898055999249e-05, + "loss": 0.7305, + "step": 34835 + }, + { + "epoch": 1.3412897016361887, + "grad_norm": 1.6782662868499756, + "learning_rate": 4.9002966362034464e-05, + "loss": 0.8107, + "step": 34840 + }, + { + "epoch": 1.3414821944177093, + "grad_norm": 1.5833643674850464, + "learning_rate": 4.897695682741739e-05, + "loss": 0.7968, + "step": 34845 + }, + { + "epoch": 1.34167468719923, + "grad_norm": 1.409138798713684, + "learning_rate": 4.895095195851953e-05, + "loss": 0.9225, + "step": 34850 + }, + { + "epoch": 1.3418671799807507, + "grad_norm": 1.4280591011047363, + "learning_rate": 4.892495175771903e-05, + "loss": 0.7304, + "step": 34855 + }, + { + "epoch": 1.3420596727622713, + "grad_norm": 1.0369322299957275, + "learning_rate": 4.889895622739331e-05, + "loss": 0.884, + "step": 34860 + }, + { + "epoch": 1.3422521655437922, + "grad_norm": 0.9493328928947449, + "learning_rate": 4.887296536991953e-05, + "loss": 0.8159, + "step": 34865 + }, + { + "epoch": 1.342444658325313, + "grad_norm": 1.258876085281372, + "learning_rate": 4.884697918767438e-05, + "loss": 0.8475, + "step": 34870 + }, + { + "epoch": 1.3426371511068336, + "grad_norm": 1.2037972211837769, + "learning_rate": 4.8820997683034166e-05, + "loss": 0.8062, + "step": 34875 + }, + { + "epoch": 1.3428296438883542, + "grad_norm": 1.7686017751693726, + "learning_rate": 4.879502085837461e-05, + "loss": 0.8544, + "step": 34880 + }, + { + "epoch": 1.343022136669875, + "grad_norm": 1.291748285293579, + "learning_rate": 4.8769048716071264e-05, + "loss": 0.8541, + "step": 34885 + }, + { + 
"epoch": 1.3432146294513956, + "grad_norm": 1.2345646619796753, + "learning_rate": 4.8743081258499005e-05, + "loss": 0.9746, + "step": 34890 + }, + { + "epoch": 1.3434071222329163, + "grad_norm": 2.3298258781433105, + "learning_rate": 4.871711848803241e-05, + "loss": 0.9702, + "step": 34895 + }, + { + "epoch": 1.343599615014437, + "grad_norm": 1.7724379301071167, + "learning_rate": 4.869116040704562e-05, + "loss": 0.8757, + "step": 34900 + }, + { + "epoch": 1.3437921077959576, + "grad_norm": 1.737610101699829, + "learning_rate": 4.866520701791235e-05, + "loss": 0.7819, + "step": 34905 + }, + { + "epoch": 1.3439846005774783, + "grad_norm": 1.1426078081130981, + "learning_rate": 4.863925832300581e-05, + "loss": 0.8039, + "step": 34910 + }, + { + "epoch": 1.344177093358999, + "grad_norm": 1.7456549406051636, + "learning_rate": 4.8613314324698855e-05, + "loss": 0.899, + "step": 34915 + }, + { + "epoch": 1.3443695861405196, + "grad_norm": 1.9409685134887695, + "learning_rate": 4.8587375025363914e-05, + "loss": 0.9121, + "step": 34920 + }, + { + "epoch": 1.3445620789220405, + "grad_norm": 0.8889964818954468, + "learning_rate": 4.856144042737293e-05, + "loss": 0.8982, + "step": 34925 + }, + { + "epoch": 1.3447545717035612, + "grad_norm": 1.207217812538147, + "learning_rate": 4.8535510533097516e-05, + "loss": 0.9708, + "step": 34930 + }, + { + "epoch": 1.3449470644850818, + "grad_norm": 1.7888151407241821, + "learning_rate": 4.8509585344908705e-05, + "loss": 0.7838, + "step": 34935 + }, + { + "epoch": 1.3451395572666025, + "grad_norm": 1.8165149688720703, + "learning_rate": 4.8483664865177226e-05, + "loss": 0.8495, + "step": 34940 + }, + { + "epoch": 1.3453320500481232, + "grad_norm": 1.4321086406707764, + "learning_rate": 4.845774909627332e-05, + "loss": 0.8093, + "step": 34945 + }, + { + "epoch": 1.3455245428296438, + "grad_norm": 1.2221522331237793, + "learning_rate": 4.843183804056687e-05, + "loss": 0.906, + "step": 34950 + }, + { + "epoch": 1.3457170356111645, + 
"grad_norm": 1.2078337669372559, + "learning_rate": 4.8405931700427145e-05, + "loss": 0.7406, + "step": 34955 + }, + { + "epoch": 1.3459095283926854, + "grad_norm": 1.0744653940200806, + "learning_rate": 4.838003007822326e-05, + "loss": 0.8133, + "step": 34960 + }, + { + "epoch": 1.346102021174206, + "grad_norm": 1.2182228565216064, + "learning_rate": 4.835413317632363e-05, + "loss": 0.8024, + "step": 34965 + }, + { + "epoch": 1.3462945139557267, + "grad_norm": 1.1697922945022583, + "learning_rate": 4.8328240997096406e-05, + "loss": 0.9592, + "step": 34970 + }, + { + "epoch": 1.3464870067372474, + "grad_norm": 1.4923276901245117, + "learning_rate": 4.830235354290925e-05, + "loss": 0.7919, + "step": 34975 + }, + { + "epoch": 1.346679499518768, + "grad_norm": 1.7563990354537964, + "learning_rate": 4.827647081612944e-05, + "loss": 0.8882, + "step": 34980 + }, + { + "epoch": 1.3468719923002888, + "grad_norm": 2.2947778701782227, + "learning_rate": 4.825059281912365e-05, + "loss": 0.9188, + "step": 34985 + }, + { + "epoch": 1.3470644850818094, + "grad_norm": 1.5248280763626099, + "learning_rate": 4.822471955425841e-05, + "loss": 0.965, + "step": 34990 + }, + { + "epoch": 1.34725697786333, + "grad_norm": 0.8243402242660522, + "learning_rate": 4.819885102389956e-05, + "loss": 0.8224, + "step": 34995 + }, + { + "epoch": 1.3474494706448508, + "grad_norm": 0.7794885039329529, + "learning_rate": 4.817298723041264e-05, + "loss": 0.7754, + "step": 35000 + }, + { + "epoch": 1.3476419634263714, + "grad_norm": 1.2712163925170898, + "learning_rate": 4.8147128176162695e-05, + "loss": 0.8612, + "step": 35005 + }, + { + "epoch": 1.347834456207892, + "grad_norm": 0.9318971633911133, + "learning_rate": 4.8121273863514435e-05, + "loss": 0.7837, + "step": 35010 + }, + { + "epoch": 1.3480269489894128, + "grad_norm": 1.127020001411438, + "learning_rate": 4.809542429483197e-05, + "loss": 0.7845, + "step": 35015 + }, + { + "epoch": 1.3482194417709337, + "grad_norm": 1.16558039188385, + 
"learning_rate": 4.806957947247912e-05, + "loss": 0.8318, + "step": 35020 + }, + { + "epoch": 1.3484119345524543, + "grad_norm": 1.0364631414413452, + "learning_rate": 4.804373939881922e-05, + "loss": 0.9216, + "step": 35025 + }, + { + "epoch": 1.348604427333975, + "grad_norm": 1.3050929307937622, + "learning_rate": 4.801790407621518e-05, + "loss": 0.9627, + "step": 35030 + }, + { + "epoch": 1.3487969201154957, + "grad_norm": 1.1998785734176636, + "learning_rate": 4.799207350702949e-05, + "loss": 0.884, + "step": 35035 + }, + { + "epoch": 1.3489894128970163, + "grad_norm": 0.8209556341171265, + "learning_rate": 4.796624769362409e-05, + "loss": 0.6925, + "step": 35040 + }, + { + "epoch": 1.349181905678537, + "grad_norm": 2.4278218746185303, + "learning_rate": 4.794042663836071e-05, + "loss": 0.7594, + "step": 35045 + }, + { + "epoch": 1.3493743984600577, + "grad_norm": 1.8275748491287231, + "learning_rate": 4.791461034360043e-05, + "loss": 0.9516, + "step": 35050 + }, + { + "epoch": 1.3495668912415784, + "grad_norm": 1.21636962890625, + "learning_rate": 4.7888798811703985e-05, + "loss": 0.9819, + "step": 35055 + }, + { + "epoch": 1.3497593840230993, + "grad_norm": 1.5131886005401611, + "learning_rate": 4.7862992045031684e-05, + "loss": 0.955, + "step": 35060 + }, + { + "epoch": 1.34995187680462, + "grad_norm": 1.186279296875, + "learning_rate": 4.7837190045943436e-05, + "loss": 0.8632, + "step": 35065 + }, + { + "epoch": 1.3501443695861406, + "grad_norm": 1.2006460428237915, + "learning_rate": 4.7811392816798525e-05, + "loss": 0.8132, + "step": 35070 + }, + { + "epoch": 1.3503368623676613, + "grad_norm": 1.6792789697647095, + "learning_rate": 4.7785600359956096e-05, + "loss": 0.8756, + "step": 35075 + }, + { + "epoch": 1.350529355149182, + "grad_norm": 1.5003536939620972, + "learning_rate": 4.77598126777746e-05, + "loss": 0.7711, + "step": 35080 + }, + { + "epoch": 1.3507218479307026, + "grad_norm": 1.4191226959228516, + "learning_rate": 4.7734029772612165e-05, + 
"loss": 0.7895, + "step": 35085 + }, + { + "epoch": 1.3509143407122233, + "grad_norm": 1.391128659248352, + "learning_rate": 4.7708251646826476e-05, + "loss": 0.8829, + "step": 35090 + }, + { + "epoch": 1.351106833493744, + "grad_norm": 1.1470931768417358, + "learning_rate": 4.7682478302774816e-05, + "loss": 0.8812, + "step": 35095 + }, + { + "epoch": 1.3512993262752646, + "grad_norm": 1.3453543186187744, + "learning_rate": 4.765670974281386e-05, + "loss": 0.9689, + "step": 35100 + }, + { + "epoch": 1.3514918190567853, + "grad_norm": 1.4172368049621582, + "learning_rate": 4.763094596930014e-05, + "loss": 0.9129, + "step": 35105 + }, + { + "epoch": 1.351684311838306, + "grad_norm": 1.895308494567871, + "learning_rate": 4.7605186984589456e-05, + "loss": 0.8221, + "step": 35110 + }, + { + "epoch": 1.3518768046198266, + "grad_norm": 1.4038105010986328, + "learning_rate": 4.7579432791037335e-05, + "loss": 0.9491, + "step": 35115 + }, + { + "epoch": 1.3520692974013475, + "grad_norm": 1.111763834953308, + "learning_rate": 4.755368339099884e-05, + "loss": 0.7322, + "step": 35120 + }, + { + "epoch": 1.3522617901828682, + "grad_norm": 1.7646887302398682, + "learning_rate": 4.752793878682861e-05, + "loss": 0.8792, + "step": 35125 + }, + { + "epoch": 1.3524542829643889, + "grad_norm": 1.1743024587631226, + "learning_rate": 4.750219898088073e-05, + "loss": 0.9847, + "step": 35130 + }, + { + "epoch": 1.3526467757459095, + "grad_norm": 1.2739955186843872, + "learning_rate": 4.7476463975509e-05, + "loss": 0.7783, + "step": 35135 + }, + { + "epoch": 1.3528392685274302, + "grad_norm": 1.317272424697876, + "learning_rate": 4.74507337730667e-05, + "loss": 0.9875, + "step": 35140 + }, + { + "epoch": 1.3530317613089509, + "grad_norm": 0.9840208888053894, + "learning_rate": 4.74250083759067e-05, + "loss": 0.8396, + "step": 35145 + }, + { + "epoch": 1.3532242540904715, + "grad_norm": 1.4527745246887207, + "learning_rate": 4.739928778638143e-05, + "loss": 0.8583, + "step": 35150 + }, + { + 
"epoch": 1.3534167468719924, + "grad_norm": 1.6825000047683716, + "learning_rate": 4.7373572006842806e-05, + "loss": 0.9443, + "step": 35155 + }, + { + "epoch": 1.353609239653513, + "grad_norm": 1.2413737773895264, + "learning_rate": 4.734786103964242e-05, + "loss": 0.9722, + "step": 35160 + }, + { + "epoch": 1.3538017324350338, + "grad_norm": 1.2684810161590576, + "learning_rate": 4.732215488713133e-05, + "loss": 0.7281, + "step": 35165 + }, + { + "epoch": 1.3539942252165544, + "grad_norm": 1.8992153406143188, + "learning_rate": 4.729645355166027e-05, + "loss": 0.8221, + "step": 35170 + }, + { + "epoch": 1.354186717998075, + "grad_norm": 1.6467632055282593, + "learning_rate": 4.7270757035579325e-05, + "loss": 0.8439, + "step": 35175 + }, + { + "epoch": 1.3543792107795958, + "grad_norm": 1.0282069444656372, + "learning_rate": 4.724506534123843e-05, + "loss": 0.8158, + "step": 35180 + }, + { + "epoch": 1.3545717035611164, + "grad_norm": 1.7379052639007568, + "learning_rate": 4.72193784709868e-05, + "loss": 0.8245, + "step": 35185 + }, + { + "epoch": 1.3547641963426371, + "grad_norm": 2.1506638526916504, + "learning_rate": 4.719369642717336e-05, + "loss": 0.8818, + "step": 35190 + }, + { + "epoch": 1.3549566891241578, + "grad_norm": 1.284745454788208, + "learning_rate": 4.7168019212146576e-05, + "loss": 0.8457, + "step": 35195 + }, + { + "epoch": 1.3551491819056785, + "grad_norm": 0.897658109664917, + "learning_rate": 4.71423468282545e-05, + "loss": 0.8173, + "step": 35200 + }, + { + "epoch": 1.3553416746871991, + "grad_norm": 2.404270887374878, + "learning_rate": 4.711667927784458e-05, + "loss": 0.8396, + "step": 35205 + }, + { + "epoch": 1.3555341674687198, + "grad_norm": 1.8264572620391846, + "learning_rate": 4.7091016563264087e-05, + "loss": 0.9482, + "step": 35210 + }, + { + "epoch": 1.3557266602502407, + "grad_norm": 2.137230396270752, + "learning_rate": 4.70653586868596e-05, + "loss": 0.8982, + "step": 35215 + }, + { + "epoch": 1.3559191530317614, + 
"grad_norm": 1.1221061944961548, + "learning_rate": 4.703970565097742e-05, + "loss": 0.8983, + "step": 35220 + }, + { + "epoch": 1.356111645813282, + "grad_norm": 1.6483477354049683, + "learning_rate": 4.7014057457963315e-05, + "loss": 0.801, + "step": 35225 + }, + { + "epoch": 1.3563041385948027, + "grad_norm": 1.8643723726272583, + "learning_rate": 4.698841411016269e-05, + "loss": 1.0251, + "step": 35230 + }, + { + "epoch": 1.3564966313763234, + "grad_norm": 1.128015398979187, + "learning_rate": 4.6962775609920394e-05, + "loss": 0.844, + "step": 35235 + }, + { + "epoch": 1.356689124157844, + "grad_norm": 0.9045073390007019, + "learning_rate": 4.693714195958092e-05, + "loss": 0.869, + "step": 35240 + }, + { + "epoch": 1.3568816169393647, + "grad_norm": 1.8441210985183716, + "learning_rate": 4.691151316148832e-05, + "loss": 0.9395, + "step": 35245 + }, + { + "epoch": 1.3570741097208856, + "grad_norm": 1.437767505645752, + "learning_rate": 4.688588921798616e-05, + "loss": 0.6946, + "step": 35250 + }, + { + "epoch": 1.3572666025024063, + "grad_norm": 1.447969675064087, + "learning_rate": 4.68602701314176e-05, + "loss": 0.8183, + "step": 35255 + }, + { + "epoch": 1.357459095283927, + "grad_norm": 2.16900634765625, + "learning_rate": 4.68346559041253e-05, + "loss": 0.7718, + "step": 35260 + }, + { + "epoch": 1.3576515880654476, + "grad_norm": 1.2576295137405396, + "learning_rate": 4.680904653845152e-05, + "loss": 0.8408, + "step": 35265 + }, + { + "epoch": 1.3578440808469683, + "grad_norm": 1.3232860565185547, + "learning_rate": 4.678344203673808e-05, + "loss": 0.8454, + "step": 35270 + }, + { + "epoch": 1.358036573628489, + "grad_norm": 1.5161672830581665, + "learning_rate": 4.675784240132638e-05, + "loss": 0.8271, + "step": 35275 + }, + { + "epoch": 1.3582290664100096, + "grad_norm": 1.339381217956543, + "learning_rate": 4.6732247634557214e-05, + "loss": 0.8369, + "step": 35280 + }, + { + "epoch": 1.3584215591915303, + "grad_norm": 1.2155077457427979, + 
"learning_rate": 4.670665773877121e-05, + "loss": 0.8823, + "step": 35285 + }, + { + "epoch": 1.358614051973051, + "grad_norm": 1.6625285148620605, + "learning_rate": 4.6681072716308285e-05, + "loss": 0.9456, + "step": 35290 + }, + { + "epoch": 1.3588065447545716, + "grad_norm": 0.9515489339828491, + "learning_rate": 4.6655492569508056e-05, + "loss": 0.889, + "step": 35295 + }, + { + "epoch": 1.3589990375360923, + "grad_norm": 1.9020076990127563, + "learning_rate": 4.662991730070966e-05, + "loss": 0.8547, + "step": 35300 + }, + { + "epoch": 1.359191530317613, + "grad_norm": 1.41860032081604, + "learning_rate": 4.660434691225177e-05, + "loss": 0.8674, + "step": 35305 + }, + { + "epoch": 1.3593840230991339, + "grad_norm": 1.2650240659713745, + "learning_rate": 4.657878140647265e-05, + "loss": 0.834, + "step": 35310 + }, + { + "epoch": 1.3595765158806545, + "grad_norm": 1.1325031518936157, + "learning_rate": 4.655322078571013e-05, + "loss": 0.8641, + "step": 35315 + }, + { + "epoch": 1.3597690086621752, + "grad_norm": 1.4202800989151, + "learning_rate": 4.652766505230143e-05, + "loss": 0.8613, + "step": 35320 + }, + { + "epoch": 1.3599615014436959, + "grad_norm": 1.1674113273620605, + "learning_rate": 4.650211420858361e-05, + "loss": 0.9188, + "step": 35325 + }, + { + "epoch": 1.3601539942252165, + "grad_norm": 1.0588910579681396, + "learning_rate": 4.6476568256893025e-05, + "loss": 0.9119, + "step": 35330 + }, + { + "epoch": 1.3603464870067372, + "grad_norm": 2.010552167892456, + "learning_rate": 4.645102719956572e-05, + "loss": 0.7566, + "step": 35335 + }, + { + "epoch": 1.3605389797882579, + "grad_norm": 1.1530786752700806, + "learning_rate": 4.6425491038937244e-05, + "loss": 0.8898, + "step": 35340 + }, + { + "epoch": 1.3607314725697786, + "grad_norm": 1.016413927078247, + "learning_rate": 4.6399959777342746e-05, + "loss": 0.9265, + "step": 35345 + }, + { + "epoch": 1.3609239653512994, + "grad_norm": 0.9319851398468018, + "learning_rate": 4.6374433417116826e-05, + 
"loss": 0.8908, + "step": 35350 + }, + { + "epoch": 1.3611164581328201, + "grad_norm": 1.6301521062850952, + "learning_rate": 4.6348911960593736e-05, + "loss": 0.8063, + "step": 35355 + }, + { + "epoch": 1.3613089509143408, + "grad_norm": 1.0370879173278809, + "learning_rate": 4.632339541010726e-05, + "loss": 0.8456, + "step": 35360 + }, + { + "epoch": 1.3615014436958615, + "grad_norm": 1.1396170854568481, + "learning_rate": 4.629788376799065e-05, + "loss": 0.7626, + "step": 35365 + }, + { + "epoch": 1.3616939364773821, + "grad_norm": 1.1902436017990112, + "learning_rate": 4.6272377036576886e-05, + "loss": 0.8911, + "step": 35370 + }, + { + "epoch": 1.3618864292589028, + "grad_norm": 1.496978521347046, + "learning_rate": 4.6246875218198294e-05, + "loss": 0.7299, + "step": 35375 + }, + { + "epoch": 1.3620789220404235, + "grad_norm": 1.5510118007659912, + "learning_rate": 4.622137831518688e-05, + "loss": 0.7804, + "step": 35380 + }, + { + "epoch": 1.3622714148219441, + "grad_norm": 1.1670730113983154, + "learning_rate": 4.619588632987416e-05, + "loss": 0.8348, + "step": 35385 + }, + { + "epoch": 1.3624639076034648, + "grad_norm": 1.505469799041748, + "learning_rate": 4.617039926459127e-05, + "loss": 0.7221, + "step": 35390 + }, + { + "epoch": 1.3626564003849855, + "grad_norm": 1.3715969324111938, + "learning_rate": 4.61449171216687e-05, + "loss": 0.9265, + "step": 35395 + }, + { + "epoch": 1.3628488931665061, + "grad_norm": 0.8129710555076599, + "learning_rate": 4.611943990343677e-05, + "loss": 0.8726, + "step": 35400 + }, + { + "epoch": 1.3630413859480268, + "grad_norm": 0.745360791683197, + "learning_rate": 4.60939676122251e-05, + "loss": 0.8237, + "step": 35405 + }, + { + "epoch": 1.3632338787295477, + "grad_norm": 1.287010908126831, + "learning_rate": 4.606850025036299e-05, + "loss": 0.8159, + "step": 35410 + }, + { + "epoch": 1.3634263715110684, + "grad_norm": 1.7442882061004639, + "learning_rate": 4.604303782017928e-05, + "loss": 0.6109, + "step": 35415 + }, + 
{ + "epoch": 1.363618864292589, + "grad_norm": 2.1345295906066895, + "learning_rate": 4.6017580324002364e-05, + "loss": 0.9068, + "step": 35420 + }, + { + "epoch": 1.3638113570741097, + "grad_norm": 1.987751841545105, + "learning_rate": 4.5992127764160054e-05, + "loss": 0.932, + "step": 35425 + }, + { + "epoch": 1.3640038498556304, + "grad_norm": 1.3692355155944824, + "learning_rate": 4.5966680142979954e-05, + "loss": 0.7202, + "step": 35430 + }, + { + "epoch": 1.364196342637151, + "grad_norm": 1.3024131059646606, + "learning_rate": 4.594123746278899e-05, + "loss": 0.9063, + "step": 35435 + }, + { + "epoch": 1.3643888354186717, + "grad_norm": 1.8773390054702759, + "learning_rate": 4.591579972591376e-05, + "loss": 0.9294, + "step": 35440 + }, + { + "epoch": 1.3645813282001926, + "grad_norm": 1.8753331899642944, + "learning_rate": 4.589036693468035e-05, + "loss": 0.9329, + "step": 35445 + }, + { + "epoch": 1.3647738209817133, + "grad_norm": 0.8658643364906311, + "learning_rate": 4.5864939091414495e-05, + "loss": 0.7045, + "step": 35450 + }, + { + "epoch": 1.364966313763234, + "grad_norm": 1.2768117189407349, + "learning_rate": 4.5839516198441304e-05, + "loss": 0.9337, + "step": 35455 + }, + { + "epoch": 1.3651588065447546, + "grad_norm": 1.145644187927246, + "learning_rate": 4.581409825808557e-05, + "loss": 0.8107, + "step": 35460 + }, + { + "epoch": 1.3653512993262753, + "grad_norm": 1.1863698959350586, + "learning_rate": 4.5788685272671605e-05, + "loss": 0.6387, + "step": 35465 + }, + { + "epoch": 1.365543792107796, + "grad_norm": 1.2812061309814453, + "learning_rate": 4.576327724452326e-05, + "loss": 0.756, + "step": 35470 + }, + { + "epoch": 1.3657362848893166, + "grad_norm": 1.6907789707183838, + "learning_rate": 4.5737874175963956e-05, + "loss": 0.9023, + "step": 35475 + }, + { + "epoch": 1.3659287776708373, + "grad_norm": 1.9304261207580566, + "learning_rate": 4.5712476069316576e-05, + "loss": 0.8313, + "step": 35480 + }, + { + "epoch": 1.366121270452358, + 
"grad_norm": 1.2450244426727295, + "learning_rate": 4.568708292690364e-05, + "loss": 0.8494, + "step": 35485 + }, + { + "epoch": 1.3663137632338787, + "grad_norm": 1.577697515487671, + "learning_rate": 4.566169475104717e-05, + "loss": 0.8129, + "step": 35490 + }, + { + "epoch": 1.3665062560153993, + "grad_norm": 2.367910146713257, + "learning_rate": 4.56363115440688e-05, + "loss": 0.8501, + "step": 35495 + }, + { + "epoch": 1.36669874879692, + "grad_norm": 1.588900089263916, + "learning_rate": 4.561093330828954e-05, + "loss": 0.9732, + "step": 35500 + }, + { + "epoch": 1.3668912415784409, + "grad_norm": 1.3308700323104858, + "learning_rate": 4.558556004603019e-05, + "loss": 0.8681, + "step": 35505 + }, + { + "epoch": 1.3670837343599616, + "grad_norm": 1.5678372383117676, + "learning_rate": 4.556019175961091e-05, + "loss": 0.8269, + "step": 35510 + }, + { + "epoch": 1.3672762271414822, + "grad_norm": 1.0886204242706299, + "learning_rate": 4.553482845135143e-05, + "loss": 0.9315, + "step": 35515 + }, + { + "epoch": 1.367468719923003, + "grad_norm": 1.2117904424667358, + "learning_rate": 4.5509470123571095e-05, + "loss": 0.8139, + "step": 35520 + }, + { + "epoch": 1.3676612127045236, + "grad_norm": 1.5415525436401367, + "learning_rate": 4.5484116778588807e-05, + "loss": 0.8667, + "step": 35525 + }, + { + "epoch": 1.3678537054860442, + "grad_norm": 1.6939245462417603, + "learning_rate": 4.545876841872281e-05, + "loss": 0.8011, + "step": 35530 + }, + { + "epoch": 1.368046198267565, + "grad_norm": 1.9691619873046875, + "learning_rate": 4.5433425046291224e-05, + "loss": 0.9255, + "step": 35535 + }, + { + "epoch": 1.3682386910490858, + "grad_norm": 2.5566859245300293, + "learning_rate": 4.54080866636114e-05, + "loss": 0.998, + "step": 35540 + }, + { + "epoch": 1.3684311838306065, + "grad_norm": 1.200829029083252, + "learning_rate": 4.538275327300042e-05, + "loss": 0.7522, + "step": 35545 + }, + { + "epoch": 1.3686236766121271, + "grad_norm": 1.0914881229400635, + 
"learning_rate": 4.535742487677485e-05, + "loss": 0.8378, + "step": 35550 + }, + { + "epoch": 1.3688161693936478, + "grad_norm": 1.1408578157424927, + "learning_rate": 4.5332101477250796e-05, + "loss": 0.8735, + "step": 35555 + }, + { + "epoch": 1.3690086621751685, + "grad_norm": 0.8551640510559082, + "learning_rate": 4.5306783076743955e-05, + "loss": 0.8452, + "step": 35560 + }, + { + "epoch": 1.3692011549566891, + "grad_norm": 2.295720100402832, + "learning_rate": 4.5281469677569456e-05, + "loss": 1.0684, + "step": 35565 + }, + { + "epoch": 1.3693936477382098, + "grad_norm": 1.2390626668930054, + "learning_rate": 4.5256161282042085e-05, + "loss": 0.7534, + "step": 35570 + }, + { + "epoch": 1.3695861405197305, + "grad_norm": 1.0765421390533447, + "learning_rate": 4.5230857892476106e-05, + "loss": 0.8554, + "step": 35575 + }, + { + "epoch": 1.3697786333012512, + "grad_norm": 1.3477513790130615, + "learning_rate": 4.5205559511185415e-05, + "loss": 0.8043, + "step": 35580 + }, + { + "epoch": 1.3699711260827718, + "grad_norm": 1.3972609043121338, + "learning_rate": 4.518026614048324e-05, + "loss": 0.7081, + "step": 35585 + }, + { + "epoch": 1.3701636188642925, + "grad_norm": 0.9186348915100098, + "learning_rate": 4.515497778268266e-05, + "loss": 0.6965, + "step": 35590 + }, + { + "epoch": 1.3703561116458132, + "grad_norm": 1.2708579301834106, + "learning_rate": 4.5129694440096005e-05, + "loss": 0.801, + "step": 35595 + }, + { + "epoch": 1.3705486044273338, + "grad_norm": 1.0227978229522705, + "learning_rate": 4.5104416115035306e-05, + "loss": 0.7764, + "step": 35600 + }, + { + "epoch": 1.3707410972088547, + "grad_norm": 1.2434272766113281, + "learning_rate": 4.507914280981211e-05, + "loss": 0.7463, + "step": 35605 + }, + { + "epoch": 1.3709335899903754, + "grad_norm": 1.1386263370513916, + "learning_rate": 4.505387452673753e-05, + "loss": 0.8806, + "step": 35610 + }, + { + "epoch": 1.371126082771896, + "grad_norm": 0.9424572587013245, + "learning_rate": 
4.502861126812205e-05, + "loss": 0.9189, + "step": 35615 + }, + { + "epoch": 1.3713185755534167, + "grad_norm": 0.7779520153999329, + "learning_rate": 4.500335303627601e-05, + "loss": 0.8486, + "step": 35620 + }, + { + "epoch": 1.3715110683349374, + "grad_norm": 1.1644350290298462, + "learning_rate": 4.497809983350897e-05, + "loss": 0.7089, + "step": 35625 + }, + { + "epoch": 1.371703561116458, + "grad_norm": 1.2244153022766113, + "learning_rate": 4.4952851662130216e-05, + "loss": 0.9906, + "step": 35630 + }, + { + "epoch": 1.3718960538979788, + "grad_norm": 2.0696568489074707, + "learning_rate": 4.4927608524448515e-05, + "loss": 0.7632, + "step": 35635 + }, + { + "epoch": 1.3720885466794996, + "grad_norm": 1.5171512365341187, + "learning_rate": 4.4902370422772233e-05, + "loss": 0.9423, + "step": 35640 + }, + { + "epoch": 1.3722810394610203, + "grad_norm": 1.1163785457611084, + "learning_rate": 4.4877137359409116e-05, + "loss": 0.7411, + "step": 35645 + }, + { + "epoch": 1.372473532242541, + "grad_norm": 1.2916820049285889, + "learning_rate": 4.485190933666671e-05, + "loss": 0.8892, + "step": 35650 + }, + { + "epoch": 1.3726660250240617, + "grad_norm": 1.1933842897415161, + "learning_rate": 4.4826686356851834e-05, + "loss": 0.8643, + "step": 35655 + }, + { + "epoch": 1.3728585178055823, + "grad_norm": 1.0002435445785522, + "learning_rate": 4.4801468422271e-05, + "loss": 0.7632, + "step": 35660 + }, + { + "epoch": 1.373051010587103, + "grad_norm": 1.1150939464569092, + "learning_rate": 4.4776255535230216e-05, + "loss": 0.6594, + "step": 35665 + }, + { + "epoch": 1.3732435033686237, + "grad_norm": 1.2958896160125732, + "learning_rate": 4.4751047698035075e-05, + "loss": 0.8391, + "step": 35670 + }, + { + "epoch": 1.3734359961501443, + "grad_norm": 1.687971591949463, + "learning_rate": 4.47258449129906e-05, + "loss": 0.6359, + "step": 35675 + }, + { + "epoch": 1.373628488931665, + "grad_norm": 1.6192028522491455, + "learning_rate": 4.4700647182401456e-05, + "loss": 
0.8449, + "step": 35680 + }, + { + "epoch": 1.3738209817131857, + "grad_norm": 1.6572599411010742, + "learning_rate": 4.467545450857179e-05, + "loss": 0.8733, + "step": 35685 + }, + { + "epoch": 1.3740134744947063, + "grad_norm": 1.175234317779541, + "learning_rate": 4.465026689380532e-05, + "loss": 0.8851, + "step": 35690 + }, + { + "epoch": 1.374205967276227, + "grad_norm": 0.9541772603988647, + "learning_rate": 4.4625084340405333e-05, + "loss": 0.852, + "step": 35695 + }, + { + "epoch": 1.374398460057748, + "grad_norm": 1.063652515411377, + "learning_rate": 4.4599906850674514e-05, + "loss": 0.8568, + "step": 35700 + }, + { + "epoch": 1.3745909528392686, + "grad_norm": 1.2114187479019165, + "learning_rate": 4.457473442691522e-05, + "loss": 0.731, + "step": 35705 + }, + { + "epoch": 1.3747834456207892, + "grad_norm": 1.2288764715194702, + "learning_rate": 4.454956707142931e-05, + "loss": 0.7752, + "step": 35710 + }, + { + "epoch": 1.37497593840231, + "grad_norm": 1.9316095113754272, + "learning_rate": 4.452440478651819e-05, + "loss": 0.7302, + "step": 35715 + }, + { + "epoch": 1.3751684311838306, + "grad_norm": 1.061178207397461, + "learning_rate": 4.449924757448269e-05, + "loss": 0.8386, + "step": 35720 + }, + { + "epoch": 1.3753609239653513, + "grad_norm": 1.0967941284179688, + "learning_rate": 4.447409543762342e-05, + "loss": 0.7719, + "step": 35725 + }, + { + "epoch": 1.375553416746872, + "grad_norm": 2.174457311630249, + "learning_rate": 4.4448948378240264e-05, + "loss": 0.9424, + "step": 35730 + }, + { + "epoch": 1.3757459095283928, + "grad_norm": 1.3424992561340332, + "learning_rate": 4.442380639863277e-05, + "loss": 0.7166, + "step": 35735 + }, + { + "epoch": 1.3759384023099135, + "grad_norm": 1.0383292436599731, + "learning_rate": 4.4398669501100044e-05, + "loss": 0.8558, + "step": 35740 + }, + { + "epoch": 1.3761308950914342, + "grad_norm": 0.9943356513977051, + "learning_rate": 4.437353768794069e-05, + "loss": 0.6549, + "step": 35745 + }, + { + "epoch": 
1.3763233878729548, + "grad_norm": 2.162888526916504, + "learning_rate": 4.4348410961452744e-05, + "loss": 0.8268, + "step": 35750 + }, + { + "epoch": 1.3765158806544755, + "grad_norm": 0.9995039701461792, + "learning_rate": 4.432328932393405e-05, + "loss": 0.8245, + "step": 35755 + }, + { + "epoch": 1.3767083734359962, + "grad_norm": 1.1602789163589478, + "learning_rate": 4.429817277768167e-05, + "loss": 0.8199, + "step": 35760 + }, + { + "epoch": 1.3769008662175168, + "grad_norm": 1.8660534620285034, + "learning_rate": 4.42730613249924e-05, + "loss": 0.9195, + "step": 35765 + }, + { + "epoch": 1.3770933589990375, + "grad_norm": 1.5461866855621338, + "learning_rate": 4.424795496816252e-05, + "loss": 0.801, + "step": 35770 + }, + { + "epoch": 1.3772858517805582, + "grad_norm": 1.3390212059020996, + "learning_rate": 4.4222853709487866e-05, + "loss": 0.8652, + "step": 35775 + }, + { + "epoch": 1.3774783445620788, + "grad_norm": 1.2544982433319092, + "learning_rate": 4.419775755126372e-05, + "loss": 0.7289, + "step": 35780 + }, + { + "epoch": 1.3776708373435995, + "grad_norm": 1.8691519498825073, + "learning_rate": 4.4172666495784984e-05, + "loss": 0.8905, + "step": 35785 + }, + { + "epoch": 1.3778633301251202, + "grad_norm": 2.0388388633728027, + "learning_rate": 4.414758054534608e-05, + "loss": 1.0152, + "step": 35790 + }, + { + "epoch": 1.378055822906641, + "grad_norm": 1.040433645248413, + "learning_rate": 4.4122499702240946e-05, + "loss": 0.8018, + "step": 35795 + }, + { + "epoch": 1.3782483156881618, + "grad_norm": 1.2068662643432617, + "learning_rate": 4.409742396876309e-05, + "loss": 0.6547, + "step": 35800 + }, + { + "epoch": 1.3784408084696824, + "grad_norm": 1.445438265800476, + "learning_rate": 4.4072353347205466e-05, + "loss": 0.8732, + "step": 35805 + }, + { + "epoch": 1.378633301251203, + "grad_norm": 0.9814010858535767, + "learning_rate": 4.404728783986063e-05, + "loss": 0.8397, + "step": 35810 + }, + { + "epoch": 1.3788257940327238, + "grad_norm": 
2.0966150760650635, + "learning_rate": 4.4022227449020684e-05, + "loss": 0.936, + "step": 35815 + }, + { + "epoch": 1.3790182868142444, + "grad_norm": 1.3904740810394287, + "learning_rate": 4.39971721769772e-05, + "loss": 0.9165, + "step": 35820 + }, + { + "epoch": 1.379210779595765, + "grad_norm": 1.3555712699890137, + "learning_rate": 4.3972122026021346e-05, + "loss": 1.0844, + "step": 35825 + }, + { + "epoch": 1.3794032723772858, + "grad_norm": 1.2570695877075195, + "learning_rate": 4.394707699844381e-05, + "loss": 0.9331, + "step": 35830 + }, + { + "epoch": 1.3795957651588067, + "grad_norm": 0.8406626582145691, + "learning_rate": 4.392203709653471e-05, + "loss": 0.8221, + "step": 35835 + }, + { + "epoch": 1.3797882579403273, + "grad_norm": 1.3139702081680298, + "learning_rate": 4.3897002322583894e-05, + "loss": 0.9063, + "step": 35840 + }, + { + "epoch": 1.379980750721848, + "grad_norm": 0.7752715945243835, + "learning_rate": 4.3871972678880535e-05, + "loss": 0.7454, + "step": 35845 + }, + { + "epoch": 1.3801732435033687, + "grad_norm": 1.247069239616394, + "learning_rate": 4.384694816771345e-05, + "loss": 0.7338, + "step": 35850 + }, + { + "epoch": 1.3803657362848893, + "grad_norm": 1.1150643825531006, + "learning_rate": 4.3821928791370995e-05, + "loss": 0.9423, + "step": 35855 + }, + { + "epoch": 1.38055822906641, + "grad_norm": 1.191602349281311, + "learning_rate": 4.3796914552141035e-05, + "loss": 0.7278, + "step": 35860 + }, + { + "epoch": 1.3807507218479307, + "grad_norm": 1.868516206741333, + "learning_rate": 4.3771905452310844e-05, + "loss": 0.8649, + "step": 35865 + }, + { + "epoch": 1.3809432146294514, + "grad_norm": 1.443765640258789, + "learning_rate": 4.37469014941675e-05, + "loss": 0.8188, + "step": 35870 + }, + { + "epoch": 1.381135707410972, + "grad_norm": 1.2555128335952759, + "learning_rate": 4.372190267999734e-05, + "loss": 0.8372, + "step": 35875 + }, + { + "epoch": 1.3813282001924927, + "grad_norm": 1.0106040239334106, + "learning_rate": 
4.369690901208637e-05, + "loss": 0.769, + "step": 35880 + }, + { + "epoch": 1.3815206929740134, + "grad_norm": 0.8901906609535217, + "learning_rate": 4.3671920492720095e-05, + "loss": 0.8127, + "step": 35885 + }, + { + "epoch": 1.381713185755534, + "grad_norm": 1.0237152576446533, + "learning_rate": 4.3646937124183594e-05, + "loss": 0.8046, + "step": 35890 + }, + { + "epoch": 1.381905678537055, + "grad_norm": 2.2792160511016846, + "learning_rate": 4.362195890876135e-05, + "loss": 0.8081, + "step": 35895 + }, + { + "epoch": 1.3820981713185756, + "grad_norm": 1.6603785753250122, + "learning_rate": 4.359698584873749e-05, + "loss": 0.8406, + "step": 35900 + }, + { + "epoch": 1.3822906641000963, + "grad_norm": 0.9667415618896484, + "learning_rate": 4.357201794639568e-05, + "loss": 0.7888, + "step": 35905 + }, + { + "epoch": 1.382483156881617, + "grad_norm": 1.3200799226760864, + "learning_rate": 4.354705520401895e-05, + "loss": 0.7056, + "step": 35910 + }, + { + "epoch": 1.3826756496631376, + "grad_norm": 1.4623115062713623, + "learning_rate": 4.352209762389013e-05, + "loss": 0.8487, + "step": 35915 + }, + { + "epoch": 1.3828681424446583, + "grad_norm": 1.7032670974731445, + "learning_rate": 4.3497145208291314e-05, + "loss": 0.9328, + "step": 35920 + }, + { + "epoch": 1.383060635226179, + "grad_norm": 0.9201510548591614, + "learning_rate": 4.347219795950427e-05, + "loss": 0.8634, + "step": 35925 + }, + { + "epoch": 1.3832531280076998, + "grad_norm": 1.2735239267349243, + "learning_rate": 4.3447255879810266e-05, + "loss": 0.7324, + "step": 35930 + }, + { + "epoch": 1.3834456207892205, + "grad_norm": 1.2430258989334106, + "learning_rate": 4.3422318971490116e-05, + "loss": 0.8445, + "step": 35935 + }, + { + "epoch": 1.3836381135707412, + "grad_norm": 0.880877673625946, + "learning_rate": 4.3397387236824025e-05, + "loss": 0.7624, + "step": 35940 + }, + { + "epoch": 1.3838306063522618, + "grad_norm": 0.9846094250679016, + "learning_rate": 4.3372460678091984e-05, + "loss": 
0.7539, + "step": 35945 + }, + { + "epoch": 1.3840230991337825, + "grad_norm": 2.2208034992218018, + "learning_rate": 4.334753929757327e-05, + "loss": 0.7707, + "step": 35950 + }, + { + "epoch": 1.3842155919153032, + "grad_norm": 1.152341365814209, + "learning_rate": 4.332262309754679e-05, + "loss": 0.86, + "step": 35955 + }, + { + "epoch": 1.3844080846968239, + "grad_norm": 1.2790590524673462, + "learning_rate": 4.3297712080290975e-05, + "loss": 0.7621, + "step": 35960 + }, + { + "epoch": 1.3846005774783445, + "grad_norm": 1.3545982837677002, + "learning_rate": 4.327280624808381e-05, + "loss": 0.7659, + "step": 35965 + }, + { + "epoch": 1.3847930702598652, + "grad_norm": 1.7217894792556763, + "learning_rate": 4.324790560320265e-05, + "loss": 0.8812, + "step": 35970 + }, + { + "epoch": 1.3849855630413859, + "grad_norm": 1.3170634508132935, + "learning_rate": 4.322798882370276e-05, + "loss": 0.9111, + "step": 35975 + }, + { + "epoch": 1.3851780558229065, + "grad_norm": 1.05010986328125, + "learning_rate": 4.320309752174627e-05, + "loss": 0.8722, + "step": 35980 + }, + { + "epoch": 1.3853705486044272, + "grad_norm": 1.8216160535812378, + "learning_rate": 4.317821141349036e-05, + "loss": 0.9751, + "step": 35985 + }, + { + "epoch": 1.385563041385948, + "grad_norm": 1.5039763450622559, + "learning_rate": 4.315333050121055e-05, + "loss": 0.9311, + "step": 35990 + }, + { + "epoch": 1.3857555341674688, + "grad_norm": 1.1297041177749634, + "learning_rate": 4.312845478718211e-05, + "loss": 0.8074, + "step": 35995 + }, + { + "epoch": 1.3859480269489894, + "grad_norm": 1.2953757047653198, + "learning_rate": 4.310358427367972e-05, + "loss": 0.7828, + "step": 36000 + }, + { + "epoch": 1.3861405197305101, + "grad_norm": 1.7586846351623535, + "learning_rate": 4.3078718962977684e-05, + "loss": 0.9055, + "step": 36005 + }, + { + "epoch": 1.3863330125120308, + "grad_norm": 1.59012770652771, + "learning_rate": 4.305385885734966e-05, + "loss": 0.7722, + "step": 36010 + }, + { + 
"epoch": 1.3865255052935515, + "grad_norm": 0.7984145879745483, + "learning_rate": 4.302900395906909e-05, + "loss": 0.803, + "step": 36015 + }, + { + "epoch": 1.3867179980750721, + "grad_norm": 2.04715895652771, + "learning_rate": 4.3004154270408666e-05, + "loss": 0.9151, + "step": 36020 + }, + { + "epoch": 1.386910490856593, + "grad_norm": 1.13097083568573, + "learning_rate": 4.2979309793640776e-05, + "loss": 0.7337, + "step": 36025 + }, + { + "epoch": 1.3871029836381137, + "grad_norm": 1.4123458862304688, + "learning_rate": 4.295447053103727e-05, + "loss": 0.7956, + "step": 36030 + }, + { + "epoch": 1.3872954764196344, + "grad_norm": 1.3871421813964844, + "learning_rate": 4.2929636484869586e-05, + "loss": 0.8623, + "step": 36035 + }, + { + "epoch": 1.387487969201155, + "grad_norm": 0.9964479804039001, + "learning_rate": 4.290480765740851e-05, + "loss": 0.7375, + "step": 36040 + }, + { + "epoch": 1.3876804619826757, + "grad_norm": 2.126817464828491, + "learning_rate": 4.287998405092463e-05, + "loss": 0.9251, + "step": 36045 + }, + { + "epoch": 1.3878729547641964, + "grad_norm": 1.7415242195129395, + "learning_rate": 4.285516566768779e-05, + "loss": 0.88, + "step": 36050 + }, + { + "epoch": 1.388065447545717, + "grad_norm": 1.341607689857483, + "learning_rate": 4.2830352509967486e-05, + "loss": 0.8688, + "step": 36055 + }, + { + "epoch": 1.3882579403272377, + "grad_norm": 2.5146100521087646, + "learning_rate": 4.280554458003274e-05, + "loss": 0.7789, + "step": 36060 + }, + { + "epoch": 1.3884504331087584, + "grad_norm": 1.7495235204696655, + "learning_rate": 4.2780741880152106e-05, + "loss": 0.749, + "step": 36065 + }, + { + "epoch": 1.388642925890279, + "grad_norm": 1.4821887016296387, + "learning_rate": 4.275594441259354e-05, + "loss": 0.8738, + "step": 36070 + }, + { + "epoch": 1.3888354186717997, + "grad_norm": 1.3802404403686523, + "learning_rate": 4.273115217962466e-05, + "loss": 0.8245, + "step": 36075 + }, + { + "epoch": 1.3890279114533204, + "grad_norm": 
1.0245096683502197, + "learning_rate": 4.270636518351252e-05, + "loss": 0.8295, + "step": 36080 + }, + { + "epoch": 1.389220404234841, + "grad_norm": 1.633852243423462, + "learning_rate": 4.268158342652376e-05, + "loss": 0.8341, + "step": 36085 + }, + { + "epoch": 1.389412897016362, + "grad_norm": 1.1715857982635498, + "learning_rate": 4.265680691092454e-05, + "loss": 0.8579, + "step": 36090 + }, + { + "epoch": 1.3896053897978826, + "grad_norm": 1.3279740810394287, + "learning_rate": 4.263203563898038e-05, + "loss": 0.8271, + "step": 36095 + }, + { + "epoch": 1.3897978825794033, + "grad_norm": 0.87353515625, + "learning_rate": 4.2607269612956615e-05, + "loss": 0.7767, + "step": 36100 + }, + { + "epoch": 1.389990375360924, + "grad_norm": 0.9114276170730591, + "learning_rate": 4.258250883511782e-05, + "loss": 0.6756, + "step": 36105 + }, + { + "epoch": 1.3901828681424446, + "grad_norm": 1.5339330434799194, + "learning_rate": 4.255775330772822e-05, + "loss": 0.6985, + "step": 36110 + }, + { + "epoch": 1.3903753609239653, + "grad_norm": 1.8716709613800049, + "learning_rate": 4.253300303305157e-05, + "loss": 0.8967, + "step": 36115 + }, + { + "epoch": 1.390567853705486, + "grad_norm": 1.4840834140777588, + "learning_rate": 4.250825801335114e-05, + "loss": 0.7657, + "step": 36120 + }, + { + "epoch": 1.3907603464870069, + "grad_norm": 1.0822495222091675, + "learning_rate": 4.24835182508896e-05, + "loss": 0.7959, + "step": 36125 + }, + { + "epoch": 1.3909528392685275, + "grad_norm": 1.4410738945007324, + "learning_rate": 4.2458783747929375e-05, + "loss": 0.8108, + "step": 36130 + }, + { + "epoch": 1.3911453320500482, + "grad_norm": 1.0785428285598755, + "learning_rate": 4.2434054506732157e-05, + "loss": 0.8385, + "step": 36135 + }, + { + "epoch": 1.3913378248315689, + "grad_norm": 1.1617215871810913, + "learning_rate": 4.240933052955932e-05, + "loss": 0.8655, + "step": 36140 + }, + { + "epoch": 1.3915303176130895, + "grad_norm": 1.7867512702941895, + "learning_rate": 
4.238461181867171e-05, + "loss": 0.9933, + "step": 36145 + }, + { + "epoch": 1.3917228103946102, + "grad_norm": 1.6082379817962646, + "learning_rate": 4.235989837632971e-05, + "loss": 0.7886, + "step": 36150 + }, + { + "epoch": 1.3919153031761309, + "grad_norm": 1.7708418369293213, + "learning_rate": 4.23351902047931e-05, + "loss": 0.9419, + "step": 36155 + }, + { + "epoch": 1.3921077959576516, + "grad_norm": 2.8401410579681396, + "learning_rate": 4.231048730632142e-05, + "loss": 0.9881, + "step": 36160 + }, + { + "epoch": 1.3923002887391722, + "grad_norm": 2.1983916759490967, + "learning_rate": 4.228578968317349e-05, + "loss": 0.8503, + "step": 36165 + }, + { + "epoch": 1.392492781520693, + "grad_norm": 1.9838260412216187, + "learning_rate": 4.226109733760777e-05, + "loss": 0.9066, + "step": 36170 + }, + { + "epoch": 1.3926852743022136, + "grad_norm": 1.032500147819519, + "learning_rate": 4.223641027188226e-05, + "loss": 0.7624, + "step": 36175 + }, + { + "epoch": 1.3928777670837342, + "grad_norm": 2.5539069175720215, + "learning_rate": 4.221172848825432e-05, + "loss": 0.971, + "step": 36180 + }, + { + "epoch": 1.3930702598652551, + "grad_norm": 1.1746537685394287, + "learning_rate": 4.218705198898102e-05, + "loss": 0.8605, + "step": 36185 + }, + { + "epoch": 1.3932627526467758, + "grad_norm": 1.7174090147018433, + "learning_rate": 4.216238077631882e-05, + "loss": 0.8074, + "step": 36190 + }, + { + "epoch": 1.3934552454282965, + "grad_norm": 1.3414026498794556, + "learning_rate": 4.2137714852523814e-05, + "loss": 0.8017, + "step": 36195 + }, + { + "epoch": 1.3936477382098171, + "grad_norm": 1.387629747390747, + "learning_rate": 4.21130542198514e-05, + "loss": 0.8292, + "step": 36200 + }, + { + "epoch": 1.3938402309913378, + "grad_norm": 1.5130243301391602, + "learning_rate": 4.2088398880556786e-05, + "loss": 0.8602, + "step": 36205 + }, + { + "epoch": 1.3940327237728585, + "grad_norm": 1.6352779865264893, + "learning_rate": 4.206374883689444e-05, + "loss": 0.7357, 
+ "step": 36210 + }, + { + "epoch": 1.3942252165543791, + "grad_norm": 1.0258371829986572, + "learning_rate": 4.203910409111845e-05, + "loss": 0.9261, + "step": 36215 + }, + { + "epoch": 1.3944177093359, + "grad_norm": 1.0647536516189575, + "learning_rate": 4.2014464645482444e-05, + "loss": 0.7186, + "step": 36220 + }, + { + "epoch": 1.3946102021174207, + "grad_norm": 1.0326642990112305, + "learning_rate": 4.198983050223957e-05, + "loss": 0.7769, + "step": 36225 + }, + { + "epoch": 1.3948026948989414, + "grad_norm": 2.0613622665405273, + "learning_rate": 4.196520166364233e-05, + "loss": 0.8248, + "step": 36230 + }, + { + "epoch": 1.394995187680462, + "grad_norm": 2.12548565864563, + "learning_rate": 4.194057813194303e-05, + "loss": 0.872, + "step": 36235 + }, + { + "epoch": 1.3951876804619827, + "grad_norm": 2.1166744232177734, + "learning_rate": 4.191595990939322e-05, + "loss": 0.8744, + "step": 36240 + }, + { + "epoch": 1.3953801732435034, + "grad_norm": 1.4747822284698486, + "learning_rate": 4.18913469982441e-05, + "loss": 0.8514, + "step": 36245 + }, + { + "epoch": 1.395572666025024, + "grad_norm": 1.3667523860931396, + "learning_rate": 4.186673940074637e-05, + "loss": 0.9452, + "step": 36250 + }, + { + "epoch": 1.3957651588065447, + "grad_norm": 1.1476492881774902, + "learning_rate": 4.1842137119150246e-05, + "loss": 0.7274, + "step": 36255 + }, + { + "epoch": 1.3959576515880654, + "grad_norm": 2.258908987045288, + "learning_rate": 4.181754015570536e-05, + "loss": 1.1584, + "step": 36260 + }, + { + "epoch": 1.396150144369586, + "grad_norm": 1.5475572347640991, + "learning_rate": 4.179294851266107e-05, + "loss": 0.8509, + "step": 36265 + }, + { + "epoch": 1.3963426371511067, + "grad_norm": 0.9235596656799316, + "learning_rate": 4.176836219226602e-05, + "loss": 0.6882, + "step": 36270 + }, + { + "epoch": 1.3965351299326274, + "grad_norm": 2.50726580619812, + "learning_rate": 4.17437811967685e-05, + "loss": 0.9346, + "step": 36275 + }, + { + "epoch": 
1.3967276227141483, + "grad_norm": 2.3889918327331543, + "learning_rate": 4.171920552841627e-05, + "loss": 0.7456, + "step": 36280 + }, + { + "epoch": 1.396920115495669, + "grad_norm": 1.233176350593567, + "learning_rate": 4.169463518945667e-05, + "loss": 0.8134, + "step": 36285 + }, + { + "epoch": 1.3971126082771896, + "grad_norm": 1.5915166139602661, + "learning_rate": 4.167007018213639e-05, + "loss": 0.9035, + "step": 36290 + }, + { + "epoch": 1.3973051010587103, + "grad_norm": 1.1252813339233398, + "learning_rate": 4.1645510508701805e-05, + "loss": 0.7778, + "step": 36295 + }, + { + "epoch": 1.397497593840231, + "grad_norm": 1.1730810403823853, + "learning_rate": 4.162095617139871e-05, + "loss": 0.8332, + "step": 36300 + }, + { + "epoch": 1.3976900866217516, + "grad_norm": 1.0456668138504028, + "learning_rate": 4.159640717247245e-05, + "loss": 0.8277, + "step": 36305 + }, + { + "epoch": 1.3978825794032723, + "grad_norm": 1.0216035842895508, + "learning_rate": 4.157186351416791e-05, + "loss": 0.8409, + "step": 36310 + }, + { + "epoch": 1.398075072184793, + "grad_norm": 2.1484153270721436, + "learning_rate": 4.154732519872936e-05, + "loss": 0.9005, + "step": 36315 + }, + { + "epoch": 1.3982675649663139, + "grad_norm": 0.9150620698928833, + "learning_rate": 4.15227922284007e-05, + "loss": 0.8182, + "step": 36320 + }, + { + "epoch": 1.3984600577478346, + "grad_norm": 2.366243362426758, + "learning_rate": 4.149826460542532e-05, + "loss": 0.8423, + "step": 36325 + }, + { + "epoch": 1.3986525505293552, + "grad_norm": 1.0907059907913208, + "learning_rate": 4.147374233204611e-05, + "loss": 0.8498, + "step": 36330 + }, + { + "epoch": 1.398845043310876, + "grad_norm": 1.077556848526001, + "learning_rate": 4.1449225410505456e-05, + "loss": 0.8998, + "step": 36335 + }, + { + "epoch": 1.3990375360923966, + "grad_norm": 1.197625756263733, + "learning_rate": 4.1424713843045305e-05, + "loss": 0.7545, + "step": 36340 + }, + { + "epoch": 1.3992300288739172, + "grad_norm": 
1.4147288799285889, + "learning_rate": 4.1400207631906985e-05, + "loss": 0.8045, + "step": 36345 + }, + { + "epoch": 1.399422521655438, + "grad_norm": 2.3983185291290283, + "learning_rate": 4.1375706779331556e-05, + "loss": 1.0072, + "step": 36350 + }, + { + "epoch": 1.3996150144369586, + "grad_norm": 1.1587196588516235, + "learning_rate": 4.1351211287559366e-05, + "loss": 0.7277, + "step": 36355 + }, + { + "epoch": 1.3998075072184792, + "grad_norm": 1.1984206438064575, + "learning_rate": 4.132672115883037e-05, + "loss": 0.8874, + "step": 36360 + }, + { + "epoch": 1.4, + "grad_norm": 1.4345682859420776, + "learning_rate": 4.130223639538406e-05, + "loss": 0.9031, + "step": 36365 + }, + { + "epoch": 1.4001924927815206, + "grad_norm": 1.10869300365448, + "learning_rate": 4.127775699945944e-05, + "loss": 0.7777, + "step": 36370 + }, + { + "epoch": 1.4003849855630413, + "grad_norm": 0.9464773535728455, + "learning_rate": 4.12532829732949e-05, + "loss": 0.7748, + "step": 36375 + }, + { + "epoch": 1.4005774783445621, + "grad_norm": 1.2995601892471313, + "learning_rate": 4.122881431912847e-05, + "loss": 0.9252, + "step": 36380 + }, + { + "epoch": 1.4007699711260828, + "grad_norm": 1.4381591081619263, + "learning_rate": 4.120435103919765e-05, + "loss": 0.7513, + "step": 36385 + }, + { + "epoch": 1.4009624639076035, + "grad_norm": 1.7021411657333374, + "learning_rate": 4.117989313573943e-05, + "loss": 0.9105, + "step": 36390 + }, + { + "epoch": 1.4011549566891242, + "grad_norm": 1.6943551301956177, + "learning_rate": 4.115544061099039e-05, + "loss": 0.7316, + "step": 36395 + }, + { + "epoch": 1.4013474494706448, + "grad_norm": 1.5277372598648071, + "learning_rate": 4.1130993467186454e-05, + "loss": 0.8301, + "step": 36400 + }, + { + "epoch": 1.4015399422521655, + "grad_norm": 1.3319824934005737, + "learning_rate": 4.110655170656319e-05, + "loss": 0.78, + "step": 36405 + }, + { + "epoch": 1.4017324350336862, + "grad_norm": 1.3265659809112549, + "learning_rate": 
4.108211533135563e-05, + "loss": 0.8302, + "step": 36410 + }, + { + "epoch": 1.401924927815207, + "grad_norm": 1.4805601835250854, + "learning_rate": 4.1057684343798376e-05, + "loss": 0.8165, + "step": 36415 + }, + { + "epoch": 1.4021174205967277, + "grad_norm": 1.1178621053695679, + "learning_rate": 4.103325874612536e-05, + "loss": 0.7693, + "step": 36420 + }, + { + "epoch": 1.4023099133782484, + "grad_norm": 2.39166522026062, + "learning_rate": 4.1008838540570284e-05, + "loss": 0.8679, + "step": 36425 + }, + { + "epoch": 1.402502406159769, + "grad_norm": 1.941292405128479, + "learning_rate": 4.098442372936611e-05, + "loss": 0.8451, + "step": 36430 + }, + { + "epoch": 1.4026948989412897, + "grad_norm": 0.9209627509117126, + "learning_rate": 4.0960014314745435e-05, + "loss": 0.8751, + "step": 36435 + }, + { + "epoch": 1.4028873917228104, + "grad_norm": 1.5222151279449463, + "learning_rate": 4.093561029894034e-05, + "loss": 0.8304, + "step": 36440 + }, + { + "epoch": 1.403079884504331, + "grad_norm": 1.1667454242706299, + "learning_rate": 4.091121168418247e-05, + "loss": 0.8462, + "step": 36445 + }, + { + "epoch": 1.4032723772858517, + "grad_norm": 1.9360030889511108, + "learning_rate": 4.088681847270278e-05, + "loss": 0.9054, + "step": 36450 + }, + { + "epoch": 1.4034648700673724, + "grad_norm": 0.9800208806991577, + "learning_rate": 4.086243066673203e-05, + "loss": 0.8068, + "step": 36455 + }, + { + "epoch": 1.403657362848893, + "grad_norm": 1.7991399765014648, + "learning_rate": 4.083804826850022e-05, + "loss": 0.6918, + "step": 36460 + }, + { + "epoch": 1.4038498556304138, + "grad_norm": 1.230858564376831, + "learning_rate": 4.081367128023697e-05, + "loss": 0.7422, + "step": 36465 + }, + { + "epoch": 1.4040423484119344, + "grad_norm": 1.1397539377212524, + "learning_rate": 4.0789299704171405e-05, + "loss": 0.8296, + "step": 36470 + }, + { + "epoch": 1.4042348411934553, + "grad_norm": 0.9111887812614441, + "learning_rate": 4.076493354253219e-05, + "loss": 0.7259, 
+ "step": 36475 + }, + { + "epoch": 1.404427333974976, + "grad_norm": 1.0511916875839233, + "learning_rate": 4.074057279754733e-05, + "loss": 0.7495, + "step": 36480 + }, + { + "epoch": 1.4046198267564967, + "grad_norm": 1.596725344657898, + "learning_rate": 4.071621747144461e-05, + "loss": 0.8516, + "step": 36485 + }, + { + "epoch": 1.4048123195380173, + "grad_norm": 1.4566305875778198, + "learning_rate": 4.069186756645104e-05, + "loss": 0.9033, + "step": 36490 + }, + { + "epoch": 1.405004812319538, + "grad_norm": 1.7779717445373535, + "learning_rate": 4.06675230847933e-05, + "loss": 0.9614, + "step": 36495 + }, + { + "epoch": 1.4051973051010587, + "grad_norm": 1.1408967971801758, + "learning_rate": 4.064318402869753e-05, + "loss": 0.8547, + "step": 36500 + }, + { + "epoch": 1.4053897978825793, + "grad_norm": 0.9641178846359253, + "learning_rate": 4.061885040038942e-05, + "loss": 0.7535, + "step": 36505 + }, + { + "epoch": 1.4055822906641002, + "grad_norm": 1.1024551391601562, + "learning_rate": 4.059452220209403e-05, + "loss": 0.7627, + "step": 36510 + }, + { + "epoch": 1.405774783445621, + "grad_norm": 1.3117225170135498, + "learning_rate": 4.057019943603607e-05, + "loss": 0.96, + "step": 36515 + }, + { + "epoch": 1.4059672762271416, + "grad_norm": 1.1285604238510132, + "learning_rate": 4.054588210443969e-05, + "loss": 0.8481, + "step": 36520 + }, + { + "epoch": 1.4061597690086622, + "grad_norm": 1.3916871547698975, + "learning_rate": 4.0521570209528534e-05, + "loss": 0.8622, + "step": 36525 + }, + { + "epoch": 1.406352261790183, + "grad_norm": 1.8372492790222168, + "learning_rate": 4.049726375352582e-05, + "loss": 0.8732, + "step": 36530 + }, + { + "epoch": 1.4065447545717036, + "grad_norm": 1.3047436475753784, + "learning_rate": 4.047296273865412e-05, + "loss": 0.8297, + "step": 36535 + }, + { + "epoch": 1.4067372473532243, + "grad_norm": 1.984266757965088, + "learning_rate": 4.044866716713565e-05, + "loss": 0.7249, + "step": 36540 + }, + { + "epoch": 
1.406929740134745, + "grad_norm": 1.0713486671447754, + "learning_rate": 4.0424377041192075e-05, + "loss": 0.7389, + "step": 36545 + }, + { + "epoch": 1.4071222329162656, + "grad_norm": 1.0382349491119385, + "learning_rate": 4.0400092363044594e-05, + "loss": 0.7965, + "step": 36550 + }, + { + "epoch": 1.4073147256977863, + "grad_norm": 2.4277896881103516, + "learning_rate": 4.03758131349138e-05, + "loss": 0.7452, + "step": 36555 + }, + { + "epoch": 1.407507218479307, + "grad_norm": 1.7688560485839844, + "learning_rate": 4.0351539359019985e-05, + "loss": 0.8284, + "step": 36560 + }, + { + "epoch": 1.4076997112608276, + "grad_norm": 1.1814532279968262, + "learning_rate": 4.0327271037582726e-05, + "loss": 0.872, + "step": 36565 + }, + { + "epoch": 1.4078922040423485, + "grad_norm": 2.220003128051758, + "learning_rate": 4.0303008172821235e-05, + "loss": 0.8533, + "step": 36570 + }, + { + "epoch": 1.4080846968238692, + "grad_norm": 1.1941807270050049, + "learning_rate": 4.0278750766954176e-05, + "loss": 0.7296, + "step": 36575 + }, + { + "epoch": 1.4082771896053898, + "grad_norm": 1.4177144765853882, + "learning_rate": 4.025449882219979e-05, + "loss": 0.8877, + "step": 36580 + }, + { + "epoch": 1.4084696823869105, + "grad_norm": 1.0108013153076172, + "learning_rate": 4.023025234077564e-05, + "loss": 0.9344, + "step": 36585 + }, + { + "epoch": 1.4086621751684312, + "grad_norm": 1.2472012042999268, + "learning_rate": 4.0206011324899054e-05, + "loss": 0.9062, + "step": 36590 + }, + { + "epoch": 1.4088546679499518, + "grad_norm": 1.353990077972412, + "learning_rate": 4.018177577678658e-05, + "loss": 0.7711, + "step": 36595 + }, + { + "epoch": 1.4090471607314725, + "grad_norm": 1.1395666599273682, + "learning_rate": 4.0157545698654465e-05, + "loss": 0.6574, + "step": 36600 + }, + { + "epoch": 1.4092396535129932, + "grad_norm": 1.177996039390564, + "learning_rate": 4.013332109271837e-05, + "loss": 0.8385, + "step": 36605 + }, + { + "epoch": 1.409432146294514, + "grad_norm": 
1.8291305303573608, + "learning_rate": 4.0109101961193474e-05, + "loss": 0.8298, + "step": 36610 + }, + { + "epoch": 1.4096246390760347, + "grad_norm": 1.83578622341156, + "learning_rate": 4.008488830629451e-05, + "loss": 1.0385, + "step": 36615 + }, + { + "epoch": 1.4098171318575554, + "grad_norm": 1.2734320163726807, + "learning_rate": 4.0060680130235564e-05, + "loss": 0.8062, + "step": 36620 + }, + { + "epoch": 1.410009624639076, + "grad_norm": 0.9485237002372742, + "learning_rate": 4.003647743523037e-05, + "loss": 0.958, + "step": 36625 + }, + { + "epoch": 1.4102021174205968, + "grad_norm": 1.1491374969482422, + "learning_rate": 4.0012280223492084e-05, + "loss": 0.7825, + "step": 36630 + }, + { + "epoch": 1.4103946102021174, + "grad_norm": 1.9539427757263184, + "learning_rate": 3.9988088497233424e-05, + "loss": 0.7944, + "step": 36635 + }, + { + "epoch": 1.410587102983638, + "grad_norm": 1.9131194353103638, + "learning_rate": 3.9963902258666465e-05, + "loss": 0.9911, + "step": 36640 + }, + { + "epoch": 1.4107795957651588, + "grad_norm": 0.8639621734619141, + "learning_rate": 3.9939721510003e-05, + "loss": 0.7923, + "step": 36645 + }, + { + "epoch": 1.4109720885466794, + "grad_norm": 1.7485369443893433, + "learning_rate": 3.991554625345412e-05, + "loss": 0.7266, + "step": 36650 + }, + { + "epoch": 1.4111645813282, + "grad_norm": 0.921247124671936, + "learning_rate": 3.989137649123051e-05, + "loss": 0.7662, + "step": 36655 + }, + { + "epoch": 1.4113570741097208, + "grad_norm": 0.8794036507606506, + "learning_rate": 3.986721222554232e-05, + "loss": 0.6367, + "step": 36660 + }, + { + "epoch": 1.4115495668912414, + "grad_norm": 1.2151905298233032, + "learning_rate": 3.984305345859928e-05, + "loss": 0.7589, + "step": 36665 + }, + { + "epoch": 1.4117420596727623, + "grad_norm": 1.3154112100601196, + "learning_rate": 3.981890019261042e-05, + "loss": 1.0049, + "step": 36670 + }, + { + "epoch": 1.411934552454283, + "grad_norm": 1.4974151849746704, + "learning_rate": 
3.979475242978454e-05, + "loss": 0.7273, + "step": 36675 + }, + { + "epoch": 1.4121270452358037, + "grad_norm": 1.543226718902588, + "learning_rate": 3.97706101723297e-05, + "loss": 0.8911, + "step": 36680 + }, + { + "epoch": 1.4123195380173243, + "grad_norm": 1.6745073795318604, + "learning_rate": 3.974647342245358e-05, + "loss": 0.7785, + "step": 36685 + }, + { + "epoch": 1.412512030798845, + "grad_norm": 0.8186720609664917, + "learning_rate": 3.972234218236331e-05, + "loss": 0.8083, + "step": 36690 + }, + { + "epoch": 1.4127045235803657, + "grad_norm": 1.7771086692810059, + "learning_rate": 3.969821645426559e-05, + "loss": 0.6911, + "step": 36695 + }, + { + "epoch": 1.4128970163618864, + "grad_norm": 1.0022178888320923, + "learning_rate": 3.9674096240366445e-05, + "loss": 0.9787, + "step": 36700 + }, + { + "epoch": 1.4130895091434073, + "grad_norm": 0.9852539896965027, + "learning_rate": 3.964998154287164e-05, + "loss": 0.7145, + "step": 36705 + }, + { + "epoch": 1.413282001924928, + "grad_norm": 1.1110121011734009, + "learning_rate": 3.9625872363986205e-05, + "loss": 0.7933, + "step": 36710 + }, + { + "epoch": 1.4134744947064486, + "grad_norm": 1.6256468296051025, + "learning_rate": 3.96017687059148e-05, + "loss": 1.0778, + "step": 36715 + }, + { + "epoch": 1.4136669874879693, + "grad_norm": 0.9198029041290283, + "learning_rate": 3.957767057086158e-05, + "loss": 0.6291, + "step": 36720 + }, + { + "epoch": 1.41385948026949, + "grad_norm": 0.8681755065917969, + "learning_rate": 3.9553577961030094e-05, + "loss": 0.8225, + "step": 36725 + }, + { + "epoch": 1.4140519730510106, + "grad_norm": 0.9288125038146973, + "learning_rate": 3.952949087862349e-05, + "loss": 0.7758, + "step": 36730 + }, + { + "epoch": 1.4142444658325313, + "grad_norm": 1.3477758169174194, + "learning_rate": 3.9505409325844346e-05, + "loss": 0.8854, + "step": 36735 + }, + { + "epoch": 1.414436958614052, + "grad_norm": 1.035380482673645, + "learning_rate": 3.948133330489483e-05, + "loss": 0.9035, 
+ "step": 36740 + }, + { + "epoch": 1.4146294513955726, + "grad_norm": 1.4395008087158203, + "learning_rate": 3.9457262817976405e-05, + "loss": 0.8584, + "step": 36745 + }, + { + "epoch": 1.4148219441770933, + "grad_norm": 1.6281189918518066, + "learning_rate": 3.943319786729031e-05, + "loss": 0.8451, + "step": 36750 + }, + { + "epoch": 1.415014436958614, + "grad_norm": 1.3493483066558838, + "learning_rate": 3.940913845503701e-05, + "loss": 0.9527, + "step": 36755 + }, + { + "epoch": 1.4152069297401346, + "grad_norm": 1.200760841369629, + "learning_rate": 3.938508458341663e-05, + "loss": 0.812, + "step": 36760 + }, + { + "epoch": 1.4153994225216555, + "grad_norm": 1.3952566385269165, + "learning_rate": 3.9361036254628726e-05, + "loss": 0.9364, + "step": 36765 + }, + { + "epoch": 1.4155919153031762, + "grad_norm": 1.607851266860962, + "learning_rate": 3.9336993470872385e-05, + "loss": 0.8368, + "step": 36770 + }, + { + "epoch": 1.4157844080846969, + "grad_norm": 1.7063467502593994, + "learning_rate": 3.931295623434608e-05, + "loss": 0.8981, + "step": 36775 + }, + { + "epoch": 1.4159769008662175, + "grad_norm": 2.502572536468506, + "learning_rate": 3.928892454724797e-05, + "loss": 0.8994, + "step": 36780 + }, + { + "epoch": 1.4161693936477382, + "grad_norm": 2.07956862449646, + "learning_rate": 3.926489841177551e-05, + "loss": 1.1487, + "step": 36785 + }, + { + "epoch": 1.4163618864292589, + "grad_norm": 1.2496376037597656, + "learning_rate": 3.924087783012575e-05, + "loss": 0.7492, + "step": 36790 + }, + { + "epoch": 1.4165543792107795, + "grad_norm": 1.3253594636917114, + "learning_rate": 3.921686280449523e-05, + "loss": 0.8016, + "step": 36795 + }, + { + "epoch": 1.4167468719923004, + "grad_norm": 0.9799445271492004, + "learning_rate": 3.919285333707998e-05, + "loss": 0.7401, + "step": 36800 + }, + { + "epoch": 1.416939364773821, + "grad_norm": 0.9721969366073608, + "learning_rate": 3.916884943007541e-05, + "loss": 0.6816, + "step": 36805 + }, + { + "epoch": 
1.4171318575553418, + "grad_norm": 0.9141532778739929, + "learning_rate": 3.914485108567667e-05, + "loss": 0.8165, + "step": 36810 + }, + { + "epoch": 1.4173243503368624, + "grad_norm": 1.3266597986221313, + "learning_rate": 3.912085830607813e-05, + "loss": 0.8702, + "step": 36815 + }, + { + "epoch": 1.417516843118383, + "grad_norm": 1.6577297449111938, + "learning_rate": 3.909687109347382e-05, + "loss": 0.7582, + "step": 36820 + }, + { + "epoch": 1.4177093358999038, + "grad_norm": 1.3083659410476685, + "learning_rate": 3.9072889450057194e-05, + "loss": 0.7892, + "step": 36825 + }, + { + "epoch": 1.4179018286814244, + "grad_norm": 1.1180342435836792, + "learning_rate": 3.904891337802126e-05, + "loss": 0.9944, + "step": 36830 + }, + { + "epoch": 1.4180943214629451, + "grad_norm": 1.2931442260742188, + "learning_rate": 3.902494287955839e-05, + "loss": 0.769, + "step": 36835 + }, + { + "epoch": 1.4182868142444658, + "grad_norm": 1.012529730796814, + "learning_rate": 3.900097795686057e-05, + "loss": 0.6058, + "step": 36840 + }, + { + "epoch": 1.4184793070259865, + "grad_norm": 1.3702960014343262, + "learning_rate": 3.897701861211924e-05, + "loss": 0.8077, + "step": 36845 + }, + { + "epoch": 1.4186717998075071, + "grad_norm": 1.4221330881118774, + "learning_rate": 3.8953064847525324e-05, + "loss": 0.7043, + "step": 36850 + }, + { + "epoch": 1.4188642925890278, + "grad_norm": 1.2348641157150269, + "learning_rate": 3.892911666526926e-05, + "loss": 0.9082, + "step": 36855 + }, + { + "epoch": 1.4190567853705485, + "grad_norm": 1.2621009349822998, + "learning_rate": 3.890517406754085e-05, + "loss": 0.8831, + "step": 36860 + }, + { + "epoch": 1.4192492781520694, + "grad_norm": 1.2773290872573853, + "learning_rate": 3.888123705652962e-05, + "loss": 0.7864, + "step": 36865 + }, + { + "epoch": 1.41944177093359, + "grad_norm": 1.2234888076782227, + "learning_rate": 3.8857305634424356e-05, + "loss": 0.7859, + "step": 36870 + }, + { + "epoch": 1.4196342637151107, + "grad_norm": 
1.1090830564498901, + "learning_rate": 3.8833379803413463e-05, + "loss": 0.8045, + "step": 36875 + }, + { + "epoch": 1.4198267564966314, + "grad_norm": 1.4224649667739868, + "learning_rate": 3.8809459565684784e-05, + "loss": 0.8793, + "step": 36880 + }, + { + "epoch": 1.420019249278152, + "grad_norm": 1.293678879737854, + "learning_rate": 3.878554492342572e-05, + "loss": 0.7928, + "step": 36885 + }, + { + "epoch": 1.4202117420596727, + "grad_norm": 1.27838134765625, + "learning_rate": 3.876163587882299e-05, + "loss": 0.8078, + "step": 36890 + }, + { + "epoch": 1.4204042348411934, + "grad_norm": 0.9022307991981506, + "learning_rate": 3.873773243406306e-05, + "loss": 0.8416, + "step": 36895 + }, + { + "epoch": 1.4205967276227143, + "grad_norm": 1.2606749534606934, + "learning_rate": 3.8713834591331646e-05, + "loss": 0.816, + "step": 36900 + }, + { + "epoch": 1.420789220404235, + "grad_norm": 1.7780064344406128, + "learning_rate": 3.868994235281407e-05, + "loss": 0.9733, + "step": 36905 + }, + { + "epoch": 1.4209817131857556, + "grad_norm": 1.242295742034912, + "learning_rate": 3.8666055720695126e-05, + "loss": 0.6507, + "step": 36910 + }, + { + "epoch": 1.4211742059672763, + "grad_norm": 1.5196983814239502, + "learning_rate": 3.864217469715912e-05, + "loss": 0.8056, + "step": 36915 + }, + { + "epoch": 1.421366698748797, + "grad_norm": 0.9677949547767639, + "learning_rate": 3.8618299284389747e-05, + "loss": 0.8748, + "step": 36920 + }, + { + "epoch": 1.4215591915303176, + "grad_norm": 0.9580836892127991, + "learning_rate": 3.859442948457028e-05, + "loss": 0.8544, + "step": 36925 + }, + { + "epoch": 1.4217516843118383, + "grad_norm": 1.5484107732772827, + "learning_rate": 3.857056529988347e-05, + "loss": 0.8237, + "step": 36930 + }, + { + "epoch": 1.421944177093359, + "grad_norm": 0.9140622019767761, + "learning_rate": 3.854670673251153e-05, + "loss": 0.6624, + "step": 36935 + }, + { + "epoch": 1.4221366698748796, + "grad_norm": 1.0796998739242554, + "learning_rate": 
3.852285378463619e-05, + "loss": 0.7723, + "step": 36940 + }, + { + "epoch": 1.4223291626564003, + "grad_norm": 0.9047008752822876, + "learning_rate": 3.84990064584386e-05, + "loss": 0.787, + "step": 36945 + }, + { + "epoch": 1.422521655437921, + "grad_norm": 2.0904667377471924, + "learning_rate": 3.847516475609947e-05, + "loss": 0.8196, + "step": 36950 + }, + { + "epoch": 1.4227141482194416, + "grad_norm": 1.2479698657989502, + "learning_rate": 3.845132867979895e-05, + "loss": 0.8103, + "step": 36955 + }, + { + "epoch": 1.4229066410009625, + "grad_norm": 1.6270309686660767, + "learning_rate": 3.842749823171673e-05, + "loss": 0.8852, + "step": 36960 + }, + { + "epoch": 1.4230991337824832, + "grad_norm": 1.9030733108520508, + "learning_rate": 3.840367341403185e-05, + "loss": 0.9027, + "step": 36965 + }, + { + "epoch": 1.4232916265640039, + "grad_norm": 1.098138689994812, + "learning_rate": 3.837985422892307e-05, + "loss": 0.8737, + "step": 36970 + }, + { + "epoch": 1.4234841193455245, + "grad_norm": 1.9470241069793701, + "learning_rate": 3.8356040678568393e-05, + "loss": 0.7401, + "step": 36975 + }, + { + "epoch": 1.4236766121270452, + "grad_norm": 1.122239589691162, + "learning_rate": 3.833223276514544e-05, + "loss": 0.873, + "step": 36980 + }, + { + "epoch": 1.4238691049085659, + "grad_norm": 1.3224730491638184, + "learning_rate": 3.830843049083128e-05, + "loss": 0.7331, + "step": 36985 + }, + { + "epoch": 1.4240615976900866, + "grad_norm": 1.4504386186599731, + "learning_rate": 3.8284633857802524e-05, + "loss": 0.7491, + "step": 36990 + }, + { + "epoch": 1.4242540904716074, + "grad_norm": 1.9830818176269531, + "learning_rate": 3.82608428682351e-05, + "loss": 0.8795, + "step": 36995 + }, + { + "epoch": 1.4244465832531281, + "grad_norm": 0.9479761719703674, + "learning_rate": 3.823705752430469e-05, + "loss": 0.972, + "step": 37000 + }, + { + "epoch": 1.4246390760346488, + "grad_norm": 2.649916172027588, + "learning_rate": 3.821327782818619e-05, + "loss": 0.8017, + 
"step": 37005 + }, + { + "epoch": 1.4248315688161695, + "grad_norm": 1.3527096509933472, + "learning_rate": 3.8189503782054135e-05, + "loss": 0.796, + "step": 37010 + }, + { + "epoch": 1.4250240615976901, + "grad_norm": 1.6791696548461914, + "learning_rate": 3.816573538808249e-05, + "loss": 0.7578, + "step": 37015 + }, + { + "epoch": 1.4252165543792108, + "grad_norm": 1.8477129936218262, + "learning_rate": 3.814197264844478e-05, + "loss": 0.7141, + "step": 37020 + }, + { + "epoch": 1.4254090471607315, + "grad_norm": 1.5426512956619263, + "learning_rate": 3.811821556531382e-05, + "loss": 0.8485, + "step": 37025 + }, + { + "epoch": 1.4256015399422521, + "grad_norm": 1.2095710039138794, + "learning_rate": 3.809446414086218e-05, + "loss": 0.8202, + "step": 37030 + }, + { + "epoch": 1.4257940327237728, + "grad_norm": 1.1557666063308716, + "learning_rate": 3.8070718377261696e-05, + "loss": 0.7613, + "step": 37035 + }, + { + "epoch": 1.4259865255052935, + "grad_norm": 1.4314980506896973, + "learning_rate": 3.8046978276683756e-05, + "loss": 0.8334, + "step": 37040 + }, + { + "epoch": 1.4261790182868141, + "grad_norm": 0.7171056866645813, + "learning_rate": 3.802324384129925e-05, + "loss": 0.886, + "step": 37045 + }, + { + "epoch": 1.4263715110683348, + "grad_norm": 2.3960771560668945, + "learning_rate": 3.799951507327858e-05, + "loss": 0.857, + "step": 37050 + }, + { + "epoch": 1.4265640038498557, + "grad_norm": 1.7112245559692383, + "learning_rate": 3.7975791974791494e-05, + "loss": 0.7698, + "step": 37055 + }, + { + "epoch": 1.4267564966313764, + "grad_norm": 1.118683934211731, + "learning_rate": 3.795207454800737e-05, + "loss": 0.6801, + "step": 37060 + }, + { + "epoch": 1.426948989412897, + "grad_norm": 1.9905716180801392, + "learning_rate": 3.7928362795095e-05, + "loss": 0.793, + "step": 37065 + }, + { + "epoch": 1.4271414821944177, + "grad_norm": 1.2646980285644531, + "learning_rate": 3.790465671822265e-05, + "loss": 0.8383, + "step": 37070 + }, + { + "epoch": 
1.4273339749759384, + "grad_norm": 1.1731975078582764, + "learning_rate": 3.7880956319558146e-05, + "loss": 0.7778, + "step": 37075 + }, + { + "epoch": 1.427526467757459, + "grad_norm": 1.5719605684280396, + "learning_rate": 3.785726160126864e-05, + "loss": 0.8979, + "step": 37080 + }, + { + "epoch": 1.4277189605389797, + "grad_norm": 1.1388436555862427, + "learning_rate": 3.7833572565520924e-05, + "loss": 0.8414, + "step": 37085 + }, + { + "epoch": 1.4279114533205004, + "grad_norm": 1.2077659368515015, + "learning_rate": 3.7809889214481165e-05, + "loss": 0.8771, + "step": 37090 + }, + { + "epoch": 1.4281039461020213, + "grad_norm": 1.342726469039917, + "learning_rate": 3.778621155031511e-05, + "loss": 0.724, + "step": 37095 + }, + { + "epoch": 1.428296438883542, + "grad_norm": 0.8151179552078247, + "learning_rate": 3.77625395751878e-05, + "loss": 0.7926, + "step": 37100 + }, + { + "epoch": 1.4284889316650626, + "grad_norm": 1.3730394840240479, + "learning_rate": 3.773887329126404e-05, + "loss": 0.9403, + "step": 37105 + }, + { + "epoch": 1.4286814244465833, + "grad_norm": 1.6528422832489014, + "learning_rate": 3.77152127007078e-05, + "loss": 0.796, + "step": 37110 + }, + { + "epoch": 1.428873917228104, + "grad_norm": 1.3892961740493774, + "learning_rate": 3.769155780568283e-05, + "loss": 0.8341, + "step": 37115 + }, + { + "epoch": 1.4290664100096246, + "grad_norm": 0.6289898753166199, + "learning_rate": 3.766790860835211e-05, + "loss": 0.7489, + "step": 37120 + }, + { + "epoch": 1.4292589027911453, + "grad_norm": 1.1175183057785034, + "learning_rate": 3.764426511087823e-05, + "loss": 0.9842, + "step": 37125 + }, + { + "epoch": 1.429451395572666, + "grad_norm": 1.6422615051269531, + "learning_rate": 3.762062731542324e-05, + "loss": 0.8007, + "step": 37130 + }, + { + "epoch": 1.4296438883541867, + "grad_norm": 2.0091776847839355, + "learning_rate": 3.759699522414869e-05, + "loss": 0.7446, + "step": 37135 + }, + { + "epoch": 1.4298363811357073, + "grad_norm": 
1.1759101152420044, + "learning_rate": 3.757336883921552e-05, + "loss": 0.7502, + "step": 37140 + }, + { + "epoch": 1.430028873917228, + "grad_norm": 1.3325209617614746, + "learning_rate": 3.754974816278422e-05, + "loss": 0.7311, + "step": 37145 + }, + { + "epoch": 1.4302213666987487, + "grad_norm": 0.9399605989456177, + "learning_rate": 3.752613319701476e-05, + "loss": 0.7261, + "step": 37150 + }, + { + "epoch": 1.4304138594802696, + "grad_norm": 0.8700345754623413, + "learning_rate": 3.750252394406656e-05, + "loss": 0.7377, + "step": 37155 + }, + { + "epoch": 1.4306063522617902, + "grad_norm": 1.1583389043807983, + "learning_rate": 3.747892040609858e-05, + "loss": 0.6968, + "step": 37160 + }, + { + "epoch": 1.430798845043311, + "grad_norm": 0.9834827184677124, + "learning_rate": 3.7455322585269125e-05, + "loss": 0.8028, + "step": 37165 + }, + { + "epoch": 1.4309913378248316, + "grad_norm": 1.4735242128372192, + "learning_rate": 3.74317304837361e-05, + "loss": 0.8179, + "step": 37170 + }, + { + "epoch": 1.4311838306063522, + "grad_norm": 1.5621691942214966, + "learning_rate": 3.740814410365685e-05, + "loss": 0.8056, + "step": 37175 + }, + { + "epoch": 1.431376323387873, + "grad_norm": 3.325570583343506, + "learning_rate": 3.7384563447188226e-05, + "loss": 0.9204, + "step": 37180 + }, + { + "epoch": 1.4315688161693936, + "grad_norm": 0.9473647475242615, + "learning_rate": 3.736098851648641e-05, + "loss": 0.9339, + "step": 37185 + }, + { + "epoch": 1.4317613089509145, + "grad_norm": 1.8556329011917114, + "learning_rate": 3.733741931370731e-05, + "loss": 0.8082, + "step": 37190 + }, + { + "epoch": 1.4319538017324351, + "grad_norm": 1.158944845199585, + "learning_rate": 3.73138558410061e-05, + "loss": 0.8515, + "step": 37195 + }, + { + "epoch": 1.4321462945139558, + "grad_norm": 1.7924774885177612, + "learning_rate": 3.729029810053749e-05, + "loss": 0.904, + "step": 37200 + }, + { + "epoch": 1.4323387872954765, + "grad_norm": 1.753733515739441, + "learning_rate": 
3.726674609445572e-05, + "loss": 0.9027, + "step": 37205 + }, + { + "epoch": 1.4325312800769971, + "grad_norm": 1.4341967105865479, + "learning_rate": 3.7243199824914484e-05, + "loss": 1.0106, + "step": 37210 + }, + { + "epoch": 1.4327237728585178, + "grad_norm": 1.0536222457885742, + "learning_rate": 3.721965929406682e-05, + "loss": 0.8197, + "step": 37215 + }, + { + "epoch": 1.4329162656400385, + "grad_norm": 1.952141284942627, + "learning_rate": 3.719612450406551e-05, + "loss": 0.8157, + "step": 37220 + }, + { + "epoch": 1.4331087584215592, + "grad_norm": 2.054227113723755, + "learning_rate": 3.717259545706254e-05, + "loss": 0.859, + "step": 37225 + }, + { + "epoch": 1.4333012512030798, + "grad_norm": 0.9729071259498596, + "learning_rate": 3.714907215520952e-05, + "loss": 0.8556, + "step": 37230 + }, + { + "epoch": 1.4334937439846005, + "grad_norm": 1.8992177248001099, + "learning_rate": 3.712555460065751e-05, + "loss": 0.97, + "step": 37235 + }, + { + "epoch": 1.4336862367661212, + "grad_norm": 1.2505042552947998, + "learning_rate": 3.710204279555705e-05, + "loss": 0.8497, + "step": 37240 + }, + { + "epoch": 1.4338787295476418, + "grad_norm": 1.4273964166641235, + "learning_rate": 3.707853674205806e-05, + "loss": 0.8451, + "step": 37245 + }, + { + "epoch": 1.4340712223291627, + "grad_norm": 1.104282259941101, + "learning_rate": 3.705503644231012e-05, + "loss": 0.8491, + "step": 37250 + }, + { + "epoch": 1.4342637151106834, + "grad_norm": 1.683293104171753, + "learning_rate": 3.7031541898462096e-05, + "loss": 0.8464, + "step": 37255 + }, + { + "epoch": 1.434456207892204, + "grad_norm": 1.0647557973861694, + "learning_rate": 3.700805311266244e-05, + "loss": 0.7783, + "step": 37260 + }, + { + "epoch": 1.4346487006737247, + "grad_norm": 1.2208470106124878, + "learning_rate": 3.6984570087059045e-05, + "loss": 0.7758, + "step": 37265 + }, + { + "epoch": 1.4348411934552454, + "grad_norm": 1.9771546125411987, + "learning_rate": 3.6961092823799314e-05, + "loss": 0.9314, 
+ "step": 37270 + }, + { + "epoch": 1.435033686236766, + "grad_norm": 0.9171108603477478, + "learning_rate": 3.6937621325030016e-05, + "loss": 0.7365, + "step": 37275 + }, + { + "epoch": 1.4352261790182868, + "grad_norm": 1.75246000289917, + "learning_rate": 3.691415559289749e-05, + "loss": 0.8452, + "step": 37280 + }, + { + "epoch": 1.4354186717998076, + "grad_norm": 1.3954969644546509, + "learning_rate": 3.6890695629547564e-05, + "loss": 0.8532, + "step": 37285 + }, + { + "epoch": 1.4356111645813283, + "grad_norm": 1.296316146850586, + "learning_rate": 3.68672414371254e-05, + "loss": 0.7224, + "step": 37290 + }, + { + "epoch": 1.435803657362849, + "grad_norm": 1.0747795104980469, + "learning_rate": 3.684379301777585e-05, + "loss": 0.9024, + "step": 37295 + }, + { + "epoch": 1.4359961501443697, + "grad_norm": 1.5699114799499512, + "learning_rate": 3.682035037364303e-05, + "loss": 0.9244, + "step": 37300 + }, + { + "epoch": 1.4361886429258903, + "grad_norm": 2.6069118976593018, + "learning_rate": 3.679691350687064e-05, + "loss": 0.645, + "step": 37305 + }, + { + "epoch": 1.436381135707411, + "grad_norm": 1.1996586322784424, + "learning_rate": 3.6773482419601826e-05, + "loss": 0.7952, + "step": 37310 + }, + { + "epoch": 1.4365736284889317, + "grad_norm": 1.3153495788574219, + "learning_rate": 3.675005711397924e-05, + "loss": 0.9398, + "step": 37315 + }, + { + "epoch": 1.4367661212704523, + "grad_norm": 1.404503583908081, + "learning_rate": 3.672663759214487e-05, + "loss": 0.845, + "step": 37320 + }, + { + "epoch": 1.436958614051973, + "grad_norm": 0.9732365012168884, + "learning_rate": 3.670322385624042e-05, + "loss": 0.8258, + "step": 37325 + }, + { + "epoch": 1.4371511068334937, + "grad_norm": 1.2123562097549438, + "learning_rate": 3.667981590840681e-05, + "loss": 0.8841, + "step": 37330 + }, + { + "epoch": 1.4373435996150143, + "grad_norm": 2.156731367111206, + "learning_rate": 3.6656413750784565e-05, + "loss": 0.9473, + "step": 37335 + }, + { + "epoch": 
1.437536092396535, + "grad_norm": 1.4987213611602783, + "learning_rate": 3.6633017385513676e-05, + "loss": 0.7732, + "step": 37340 + }, + { + "epoch": 1.4377285851780557, + "grad_norm": 1.517491102218628, + "learning_rate": 3.660962681473362e-05, + "loss": 0.8491, + "step": 37345 + }, + { + "epoch": 1.4379210779595766, + "grad_norm": 1.0631245374679565, + "learning_rate": 3.65862420405832e-05, + "loss": 0.6431, + "step": 37350 + }, + { + "epoch": 1.4381135707410972, + "grad_norm": 1.114632248878479, + "learning_rate": 3.656286306520094e-05, + "loss": 0.8981, + "step": 37355 + }, + { + "epoch": 1.438306063522618, + "grad_norm": 2.538287401199341, + "learning_rate": 3.6539489890724575e-05, + "loss": 0.8646, + "step": 37360 + }, + { + "epoch": 1.4384985563041386, + "grad_norm": 1.4105244874954224, + "learning_rate": 3.651612251929147e-05, + "loss": 0.9686, + "step": 37365 + }, + { + "epoch": 1.4386910490856593, + "grad_norm": 1.3305569887161255, + "learning_rate": 3.649276095303843e-05, + "loss": 0.9057, + "step": 37370 + }, + { + "epoch": 1.43888354186718, + "grad_norm": 1.292600154876709, + "learning_rate": 3.646940519410169e-05, + "loss": 0.8515, + "step": 37375 + }, + { + "epoch": 1.4390760346487006, + "grad_norm": 1.3273907899856567, + "learning_rate": 3.644605524461704e-05, + "loss": 0.9926, + "step": 37380 + }, + { + "epoch": 1.4392685274302215, + "grad_norm": 1.1350206136703491, + "learning_rate": 3.6422711106719595e-05, + "loss": 0.9411, + "step": 37385 + }, + { + "epoch": 1.4394610202117422, + "grad_norm": 1.1308443546295166, + "learning_rate": 3.639937278254406e-05, + "loss": 0.7784, + "step": 37390 + }, + { + "epoch": 1.4396535129932628, + "grad_norm": 1.2704259157180786, + "learning_rate": 3.637604027422457e-05, + "loss": 0.7807, + "step": 37395 + }, + { + "epoch": 1.4398460057747835, + "grad_norm": 1.5792492628097534, + "learning_rate": 3.6352713583894746e-05, + "loss": 0.7525, + "step": 37400 + }, + { + "epoch": 1.4400384985563042, + "grad_norm": 
1.706715703010559, + "learning_rate": 3.632939271368758e-05, + "loss": 0.856, + "step": 37405 + }, + { + "epoch": 1.4402309913378248, + "grad_norm": 1.0315080881118774, + "learning_rate": 3.630607766573574e-05, + "loss": 0.907, + "step": 37410 + }, + { + "epoch": 1.4404234841193455, + "grad_norm": 1.4221409559249878, + "learning_rate": 3.628276844217113e-05, + "loss": 0.7504, + "step": 37415 + }, + { + "epoch": 1.4406159769008662, + "grad_norm": 0.8785042762756348, + "learning_rate": 3.6259465045125265e-05, + "loss": 0.8181, + "step": 37420 + }, + { + "epoch": 1.4408084696823868, + "grad_norm": 1.8334077596664429, + "learning_rate": 3.623616747672907e-05, + "loss": 0.8067, + "step": 37425 + }, + { + "epoch": 1.4410009624639075, + "grad_norm": 0.9262105226516724, + "learning_rate": 3.621287573911299e-05, + "loss": 0.8618, + "step": 37430 + }, + { + "epoch": 1.4411934552454282, + "grad_norm": 1.228710651397705, + "learning_rate": 3.618958983440682e-05, + "loss": 0.8697, + "step": 37435 + }, + { + "epoch": 1.4413859480269489, + "grad_norm": 1.6522730588912964, + "learning_rate": 3.616630976474003e-05, + "loss": 0.9191, + "step": 37440 + }, + { + "epoch": 1.4415784408084698, + "grad_norm": 1.1182198524475098, + "learning_rate": 3.6143035532241296e-05, + "loss": 0.818, + "step": 37445 + }, + { + "epoch": 1.4417709335899904, + "grad_norm": 0.9378697872161865, + "learning_rate": 3.611976713903895e-05, + "loss": 0.8066, + "step": 37450 + }, + { + "epoch": 1.441963426371511, + "grad_norm": 1.22636079788208, + "learning_rate": 3.609650458726075e-05, + "loss": 0.6601, + "step": 37455 + }, + { + "epoch": 1.4421559191530318, + "grad_norm": 1.1185452938079834, + "learning_rate": 3.60732478790339e-05, + "loss": 0.7541, + "step": 37460 + }, + { + "epoch": 1.4423484119345524, + "grad_norm": 0.9403701424598694, + "learning_rate": 3.6049997016485024e-05, + "loss": 0.7677, + "step": 37465 + }, + { + "epoch": 1.442540904716073, + "grad_norm": 0.9934571981430054, + "learning_rate": 
3.602675200174031e-05, + "loss": 0.8691, + "step": 37470 + }, + { + "epoch": 1.4427333974975938, + "grad_norm": 1.63606858253479, + "learning_rate": 3.600351283692531e-05, + "loss": 0.7052, + "step": 37475 + }, + { + "epoch": 1.4429258902791147, + "grad_norm": 1.6457669734954834, + "learning_rate": 3.598027952416515e-05, + "loss": 0.8816, + "step": 37480 + }, + { + "epoch": 1.4431183830606353, + "grad_norm": 1.1504909992218018, + "learning_rate": 3.595705206558435e-05, + "loss": 0.8399, + "step": 37485 + }, + { + "epoch": 1.443310875842156, + "grad_norm": 1.7006773948669434, + "learning_rate": 3.593383046330687e-05, + "loss": 0.8129, + "step": 37490 + }, + { + "epoch": 1.4435033686236767, + "grad_norm": 0.8277364373207092, + "learning_rate": 3.591061471945619e-05, + "loss": 0.7758, + "step": 37495 + }, + { + "epoch": 1.4436958614051973, + "grad_norm": 1.3335161209106445, + "learning_rate": 3.5887404836155244e-05, + "loss": 0.8057, + "step": 37500 + }, + { + "epoch": 1.443888354186718, + "grad_norm": 1.1840698719024658, + "learning_rate": 3.586420081552646e-05, + "loss": 0.681, + "step": 37505 + }, + { + "epoch": 1.4440808469682387, + "grad_norm": 1.2329528331756592, + "learning_rate": 3.584100265969157e-05, + "loss": 0.899, + "step": 37510 + }, + { + "epoch": 1.4442733397497594, + "grad_norm": 1.9120075702667236, + "learning_rate": 3.5817810370772046e-05, + "loss": 0.8751, + "step": 37515 + }, + { + "epoch": 1.44446583253128, + "grad_norm": 1.4686700105667114, + "learning_rate": 3.579462395088856e-05, + "loss": 0.9473, + "step": 37520 + }, + { + "epoch": 1.4446583253128007, + "grad_norm": 0.9519988298416138, + "learning_rate": 3.5771443402161396e-05, + "loss": 0.782, + "step": 37525 + }, + { + "epoch": 1.4448508180943214, + "grad_norm": 1.1774522066116333, + "learning_rate": 3.5748268726710254e-05, + "loss": 0.914, + "step": 37530 + }, + { + "epoch": 1.445043310875842, + "grad_norm": 1.1277164220809937, + "learning_rate": 3.5725099926654334e-05, + "loss": 0.8537, + 
"step": 37535 + }, + { + "epoch": 1.445235803657363, + "grad_norm": 1.815774917602539, + "learning_rate": 3.570193700411219e-05, + "loss": 0.7308, + "step": 37540 + }, + { + "epoch": 1.4454282964388836, + "grad_norm": 1.1836425065994263, + "learning_rate": 3.5678779961202035e-05, + "loss": 0.7268, + "step": 37545 + }, + { + "epoch": 1.4456207892204043, + "grad_norm": 0.964604914188385, + "learning_rate": 3.565562880004133e-05, + "loss": 0.689, + "step": 37550 + }, + { + "epoch": 1.445813282001925, + "grad_norm": 1.3372588157653809, + "learning_rate": 3.563248352274712e-05, + "loss": 0.8704, + "step": 37555 + }, + { + "epoch": 1.4460057747834456, + "grad_norm": 1.824810266494751, + "learning_rate": 3.56093441314359e-05, + "loss": 0.8523, + "step": 37560 + }, + { + "epoch": 1.4461982675649663, + "grad_norm": 1.0827716588974, + "learning_rate": 3.558621062822365e-05, + "loss": 0.9851, + "step": 37565 + }, + { + "epoch": 1.446390760346487, + "grad_norm": 1.1135975122451782, + "learning_rate": 3.556308301522566e-05, + "loss": 0.8532, + "step": 37570 + }, + { + "epoch": 1.4465832531280076, + "grad_norm": 1.0616776943206787, + "learning_rate": 3.553996129455694e-05, + "loss": 0.6833, + "step": 37575 + }, + { + "epoch": 1.4467757459095285, + "grad_norm": 0.9578964114189148, + "learning_rate": 3.551684546833173e-05, + "loss": 0.7893, + "step": 37580 + }, + { + "epoch": 1.4469682386910492, + "grad_norm": 1.2795662879943848, + "learning_rate": 3.549373553866383e-05, + "loss": 0.8313, + "step": 37585 + }, + { + "epoch": 1.4471607314725699, + "grad_norm": 0.9473350048065186, + "learning_rate": 3.547063150766651e-05, + "loss": 0.7281, + "step": 37590 + }, + { + "epoch": 1.4473532242540905, + "grad_norm": 2.0121681690216064, + "learning_rate": 3.544753337745249e-05, + "loss": 0.9083, + "step": 37595 + }, + { + "epoch": 1.4475457170356112, + "grad_norm": 1.5119409561157227, + "learning_rate": 3.542444115013389e-05, + "loss": 0.9774, + "step": 37600 + }, + { + "epoch": 
1.4477382098171319, + "grad_norm": 1.390446424484253, + "learning_rate": 3.5401354827822386e-05, + "loss": 0.8337, + "step": 37605 + }, + { + "epoch": 1.4479307025986525, + "grad_norm": 2.891700267791748, + "learning_rate": 3.537827441262904e-05, + "loss": 0.7639, + "step": 37610 + }, + { + "epoch": 1.4481231953801732, + "grad_norm": 0.9751268625259399, + "learning_rate": 3.5355199906664426e-05, + "loss": 0.7479, + "step": 37615 + }, + { + "epoch": 1.4483156881616939, + "grad_norm": 1.9345892667770386, + "learning_rate": 3.533213131203859e-05, + "loss": 0.8192, + "step": 37620 + }, + { + "epoch": 1.4485081809432145, + "grad_norm": 0.8891172409057617, + "learning_rate": 3.5309068630860886e-05, + "loss": 0.7572, + "step": 37625 + }, + { + "epoch": 1.4487006737247352, + "grad_norm": 1.0947364568710327, + "learning_rate": 3.528601186524038e-05, + "loss": 0.8355, + "step": 37630 + }, + { + "epoch": 1.4488931665062559, + "grad_norm": 1.883512020111084, + "learning_rate": 3.5262961017285365e-05, + "loss": 0.705, + "step": 37635 + }, + { + "epoch": 1.4490856592877768, + "grad_norm": 1.5759938955307007, + "learning_rate": 3.523991608910373e-05, + "loss": 0.7667, + "step": 37640 + }, + { + "epoch": 1.4492781520692974, + "grad_norm": 1.6118040084838867, + "learning_rate": 3.521687708280277e-05, + "loss": 0.799, + "step": 37645 + }, + { + "epoch": 1.4494706448508181, + "grad_norm": 1.8567378520965576, + "learning_rate": 3.5193844000489275e-05, + "loss": 0.8733, + "step": 37650 + }, + { + "epoch": 1.4496631376323388, + "grad_norm": 1.5645298957824707, + "learning_rate": 3.5170816844269386e-05, + "loss": 0.833, + "step": 37655 + }, + { + "epoch": 1.4498556304138595, + "grad_norm": 1.3709477186203003, + "learning_rate": 3.5147795616248916e-05, + "loss": 0.8262, + "step": 37660 + }, + { + "epoch": 1.4500481231953801, + "grad_norm": 1.078057885169983, + "learning_rate": 3.512478031853288e-05, + "loss": 0.7562, + "step": 37665 + }, + { + "epoch": 1.4502406159769008, + "grad_norm": 
2.0203640460968018, + "learning_rate": 3.510177095322593e-05, + "loss": 0.7648, + "step": 37670 + }, + { + "epoch": 1.4504331087584217, + "grad_norm": 2.2243845462799072, + "learning_rate": 3.5078767522432124e-05, + "loss": 0.6971, + "step": 37675 + }, + { + "epoch": 1.4506256015399424, + "grad_norm": 1.4907315969467163, + "learning_rate": 3.5055770028254995e-05, + "loss": 0.7486, + "step": 37680 + }, + { + "epoch": 1.450818094321463, + "grad_norm": 1.167829155921936, + "learning_rate": 3.503277847279745e-05, + "loss": 1.0481, + "step": 37685 + }, + { + "epoch": 1.4510105871029837, + "grad_norm": 2.3201534748077393, + "learning_rate": 3.500979285816194e-05, + "loss": 0.8633, + "step": 37690 + }, + { + "epoch": 1.4512030798845044, + "grad_norm": 1.0884878635406494, + "learning_rate": 3.498681318645035e-05, + "loss": 0.7201, + "step": 37695 + }, + { + "epoch": 1.451395572666025, + "grad_norm": 1.033949375152588, + "learning_rate": 3.496383945976403e-05, + "loss": 0.828, + "step": 37700 + }, + { + "epoch": 1.4515880654475457, + "grad_norm": 1.7160718441009521, + "learning_rate": 3.494087168020378e-05, + "loss": 1.0329, + "step": 37705 + }, + { + "epoch": 1.4517805582290664, + "grad_norm": 1.1474641561508179, + "learning_rate": 3.491790984986982e-05, + "loss": 0.7973, + "step": 37710 + }, + { + "epoch": 1.451973051010587, + "grad_norm": 1.491860270500183, + "learning_rate": 3.4894953970861875e-05, + "loss": 0.8382, + "step": 37715 + }, + { + "epoch": 1.4521655437921077, + "grad_norm": 1.2758644819259644, + "learning_rate": 3.487200404527911e-05, + "loss": 0.7288, + "step": 37720 + }, + { + "epoch": 1.4523580365736284, + "grad_norm": 1.046302318572998, + "learning_rate": 3.484906007522017e-05, + "loss": 0.787, + "step": 37725 + }, + { + "epoch": 1.452550529355149, + "grad_norm": 1.3662068843841553, + "learning_rate": 3.4826122062783026e-05, + "loss": 0.859, + "step": 37730 + }, + { + "epoch": 1.45274302213667, + "grad_norm": 1.0735856294631958, + "learning_rate": 
3.480319001006536e-05, + "loss": 0.882, + "step": 37735 + }, + { + "epoch": 1.4529355149181906, + "grad_norm": 3.124530553817749, + "learning_rate": 3.4780263919164046e-05, + "loss": 1.0391, + "step": 37740 + }, + { + "epoch": 1.4531280076997113, + "grad_norm": 1.3612589836120605, + "learning_rate": 3.475734379217555e-05, + "loss": 0.731, + "step": 37745 + }, + { + "epoch": 1.453320500481232, + "grad_norm": 1.0555542707443237, + "learning_rate": 3.473442963119576e-05, + "loss": 0.9877, + "step": 37750 + }, + { + "epoch": 1.4535129932627526, + "grad_norm": 1.4260815382003784, + "learning_rate": 3.4711521438320094e-05, + "loss": 0.8009, + "step": 37755 + }, + { + "epoch": 1.4537054860442733, + "grad_norm": 1.2460854053497314, + "learning_rate": 3.4688619215643215e-05, + "loss": 0.8803, + "step": 37760 + }, + { + "epoch": 1.453897978825794, + "grad_norm": 0.8238735198974609, + "learning_rate": 3.4665722965259525e-05, + "loss": 0.7875, + "step": 37765 + }, + { + "epoch": 1.4540904716073149, + "grad_norm": 1.1429873704910278, + "learning_rate": 3.464283268926264e-05, + "loss": 0.7947, + "step": 37770 + }, + { + "epoch": 1.4542829643888355, + "grad_norm": 1.3068310022354126, + "learning_rate": 3.461994838974576e-05, + "loss": 0.826, + "step": 37775 + }, + { + "epoch": 1.4544754571703562, + "grad_norm": 1.26819908618927, + "learning_rate": 3.4597070068801484e-05, + "loss": 0.9455, + "step": 37780 + }, + { + "epoch": 1.4546679499518769, + "grad_norm": 1.3983910083770752, + "learning_rate": 3.457419772852194e-05, + "loss": 0.8876, + "step": 37785 + }, + { + "epoch": 1.4548604427333975, + "grad_norm": 1.6775517463684082, + "learning_rate": 3.455133137099853e-05, + "loss": 0.7722, + "step": 37790 + }, + { + "epoch": 1.4550529355149182, + "grad_norm": 1.2544530630111694, + "learning_rate": 3.452847099832238e-05, + "loss": 0.8891, + "step": 37795 + }, + { + "epoch": 1.4552454282964389, + "grad_norm": 0.9618650674819946, + "learning_rate": 3.4505616612583824e-05, + "loss": 
0.7674, + "step": 37800 + }, + { + "epoch": 1.4554379210779596, + "grad_norm": 1.694305658340454, + "learning_rate": 3.448276821587275e-05, + "loss": 0.8635, + "step": 37805 + }, + { + "epoch": 1.4556304138594802, + "grad_norm": 1.1212447881698608, + "learning_rate": 3.445992581027853e-05, + "loss": 0.7715, + "step": 37810 + }, + { + "epoch": 1.455822906641001, + "grad_norm": 1.3266422748565674, + "learning_rate": 3.443708939788995e-05, + "loss": 0.8759, + "step": 37815 + }, + { + "epoch": 1.4560153994225216, + "grad_norm": 1.0106375217437744, + "learning_rate": 3.44142589807952e-05, + "loss": 0.639, + "step": 37820 + }, + { + "epoch": 1.4562078922040422, + "grad_norm": 1.1478214263916016, + "learning_rate": 3.4391434561082e-05, + "loss": 0.9026, + "step": 37825 + }, + { + "epoch": 1.4564003849855631, + "grad_norm": 1.7715827226638794, + "learning_rate": 3.436861614083753e-05, + "loss": 0.7835, + "step": 37830 + }, + { + "epoch": 1.4565928777670838, + "grad_norm": 1.5840331315994263, + "learning_rate": 3.434580372214829e-05, + "loss": 0.9205, + "step": 37835 + }, + { + "epoch": 1.4567853705486045, + "grad_norm": 1.0657918453216553, + "learning_rate": 3.432299730710042e-05, + "loss": 0.8256, + "step": 37840 + }, + { + "epoch": 1.4569778633301251, + "grad_norm": 1.3026148080825806, + "learning_rate": 3.430019689777936e-05, + "loss": 0.8559, + "step": 37845 + }, + { + "epoch": 1.4571703561116458, + "grad_norm": 1.7707239389419556, + "learning_rate": 3.4277402496270075e-05, + "loss": 0.8683, + "step": 37850 + }, + { + "epoch": 1.4573628488931665, + "grad_norm": 1.134018063545227, + "learning_rate": 3.4254614104656945e-05, + "loss": 0.7637, + "step": 37855 + }, + { + "epoch": 1.4575553416746871, + "grad_norm": 1.713789463043213, + "learning_rate": 3.423183172502388e-05, + "loss": 1.0235, + "step": 37860 + }, + { + "epoch": 1.4577478344562078, + "grad_norm": 1.0064624547958374, + "learning_rate": 3.420905535945406e-05, + "loss": 0.8631, + "step": 37865 + }, + { + 
"epoch": 1.4579403272377287, + "grad_norm": 1.2187049388885498, + "learning_rate": 3.418628501003036e-05, + "loss": 0.7412, + "step": 37870 + }, + { + "epoch": 1.4581328200192494, + "grad_norm": 0.9734949469566345, + "learning_rate": 3.416352067883489e-05, + "loss": 0.7388, + "step": 37875 + }, + { + "epoch": 1.45832531280077, + "grad_norm": 2.005460262298584, + "learning_rate": 3.414076236794933e-05, + "loss": 0.9135, + "step": 37880 + }, + { + "epoch": 1.4585178055822907, + "grad_norm": 1.063448429107666, + "learning_rate": 3.4118010079454775e-05, + "loss": 0.8584, + "step": 37885 + }, + { + "epoch": 1.4587102983638114, + "grad_norm": 1.4160773754119873, + "learning_rate": 3.4095263815431765e-05, + "loss": 0.871, + "step": 37890 + }, + { + "epoch": 1.458902791145332, + "grad_norm": 2.008450508117676, + "learning_rate": 3.40725235779603e-05, + "loss": 0.886, + "step": 37895 + }, + { + "epoch": 1.4590952839268527, + "grad_norm": 3.2115590572357178, + "learning_rate": 3.4049789369119866e-05, + "loss": 0.7679, + "step": 37900 + }, + { + "epoch": 1.4592877767083734, + "grad_norm": 1.1200793981552124, + "learning_rate": 3.402706119098927e-05, + "loss": 0.7649, + "step": 37905 + }, + { + "epoch": 1.459480269489894, + "grad_norm": 1.700742483139038, + "learning_rate": 3.4004339045646906e-05, + "loss": 0.7951, + "step": 37910 + }, + { + "epoch": 1.4596727622714147, + "grad_norm": 1.5388984680175781, + "learning_rate": 3.3981622935170554e-05, + "loss": 0.6965, + "step": 37915 + }, + { + "epoch": 1.4598652550529354, + "grad_norm": 1.1668038368225098, + "learning_rate": 3.3958912861637446e-05, + "loss": 0.807, + "step": 37920 + }, + { + "epoch": 1.460057747834456, + "grad_norm": 1.614430546760559, + "learning_rate": 3.3936208827124316e-05, + "loss": 0.7649, + "step": 37925 + }, + { + "epoch": 1.460250240615977, + "grad_norm": 1.4020650386810303, + "learning_rate": 3.3913510833707215e-05, + "loss": 0.8886, + "step": 37930 + }, + { + "epoch": 1.4604427333974976, + "grad_norm": 
1.0239553451538086, + "learning_rate": 3.3890818883461774e-05, + "loss": 0.8201, + "step": 37935 + }, + { + "epoch": 1.4606352261790183, + "grad_norm": 1.2521649599075317, + "learning_rate": 3.386813297846301e-05, + "loss": 0.8144, + "step": 37940 + }, + { + "epoch": 1.460827718960539, + "grad_norm": 1.6655640602111816, + "learning_rate": 3.384545312078543e-05, + "loss": 0.7052, + "step": 37945 + }, + { + "epoch": 1.4610202117420596, + "grad_norm": 1.3284400701522827, + "learning_rate": 3.382277931250287e-05, + "loss": 0.7968, + "step": 37950 + }, + { + "epoch": 1.4612127045235803, + "grad_norm": 1.4124058485031128, + "learning_rate": 3.380011155568882e-05, + "loss": 0.7928, + "step": 37955 + }, + { + "epoch": 1.461405197305101, + "grad_norm": 1.6810020208358765, + "learning_rate": 3.3777449852416e-05, + "loss": 0.8213, + "step": 37960 + }, + { + "epoch": 1.4615976900866219, + "grad_norm": 0.6957064867019653, + "learning_rate": 3.375479420475671e-05, + "loss": 0.6974, + "step": 37965 + }, + { + "epoch": 1.4617901828681426, + "grad_norm": 1.094000220298767, + "learning_rate": 3.3732144614782655e-05, + "loss": 0.7123, + "step": 37970 + }, + { + "epoch": 1.4619826756496632, + "grad_norm": 1.638156771659851, + "learning_rate": 3.370950108456502e-05, + "loss": 0.9115, + "step": 37975 + }, + { + "epoch": 1.462175168431184, + "grad_norm": 1.41256582736969, + "learning_rate": 3.368686361617431e-05, + "loss": 0.7888, + "step": 37980 + }, + { + "epoch": 1.4623676612127046, + "grad_norm": 1.2503211498260498, + "learning_rate": 3.366423221168071e-05, + "loss": 0.7508, + "step": 37985 + }, + { + "epoch": 1.4625601539942252, + "grad_norm": 1.3764451742172241, + "learning_rate": 3.3641606873153596e-05, + "loss": 0.8037, + "step": 37990 + }, + { + "epoch": 1.462752646775746, + "grad_norm": 0.9157146215438843, + "learning_rate": 3.3618987602661966e-05, + "loss": 0.7469, + "step": 37995 + }, + { + "epoch": 1.4629451395572666, + "grad_norm": 1.5734952688217163, + "learning_rate": 
3.359637440227418e-05, + "loss": 0.8036, + "step": 38000 + }, + { + "epoch": 1.4631376323387872, + "grad_norm": 1.2893953323364258, + "learning_rate": 3.357376727405809e-05, + "loss": 0.7856, + "step": 38005 + }, + { + "epoch": 1.463330125120308, + "grad_norm": 1.133774995803833, + "learning_rate": 3.3551166220080896e-05, + "loss": 0.8766, + "step": 38010 + }, + { + "epoch": 1.4635226179018286, + "grad_norm": 1.4209762811660767, + "learning_rate": 3.3528571242409435e-05, + "loss": 0.8307, + "step": 38015 + }, + { + "epoch": 1.4637151106833493, + "grad_norm": 1.2726255655288696, + "learning_rate": 3.350598234310977e-05, + "loss": 0.8027, + "step": 38020 + }, + { + "epoch": 1.4639076034648701, + "grad_norm": 1.623399257659912, + "learning_rate": 3.3483399524247525e-05, + "loss": 0.8838, + "step": 38025 + }, + { + "epoch": 1.4641000962463908, + "grad_norm": 1.2219460010528564, + "learning_rate": 3.3460822787887805e-05, + "loss": 1.0114, + "step": 38030 + }, + { + "epoch": 1.4642925890279115, + "grad_norm": 0.8296359777450562, + "learning_rate": 3.3438252136095016e-05, + "loss": 0.7348, + "step": 38035 + }, + { + "epoch": 1.4644850818094322, + "grad_norm": 1.6310055255889893, + "learning_rate": 3.341568757093314e-05, + "loss": 0.892, + "step": 38040 + }, + { + "epoch": 1.4646775745909528, + "grad_norm": 0.8464129567146301, + "learning_rate": 3.339312909446557e-05, + "loss": 0.8913, + "step": 38045 + }, + { + "epoch": 1.4648700673724735, + "grad_norm": 1.2375149726867676, + "learning_rate": 3.337057670875513e-05, + "loss": 0.6927, + "step": 38050 + }, + { + "epoch": 1.4650625601539942, + "grad_norm": 2.361973285675049, + "learning_rate": 3.334803041586402e-05, + "loss": 0.7511, + "step": 38055 + }, + { + "epoch": 1.465255052935515, + "grad_norm": 2.223909616470337, + "learning_rate": 3.332999776976661e-05, + "loss": 0.9986, + "step": 38060 + }, + { + "epoch": 1.4654475457170357, + "grad_norm": 0.8594219088554382, + "learning_rate": 3.330746244914553e-05, + "loss": 
0.6887, + "step": 38065 + }, + { + "epoch": 1.4656400384985564, + "grad_norm": 1.6795843839645386, + "learning_rate": 3.3284933227115236e-05, + "loss": 0.7934, + "step": 38070 + }, + { + "epoch": 1.465832531280077, + "grad_norm": 0.9740133881568909, + "learning_rate": 3.3262410105735864e-05, + "loss": 0.7993, + "step": 38075 + }, + { + "epoch": 1.4660250240615977, + "grad_norm": 1.578036904335022, + "learning_rate": 3.323989308706693e-05, + "loss": 0.8793, + "step": 38080 + }, + { + "epoch": 1.4662175168431184, + "grad_norm": 1.1103001832962036, + "learning_rate": 3.32173821731676e-05, + "loss": 0.8391, + "step": 38085 + }, + { + "epoch": 1.466410009624639, + "grad_norm": 1.6756675243377686, + "learning_rate": 3.3194877366096246e-05, + "loss": 0.955, + "step": 38090 + }, + { + "epoch": 1.4666025024061597, + "grad_norm": 1.257323980331421, + "learning_rate": 3.3172378667910796e-05, + "loss": 0.7475, + "step": 38095 + }, + { + "epoch": 1.4667949951876804, + "grad_norm": 1.2056502103805542, + "learning_rate": 3.314988608066867e-05, + "loss": 0.8694, + "step": 38100 + }, + { + "epoch": 1.466987487969201, + "grad_norm": 1.552820086479187, + "learning_rate": 3.312739960642659e-05, + "loss": 0.8827, + "step": 38105 + }, + { + "epoch": 1.4671799807507218, + "grad_norm": 0.8146559596061707, + "learning_rate": 3.310491924724082e-05, + "loss": 0.592, + "step": 38110 + }, + { + "epoch": 1.4673724735322424, + "grad_norm": 0.8254255056381226, + "learning_rate": 3.3082445005167053e-05, + "loss": 0.7371, + "step": 38115 + }, + { + "epoch": 1.467564966313763, + "grad_norm": 1.7715586423873901, + "learning_rate": 3.3059976882260424e-05, + "loss": 0.7059, + "step": 38120 + }, + { + "epoch": 1.467757459095284, + "grad_norm": 1.2485114336013794, + "learning_rate": 3.303751488057541e-05, + "loss": 0.7397, + "step": 38125 + }, + { + "epoch": 1.4679499518768047, + "grad_norm": 1.529093623161316, + "learning_rate": 3.301505900216614e-05, + "loss": 0.8473, + "step": 38130 + }, + { + 
"epoch": 1.4681424446583253, + "grad_norm": 1.5459060668945312, + "learning_rate": 3.299260924908596e-05, + "loss": 0.906, + "step": 38135 + }, + { + "epoch": 1.468334937439846, + "grad_norm": 1.5474683046340942, + "learning_rate": 3.2970165623387785e-05, + "loss": 0.8314, + "step": 38140 + }, + { + "epoch": 1.4685274302213667, + "grad_norm": 2.399840831756592, + "learning_rate": 3.2947728127123924e-05, + "loss": 0.8761, + "step": 38145 + }, + { + "epoch": 1.4687199230028873, + "grad_norm": 0.8112540245056152, + "learning_rate": 3.292529676234615e-05, + "loss": 0.7489, + "step": 38150 + }, + { + "epoch": 1.468912415784408, + "grad_norm": 1.178676962852478, + "learning_rate": 3.290287153110565e-05, + "loss": 0.8148, + "step": 38155 + }, + { + "epoch": 1.469104908565929, + "grad_norm": 1.379562258720398, + "learning_rate": 3.28804524354531e-05, + "loss": 0.9524, + "step": 38160 + }, + { + "epoch": 1.4692974013474496, + "grad_norm": 2.1788909435272217, + "learning_rate": 3.28580394774385e-05, + "loss": 0.7175, + "step": 38165 + }, + { + "epoch": 1.4694898941289702, + "grad_norm": 1.2615634202957153, + "learning_rate": 3.283563265911147e-05, + "loss": 1.0649, + "step": 38170 + }, + { + "epoch": 1.469682386910491, + "grad_norm": 1.2906595468521118, + "learning_rate": 3.281323198252087e-05, + "loss": 0.6275, + "step": 38175 + }, + { + "epoch": 1.4698748796920116, + "grad_norm": 0.8177803158760071, + "learning_rate": 3.279083744971515e-05, + "loss": 0.786, + "step": 38180 + }, + { + "epoch": 1.4700673724735323, + "grad_norm": 1.317015290260315, + "learning_rate": 3.276844906274211e-05, + "loss": 0.8074, + "step": 38185 + }, + { + "epoch": 1.470259865255053, + "grad_norm": 1.0974969863891602, + "learning_rate": 3.274606682364908e-05, + "loss": 0.851, + "step": 38190 + }, + { + "epoch": 1.4704523580365736, + "grad_norm": 0.8745707273483276, + "learning_rate": 3.272369073448269e-05, + "loss": 0.9159, + "step": 38195 + }, + { + "epoch": 1.4706448508180943, + "grad_norm": 
1.3758927583694458, + "learning_rate": 3.2701320797289114e-05, + "loss": 0.7093, + "step": 38200 + }, + { + "epoch": 1.470837343599615, + "grad_norm": 1.6409497261047363, + "learning_rate": 3.267895701411393e-05, + "loss": 0.7493, + "step": 38205 + }, + { + "epoch": 1.4710298363811356, + "grad_norm": 1.0862737894058228, + "learning_rate": 3.2656599387002176e-05, + "loss": 0.9189, + "step": 38210 + }, + { + "epoch": 1.4712223291626563, + "grad_norm": 0.9804477095603943, + "learning_rate": 3.263424791799833e-05, + "loss": 0.8635, + "step": 38215 + }, + { + "epoch": 1.4714148219441772, + "grad_norm": 1.2791975736618042, + "learning_rate": 3.2611902609146215e-05, + "loss": 0.8745, + "step": 38220 + }, + { + "epoch": 1.4716073147256978, + "grad_norm": 1.2094603776931763, + "learning_rate": 3.2589563462489214e-05, + "loss": 0.8492, + "step": 38225 + }, + { + "epoch": 1.4717998075072185, + "grad_norm": 0.9160768389701843, + "learning_rate": 3.256723048007006e-05, + "loss": 0.8906, + "step": 38230 + }, + { + "epoch": 1.4719923002887392, + "grad_norm": 2.3353841304779053, + "learning_rate": 3.254490366393104e-05, + "loss": 0.8709, + "step": 38235 + }, + { + "epoch": 1.4721847930702598, + "grad_norm": 1.8757665157318115, + "learning_rate": 3.2522583016113636e-05, + "loss": 0.6895, + "step": 38240 + }, + { + "epoch": 1.4723772858517805, + "grad_norm": 1.9597855806350708, + "learning_rate": 3.250026853865911e-05, + "loss": 0.7915, + "step": 38245 + }, + { + "epoch": 1.4725697786333012, + "grad_norm": 1.1252086162567139, + "learning_rate": 3.247796023360783e-05, + "loss": 0.8083, + "step": 38250 + }, + { + "epoch": 1.472762271414822, + "grad_norm": 1.7507824897766113, + "learning_rate": 3.2455658102999796e-05, + "loss": 0.8343, + "step": 38255 + }, + { + "epoch": 1.4729547641963427, + "grad_norm": 0.8124133944511414, + "learning_rate": 3.243336214887439e-05, + "loss": 0.7325, + "step": 38260 + }, + { + "epoch": 1.4731472569778634, + "grad_norm": 0.9057835936546326, + 
"learning_rate": 3.241107237327047e-05, + "loss": 0.8326, + "step": 38265 + }, + { + "epoch": 1.473339749759384, + "grad_norm": 2.141209602355957, + "learning_rate": 3.238878877822616e-05, + "loss": 0.8098, + "step": 38270 + }, + { + "epoch": 1.4735322425409048, + "grad_norm": 2.2603015899658203, + "learning_rate": 3.236651136577932e-05, + "loss": 0.7639, + "step": 38275 + }, + { + "epoch": 1.4737247353224254, + "grad_norm": 1.785561203956604, + "learning_rate": 3.234424013796694e-05, + "loss": 0.9348, + "step": 38280 + }, + { + "epoch": 1.473917228103946, + "grad_norm": 1.9484220743179321, + "learning_rate": 3.232197509682562e-05, + "loss": 1.0465, + "step": 38285 + }, + { + "epoch": 1.4741097208854668, + "grad_norm": 1.0257092714309692, + "learning_rate": 3.2299716244391356e-05, + "loss": 0.7771, + "step": 38290 + }, + { + "epoch": 1.4743022136669874, + "grad_norm": 0.8890891075134277, + "learning_rate": 3.2277463582699595e-05, + "loss": 0.7485, + "step": 38295 + }, + { + "epoch": 1.474494706448508, + "grad_norm": 1.7376580238342285, + "learning_rate": 3.225521711378514e-05, + "loss": 0.7943, + "step": 38300 + }, + { + "epoch": 1.4746871992300288, + "grad_norm": 0.7426727414131165, + "learning_rate": 3.2232976839682316e-05, + "loss": 0.7996, + "step": 38305 + }, + { + "epoch": 1.4748796920115494, + "grad_norm": 1.1797425746917725, + "learning_rate": 3.221074276242484e-05, + "loss": 0.7686, + "step": 38310 + }, + { + "epoch": 1.4750721847930703, + "grad_norm": 1.5532947778701782, + "learning_rate": 3.2188514884045885e-05, + "loss": 0.6781, + "step": 38315 + }, + { + "epoch": 1.475264677574591, + "grad_norm": 1.4531184434890747, + "learning_rate": 3.216629320657806e-05, + "loss": 0.9722, + "step": 38320 + }, + { + "epoch": 1.4754571703561117, + "grad_norm": 1.0439348220825195, + "learning_rate": 3.214407773205333e-05, + "loss": 0.9018, + "step": 38325 + }, + { + "epoch": 1.4756496631376324, + "grad_norm": 1.6608530282974243, + "learning_rate": 
3.212186846250318e-05, + "loss": 0.8168, + "step": 38330 + }, + { + "epoch": 1.475842155919153, + "grad_norm": 1.7269642353057861, + "learning_rate": 3.209966539995851e-05, + "loss": 0.9625, + "step": 38335 + }, + { + "epoch": 1.4760346487006737, + "grad_norm": 1.6941852569580078, + "learning_rate": 3.2077468546449684e-05, + "loss": 0.8007, + "step": 38340 + }, + { + "epoch": 1.4762271414821944, + "grad_norm": 1.0302637815475464, + "learning_rate": 3.205527790400634e-05, + "loss": 0.9428, + "step": 38345 + }, + { + "epoch": 1.476419634263715, + "grad_norm": 1.4871275424957275, + "learning_rate": 3.2033093474657806e-05, + "loss": 0.9217, + "step": 38350 + }, + { + "epoch": 1.476612127045236, + "grad_norm": 1.700111746788025, + "learning_rate": 3.201091526043261e-05, + "loss": 0.791, + "step": 38355 + }, + { + "epoch": 1.4768046198267566, + "grad_norm": 0.7985191345214844, + "learning_rate": 3.198874326335881e-05, + "loss": 0.6741, + "step": 38360 + }, + { + "epoch": 1.4769971126082773, + "grad_norm": 1.5285080671310425, + "learning_rate": 3.1966577485463913e-05, + "loss": 0.8607, + "step": 38365 + }, + { + "epoch": 1.477189605389798, + "grad_norm": 1.746016263961792, + "learning_rate": 3.1944417928774864e-05, + "loss": 0.71, + "step": 38370 + }, + { + "epoch": 1.4773820981713186, + "grad_norm": 2.4140141010284424, + "learning_rate": 3.1922264595317895e-05, + "loss": 0.9232, + "step": 38375 + }, + { + "epoch": 1.4775745909528393, + "grad_norm": 1.8177671432495117, + "learning_rate": 3.190011748711892e-05, + "loss": 0.7186, + "step": 38380 + }, + { + "epoch": 1.47776708373436, + "grad_norm": 1.3410675525665283, + "learning_rate": 3.187797660620305e-05, + "loss": 0.8414, + "step": 38385 + }, + { + "epoch": 1.4779595765158806, + "grad_norm": 0.9143676161766052, + "learning_rate": 3.185584195459496e-05, + "loss": 0.7884, + "step": 38390 + }, + { + "epoch": 1.4781520692974013, + "grad_norm": 1.212738037109375, + "learning_rate": 3.18337135343187e-05, + "loss": 0.7888, + 
"step": 38395 + }, + { + "epoch": 1.478344562078922, + "grad_norm": 1.165589690208435, + "learning_rate": 3.181159134739777e-05, + "loss": 0.8875, + "step": 38400 + }, + { + "epoch": 1.4785370548604426, + "grad_norm": 1.0738500356674194, + "learning_rate": 3.178947539585512e-05, + "loss": 0.7586, + "step": 38405 + }, + { + "epoch": 1.4787295476419633, + "grad_norm": 1.069633960723877, + "learning_rate": 3.1767365681713123e-05, + "loss": 0.9111, + "step": 38410 + }, + { + "epoch": 1.4789220404234842, + "grad_norm": 1.1516032218933105, + "learning_rate": 3.17452622069935e-05, + "loss": 0.9261, + "step": 38415 + }, + { + "epoch": 1.4791145332050049, + "grad_norm": 1.2986743450164795, + "learning_rate": 3.172316497371749e-05, + "loss": 0.8498, + "step": 38420 + }, + { + "epoch": 1.4793070259865255, + "grad_norm": 1.1140365600585938, + "learning_rate": 3.170107398390576e-05, + "loss": 0.8164, + "step": 38425 + }, + { + "epoch": 1.4794995187680462, + "grad_norm": 0.8522436022758484, + "learning_rate": 3.167898923957838e-05, + "loss": 0.7513, + "step": 38430 + }, + { + "epoch": 1.4796920115495669, + "grad_norm": 1.4208568334579468, + "learning_rate": 3.1656910742754876e-05, + "loss": 0.8429, + "step": 38435 + }, + { + "epoch": 1.4798845043310875, + "grad_norm": 1.4376440048217773, + "learning_rate": 3.163483849545412e-05, + "loss": 0.7375, + "step": 38440 + }, + { + "epoch": 1.4800769971126082, + "grad_norm": 1.1318036317825317, + "learning_rate": 3.161277249969451e-05, + "loss": 0.8024, + "step": 38445 + }, + { + "epoch": 1.480269489894129, + "grad_norm": 1.1824818849563599, + "learning_rate": 3.159071275749382e-05, + "loss": 0.7278, + "step": 38450 + }, + { + "epoch": 1.4804619826756498, + "grad_norm": 1.4010226726531982, + "learning_rate": 3.1568659270869315e-05, + "loss": 0.7994, + "step": 38455 + }, + { + "epoch": 1.4806544754571704, + "grad_norm": 1.4055135250091553, + "learning_rate": 3.154661204183755e-05, + "loss": 0.8096, + "step": 38460 + }, + { + "epoch": 
1.480846968238691, + "grad_norm": 1.6588884592056274, + "learning_rate": 3.152457107241471e-05, + "loss": 0.858, + "step": 38465 + }, + { + "epoch": 1.4810394610202118, + "grad_norm": 0.7389982342720032, + "learning_rate": 3.15025363646162e-05, + "loss": 0.8122, + "step": 38470 + }, + { + "epoch": 1.4812319538017324, + "grad_norm": 1.2149773836135864, + "learning_rate": 3.148050792045699e-05, + "loss": 0.8371, + "step": 38475 + }, + { + "epoch": 1.4814244465832531, + "grad_norm": 0.945489227771759, + "learning_rate": 3.1458485741951425e-05, + "loss": 0.9891, + "step": 38480 + }, + { + "epoch": 1.4816169393647738, + "grad_norm": 1.013566255569458, + "learning_rate": 3.1436469831113334e-05, + "loss": 0.8877, + "step": 38485 + }, + { + "epoch": 1.4818094321462945, + "grad_norm": 1.1482200622558594, + "learning_rate": 3.1414460189955805e-05, + "loss": 0.9323, + "step": 38490 + }, + { + "epoch": 1.4820019249278151, + "grad_norm": 1.5675556659698486, + "learning_rate": 3.139245682049163e-05, + "loss": 0.7917, + "step": 38495 + }, + { + "epoch": 1.4821944177093358, + "grad_norm": 1.299532413482666, + "learning_rate": 3.137045972473277e-05, + "loss": 0.7983, + "step": 38500 + }, + { + "epoch": 1.4823869104908565, + "grad_norm": 1.051574945449829, + "learning_rate": 3.134846890469073e-05, + "loss": 0.7102, + "step": 38505 + }, + { + "epoch": 1.4825794032723774, + "grad_norm": 1.790773868560791, + "learning_rate": 3.132648436237644e-05, + "loss": 0.8816, + "step": 38510 + }, + { + "epoch": 1.482771896053898, + "grad_norm": 1.0724399089813232, + "learning_rate": 3.1304506099800256e-05, + "loss": 0.9583, + "step": 38515 + }, + { + "epoch": 1.4829643888354187, + "grad_norm": 2.1715545654296875, + "learning_rate": 3.1282534118971893e-05, + "loss": 0.907, + "step": 38520 + }, + { + "epoch": 1.4831568816169394, + "grad_norm": 1.1639543771743774, + "learning_rate": 3.126056842190058e-05, + "loss": 0.8888, + "step": 38525 + }, + { + "epoch": 1.48334937439846, + "grad_norm": 
1.2022185325622559, + "learning_rate": 3.123860901059493e-05, + "loss": 0.7886, + "step": 38530 + }, + { + "epoch": 1.4835418671799807, + "grad_norm": 0.8449472188949585, + "learning_rate": 3.121665588706297e-05, + "loss": 0.7561, + "step": 38535 + }, + { + "epoch": 1.4837343599615014, + "grad_norm": 1.3795840740203857, + "learning_rate": 3.119470905331223e-05, + "loss": 0.9203, + "step": 38540 + }, + { + "epoch": 1.4839268527430223, + "grad_norm": 1.0770927667617798, + "learning_rate": 3.1172768511349514e-05, + "loss": 0.9815, + "step": 38545 + }, + { + "epoch": 1.484119345524543, + "grad_norm": 1.2355643510818481, + "learning_rate": 3.1150834263181187e-05, + "loss": 0.7474, + "step": 38550 + }, + { + "epoch": 1.4843118383060636, + "grad_norm": 1.1445075273513794, + "learning_rate": 3.112890631081297e-05, + "loss": 1.0919, + "step": 38555 + }, + { + "epoch": 1.4845043310875843, + "grad_norm": 1.6212882995605469, + "learning_rate": 3.1106984656250074e-05, + "loss": 0.7342, + "step": 38560 + }, + { + "epoch": 1.484696823869105, + "grad_norm": 1.2851066589355469, + "learning_rate": 3.1085069301496993e-05, + "loss": 0.7242, + "step": 38565 + }, + { + "epoch": 1.4848893166506256, + "grad_norm": 1.7971413135528564, + "learning_rate": 3.1063160248557876e-05, + "loss": 0.7274, + "step": 38570 + }, + { + "epoch": 1.4850818094321463, + "grad_norm": 0.9864533543586731, + "learning_rate": 3.104125749943605e-05, + "loss": 0.7071, + "step": 38575 + }, + { + "epoch": 1.485274302213667, + "grad_norm": 1.3613252639770508, + "learning_rate": 3.101936105613442e-05, + "loss": 0.8422, + "step": 38580 + }, + { + "epoch": 1.4854667949951876, + "grad_norm": 1.0878537893295288, + "learning_rate": 3.099747092065527e-05, + "loss": 0.8569, + "step": 38585 + }, + { + "epoch": 1.4856592877767083, + "grad_norm": 2.1633172035217285, + "learning_rate": 3.0975587095000335e-05, + "loss": 0.904, + "step": 38590 + }, + { + "epoch": 1.485851780558229, + "grad_norm": 1.3097665309906006, + 
"learning_rate": 3.095370958117064e-05, + "loss": 0.8018, + "step": 38595 + }, + { + "epoch": 1.4860442733397496, + "grad_norm": 1.019307017326355, + "learning_rate": 3.093183838116689e-05, + "loss": 0.8721, + "step": 38600 + }, + { + "epoch": 1.4862367661212703, + "grad_norm": 1.0038857460021973, + "learning_rate": 3.090997349698895e-05, + "loss": 0.7122, + "step": 38605 + }, + { + "epoch": 1.4864292589027912, + "grad_norm": 1.9515637159347534, + "learning_rate": 3.0888114930636256e-05, + "loss": 0.7896, + "step": 38610 + }, + { + "epoch": 1.4866217516843119, + "grad_norm": 1.1651268005371094, + "learning_rate": 3.0866262684107626e-05, + "loss": 0.7926, + "step": 38615 + }, + { + "epoch": 1.4868142444658325, + "grad_norm": 1.1827764511108398, + "learning_rate": 3.084441675940134e-05, + "loss": 0.8771, + "step": 38620 + }, + { + "epoch": 1.4870067372473532, + "grad_norm": 1.2885782718658447, + "learning_rate": 3.0822577158514954e-05, + "loss": 0.8086, + "step": 38625 + }, + { + "epoch": 1.4871992300288739, + "grad_norm": 1.6408069133758545, + "learning_rate": 3.0800743883445703e-05, + "loss": 0.8421, + "step": 38630 + }, + { + "epoch": 1.4873917228103946, + "grad_norm": 1.5079654455184937, + "learning_rate": 3.077891693618998e-05, + "loss": 0.951, + "step": 38635 + }, + { + "epoch": 1.4875842155919152, + "grad_norm": 1.0820412635803223, + "learning_rate": 3.075709631874376e-05, + "loss": 0.9189, + "step": 38640 + }, + { + "epoch": 1.4877767083734361, + "grad_norm": 1.5331305265426636, + "learning_rate": 3.073528203310242e-05, + "loss": 0.8078, + "step": 38645 + }, + { + "epoch": 1.4879692011549568, + "grad_norm": 1.087971568107605, + "learning_rate": 3.0713474081260674e-05, + "loss": 0.9791, + "step": 38650 + }, + { + "epoch": 1.4881616939364775, + "grad_norm": 1.125899076461792, + "learning_rate": 3.069167246521273e-05, + "loss": 0.855, + "step": 38655 + }, + { + "epoch": 1.4883541867179981, + "grad_norm": 1.5379077196121216, + "learning_rate": 
3.0669877186952226e-05, + "loss": 0.8884, + "step": 38660 + }, + { + "epoch": 1.4885466794995188, + "grad_norm": 1.4419198036193848, + "learning_rate": 3.064808824847217e-05, + "loss": 0.9177, + "step": 38665 + }, + { + "epoch": 1.4887391722810395, + "grad_norm": 1.242751955986023, + "learning_rate": 3.062630565176504e-05, + "loss": 0.7661, + "step": 38670 + }, + { + "epoch": 1.4889316650625601, + "grad_norm": 0.9697130918502808, + "learning_rate": 3.060452939882273e-05, + "loss": 0.7302, + "step": 38675 + }, + { + "epoch": 1.4891241578440808, + "grad_norm": 1.1777362823486328, + "learning_rate": 3.0582759491636445e-05, + "loss": 0.8163, + "step": 38680 + }, + { + "epoch": 1.4893166506256015, + "grad_norm": 1.943617582321167, + "learning_rate": 3.0560995932197015e-05, + "loss": 0.8965, + "step": 38685 + }, + { + "epoch": 1.4895091434071221, + "grad_norm": 1.0683708190917969, + "learning_rate": 3.053923872249448e-05, + "loss": 0.9112, + "step": 38690 + }, + { + "epoch": 1.4897016361886428, + "grad_norm": 1.1417663097381592, + "learning_rate": 3.0517487864518436e-05, + "loss": 0.8912, + "step": 38695 + }, + { + "epoch": 1.4898941289701635, + "grad_norm": 1.4740487337112427, + "learning_rate": 3.0495743360257845e-05, + "loss": 0.8111, + "step": 38700 + }, + { + "epoch": 1.4900866217516844, + "grad_norm": 2.3702943325042725, + "learning_rate": 3.047400521170113e-05, + "loss": 0.9583, + "step": 38705 + }, + { + "epoch": 1.490279114533205, + "grad_norm": 1.1470707654953003, + "learning_rate": 3.0452273420836007e-05, + "loss": 0.9308, + "step": 38710 + }, + { + "epoch": 1.4904716073147257, + "grad_norm": 1.7966952323913574, + "learning_rate": 3.0430547989649827e-05, + "loss": 0.9539, + "step": 38715 + }, + { + "epoch": 1.4906641000962464, + "grad_norm": 1.2520999908447266, + "learning_rate": 3.0408828920129152e-05, + "loss": 0.9079, + "step": 38720 + }, + { + "epoch": 1.490856592877767, + "grad_norm": 0.930090606212616, + "learning_rate": 3.038711621426007e-05, + "loss": 
0.7571, + "step": 38725 + }, + { + "epoch": 1.4910490856592877, + "grad_norm": 1.0837591886520386, + "learning_rate": 3.0365409874028074e-05, + "loss": 0.756, + "step": 38730 + }, + { + "epoch": 1.4912415784408084, + "grad_norm": 1.3901774883270264, + "learning_rate": 3.0343709901418084e-05, + "loss": 0.8403, + "step": 38735 + }, + { + "epoch": 1.4914340712223293, + "grad_norm": 1.2014509439468384, + "learning_rate": 3.032201629841437e-05, + "loss": 0.9118, + "step": 38740 + }, + { + "epoch": 1.49162656400385, + "grad_norm": 1.7022885084152222, + "learning_rate": 3.0300329067000677e-05, + "loss": 0.7848, + "step": 38745 + }, + { + "epoch": 1.4918190567853706, + "grad_norm": 1.2013940811157227, + "learning_rate": 3.0278648209160176e-05, + "loss": 0.8609, + "step": 38750 + }, + { + "epoch": 1.4920115495668913, + "grad_norm": 1.0092146396636963, + "learning_rate": 3.0256973726875436e-05, + "loss": 0.8774, + "step": 38755 + }, + { + "epoch": 1.492204042348412, + "grad_norm": 1.953743577003479, + "learning_rate": 3.0235305622128483e-05, + "loss": 0.8166, + "step": 38760 + }, + { + "epoch": 1.4923965351299326, + "grad_norm": 2.1745972633361816, + "learning_rate": 3.0213643896900646e-05, + "loss": 0.7682, + "step": 38765 + }, + { + "epoch": 1.4925890279114533, + "grad_norm": 1.1667450666427612, + "learning_rate": 3.0191988553172778e-05, + "loss": 0.8615, + "step": 38770 + }, + { + "epoch": 1.492781520692974, + "grad_norm": 1.3305031061172485, + "learning_rate": 3.0170339592925125e-05, + "loss": 0.8538, + "step": 38775 + }, + { + "epoch": 1.4929740134744947, + "grad_norm": 1.1499173641204834, + "learning_rate": 3.0148697018137374e-05, + "loss": 0.8741, + "step": 38780 + }, + { + "epoch": 1.4931665062560153, + "grad_norm": 1.1782283782958984, + "learning_rate": 3.0127060830788477e-05, + "loss": 0.7251, + "step": 38785 + }, + { + "epoch": 1.493358999037536, + "grad_norm": 0.9500432014465332, + "learning_rate": 3.010543103285708e-05, + "loss": 0.8529, + "step": 38790 + }, + { 
+ "epoch": 1.4935514918190567, + "grad_norm": 1.2924103736877441, + "learning_rate": 3.008380762632097e-05, + "loss": 0.8166, + "step": 38795 + }, + { + "epoch": 1.4937439846005776, + "grad_norm": 1.125877022743225, + "learning_rate": 3.0062190613157505e-05, + "loss": 0.8048, + "step": 38800 + }, + { + "epoch": 1.4939364773820982, + "grad_norm": 1.3560032844543457, + "learning_rate": 3.00405799953434e-05, + "loss": 0.8974, + "step": 38805 + }, + { + "epoch": 1.494128970163619, + "grad_norm": 1.8842239379882812, + "learning_rate": 3.0018975774854864e-05, + "loss": 0.8511, + "step": 38810 + }, + { + "epoch": 1.4943214629451396, + "grad_norm": 1.4684277772903442, + "learning_rate": 2.9997377953667337e-05, + "loss": 0.7795, + "step": 38815 + }, + { + "epoch": 1.4945139557266602, + "grad_norm": 1.0859155654907227, + "learning_rate": 2.9975786533755935e-05, + "loss": 0.8679, + "step": 38820 + }, + { + "epoch": 1.494706448508181, + "grad_norm": 1.0024845600128174, + "learning_rate": 2.995420151709496e-05, + "loss": 0.99, + "step": 38825 + }, + { + "epoch": 1.4948989412897016, + "grad_norm": 1.1407462358474731, + "learning_rate": 2.9932622905658238e-05, + "loss": 0.9279, + "step": 38830 + }, + { + "epoch": 1.4950914340712222, + "grad_norm": 1.0193146467208862, + "learning_rate": 2.9911050701419007e-05, + "loss": 0.7641, + "step": 38835 + }, + { + "epoch": 1.4952839268527431, + "grad_norm": 0.9654618501663208, + "learning_rate": 2.9889484906349918e-05, + "loss": 0.745, + "step": 38840 + }, + { + "epoch": 1.4954764196342638, + "grad_norm": 0.9131496548652649, + "learning_rate": 2.9867925522422957e-05, + "loss": 0.8413, + "step": 38845 + }, + { + "epoch": 1.4956689124157845, + "grad_norm": 1.6795357465744019, + "learning_rate": 2.9846372551609637e-05, + "loss": 0.8643, + "step": 38850 + }, + { + "epoch": 1.4958614051973051, + "grad_norm": 0.6970282793045044, + "learning_rate": 2.982482599588081e-05, + "loss": 0.671, + "step": 38855 + }, + { + "epoch": 1.4960538979788258, + 
"grad_norm": 2.4015212059020996, + "learning_rate": 2.9803285857206787e-05, + "loss": 1.0025, + "step": 38860 + }, + { + "epoch": 1.4962463907603465, + "grad_norm": 0.9597976207733154, + "learning_rate": 2.9781752137557296e-05, + "loss": 0.8127, + "step": 38865 + }, + { + "epoch": 1.4964388835418672, + "grad_norm": 1.1318256855010986, + "learning_rate": 2.9760224838901372e-05, + "loss": 0.7509, + "step": 38870 + }, + { + "epoch": 1.4966313763233878, + "grad_norm": 1.4549952745437622, + "learning_rate": 2.9738703963207604e-05, + "loss": 0.8471, + "step": 38875 + }, + { + "epoch": 1.4968238691049085, + "grad_norm": 1.4533936977386475, + "learning_rate": 2.9717189512443922e-05, + "loss": 0.9046, + "step": 38880 + }, + { + "epoch": 1.4970163618864292, + "grad_norm": 1.5385301113128662, + "learning_rate": 2.9695681488577697e-05, + "loss": 0.8941, + "step": 38885 + }, + { + "epoch": 1.4972088546679498, + "grad_norm": 1.1736408472061157, + "learning_rate": 2.9674179893575626e-05, + "loss": 0.7051, + "step": 38890 + }, + { + "epoch": 1.4974013474494705, + "grad_norm": 1.6953740119934082, + "learning_rate": 2.965268472940399e-05, + "loss": 0.8457, + "step": 38895 + }, + { + "epoch": 1.4975938402309914, + "grad_norm": 0.6899955868721008, + "learning_rate": 2.9631195998028305e-05, + "loss": 0.7057, + "step": 38900 + }, + { + "epoch": 1.497786333012512, + "grad_norm": 1.721205472946167, + "learning_rate": 2.960971370141359e-05, + "loss": 0.9267, + "step": 38905 + }, + { + "epoch": 1.4979788257940327, + "grad_norm": 1.408977746963501, + "learning_rate": 2.958823784152426e-05, + "loss": 0.6861, + "step": 38910 + }, + { + "epoch": 1.4981713185755534, + "grad_norm": 1.3313335180282593, + "learning_rate": 2.9566768420324188e-05, + "loss": 0.8315, + "step": 38915 + }, + { + "epoch": 1.498363811357074, + "grad_norm": 1.7257198095321655, + "learning_rate": 2.9545305439776494e-05, + "loss": 0.7739, + "step": 38920 + }, + { + "epoch": 1.4985563041385948, + "grad_norm": 
1.2223293781280518, + "learning_rate": 2.9523848901843953e-05, + "loss": 0.8514, + "step": 38925 + }, + { + "epoch": 1.4987487969201154, + "grad_norm": 0.9924798607826233, + "learning_rate": 2.950239880848852e-05, + "loss": 0.8442, + "step": 38930 + }, + { + "epoch": 1.4989412897016363, + "grad_norm": 1.178801417350769, + "learning_rate": 2.9480955161671774e-05, + "loss": 0.9034, + "step": 38935 + }, + { + "epoch": 1.499133782483157, + "grad_norm": 0.8374056220054626, + "learning_rate": 2.94595179633545e-05, + "loss": 0.7436, + "step": 38940 + }, + { + "epoch": 1.4993262752646777, + "grad_norm": 1.187247633934021, + "learning_rate": 2.9438087215497013e-05, + "loss": 0.8352, + "step": 38945 + }, + { + "epoch": 1.4995187680461983, + "grad_norm": 0.9191497564315796, + "learning_rate": 2.9416662920059024e-05, + "loss": 0.7596, + "step": 38950 + }, + { + "epoch": 1.499711260827719, + "grad_norm": 0.9378131628036499, + "learning_rate": 2.9395245078999665e-05, + "loss": 0.7416, + "step": 38955 + }, + { + "epoch": 1.4999037536092397, + "grad_norm": 1.5277785062789917, + "learning_rate": 2.937383369427741e-05, + "loss": 0.6751, + "step": 38960 + }, + { + "epoch": 1.5000962463907603, + "grad_norm": 0.8396899104118347, + "learning_rate": 2.9352428767850204e-05, + "loss": 0.8376, + "step": 38965 + }, + { + "epoch": 1.500288739172281, + "grad_norm": 1.3302197456359863, + "learning_rate": 2.9331030301675388e-05, + "loss": 0.875, + "step": 38970 + }, + { + "epoch": 1.5004812319538017, + "grad_norm": 1.8585106134414673, + "learning_rate": 2.9309638297709708e-05, + "loss": 0.804, + "step": 38975 + }, + { + "epoch": 1.5006737247353223, + "grad_norm": 1.9096689224243164, + "learning_rate": 2.928825275790936e-05, + "loss": 0.8378, + "step": 38980 + }, + { + "epoch": 1.500866217516843, + "grad_norm": 1.947161078453064, + "learning_rate": 2.9266873684229846e-05, + "loss": 0.9819, + "step": 38985 + }, + { + "epoch": 1.5010587102983637, + "grad_norm": 1.3019959926605225, + 
"learning_rate": 2.924550107862617e-05, + "loss": 0.7941, + "step": 38990 + }, + { + "epoch": 1.5012512030798844, + "grad_norm": 1.5726149082183838, + "learning_rate": 2.9224134943052717e-05, + "loss": 0.8298, + "step": 38995 + }, + { + "epoch": 1.5014436958614052, + "grad_norm": 1.7521274089813232, + "learning_rate": 2.92027752794633e-05, + "loss": 0.7795, + "step": 39000 + }, + { + "epoch": 1.501636188642926, + "grad_norm": 1.5771510601043701, + "learning_rate": 2.918142208981104e-05, + "loss": 0.7677, + "step": 39005 + }, + { + "epoch": 1.5018286814244466, + "grad_norm": 1.1696386337280273, + "learning_rate": 2.9160075376048656e-05, + "loss": 0.735, + "step": 39010 + }, + { + "epoch": 1.5020211742059673, + "grad_norm": 1.4408166408538818, + "learning_rate": 2.913873514012807e-05, + "loss": 0.8714, + "step": 39015 + }, + { + "epoch": 1.502213666987488, + "grad_norm": 0.9149922728538513, + "learning_rate": 2.9117401384000753e-05, + "loss": 0.6836, + "step": 39020 + }, + { + "epoch": 1.5024061597690088, + "grad_norm": 1.5898869037628174, + "learning_rate": 2.9096074109617522e-05, + "loss": 0.7895, + "step": 39025 + }, + { + "epoch": 1.5025986525505295, + "grad_norm": 1.2088052034378052, + "learning_rate": 2.907475331892865e-05, + "loss": 0.7443, + "step": 39030 + }, + { + "epoch": 1.5027911453320502, + "grad_norm": 1.8943687677383423, + "learning_rate": 2.905343901388369e-05, + "loss": 0.9301, + "step": 39035 + }, + { + "epoch": 1.5029836381135708, + "grad_norm": 1.088670253753662, + "learning_rate": 2.903213119643181e-05, + "loss": 0.5987, + "step": 39040 + }, + { + "epoch": 1.5031761308950915, + "grad_norm": 1.1137605905532837, + "learning_rate": 2.9010829868521393e-05, + "loss": 0.8678, + "step": 39045 + }, + { + "epoch": 1.5033686236766122, + "grad_norm": 1.1481714248657227, + "learning_rate": 2.8989535032100312e-05, + "loss": 0.8033, + "step": 39050 + }, + { + "epoch": 1.5035611164581328, + "grad_norm": 1.2706315517425537, + "learning_rate": 
2.8968246689115862e-05, + "loss": 0.7202, + "step": 39055 + }, + { + "epoch": 1.5037536092396535, + "grad_norm": 1.0178372859954834, + "learning_rate": 2.8946964841514735e-05, + "loss": 0.8806, + "step": 39060 + }, + { + "epoch": 1.5039461020211742, + "grad_norm": 1.0535964965820312, + "learning_rate": 2.8925689491242958e-05, + "loss": 0.8236, + "step": 39065 + }, + { + "epoch": 1.5041385948026949, + "grad_norm": 0.9003209471702576, + "learning_rate": 2.8904420640246054e-05, + "loss": 0.7783, + "step": 39070 + }, + { + "epoch": 1.5043310875842155, + "grad_norm": 1.5490949153900146, + "learning_rate": 2.8883158290468916e-05, + "loss": 0.8359, + "step": 39075 + }, + { + "epoch": 1.5045235803657362, + "grad_norm": 0.9985781908035278, + "learning_rate": 2.886190244385585e-05, + "loss": 1.091, + "step": 39080 + }, + { + "epoch": 1.5047160731472569, + "grad_norm": 1.7629387378692627, + "learning_rate": 2.884065310235059e-05, + "loss": 0.9146, + "step": 39085 + }, + { + "epoch": 1.5049085659287775, + "grad_norm": 1.2450439929962158, + "learning_rate": 2.8819410267896173e-05, + "loss": 0.7949, + "step": 39090 + }, + { + "epoch": 1.5051010587102982, + "grad_norm": 1.3784174919128418, + "learning_rate": 2.879817394243517e-05, + "loss": 0.8738, + "step": 39095 + }, + { + "epoch": 1.505293551491819, + "grad_norm": 1.2022943496704102, + "learning_rate": 2.877694412790949e-05, + "loss": 0.8506, + "step": 39100 + }, + { + "epoch": 1.5054860442733398, + "grad_norm": 1.1159878969192505, + "learning_rate": 2.8755720826260492e-05, + "loss": 0.7559, + "step": 39105 + }, + { + "epoch": 1.5056785370548604, + "grad_norm": 1.8302503824234009, + "learning_rate": 2.8734504039428813e-05, + "loss": 0.839, + "step": 39110 + }, + { + "epoch": 1.505871029836381, + "grad_norm": 1.0273211002349854, + "learning_rate": 2.871329376935471e-05, + "loss": 0.7604, + "step": 39115 + }, + { + "epoch": 1.506063522617902, + "grad_norm": 2.0029349327087402, + "learning_rate": 2.8692090017977626e-05, + "loss": 
0.8672, + "step": 39120 + }, + { + "epoch": 1.5062560153994227, + "grad_norm": 1.7396565675735474, + "learning_rate": 2.8670892787236536e-05, + "loss": 0.8613, + "step": 39125 + }, + { + "epoch": 1.5064485081809433, + "grad_norm": 1.0118446350097656, + "learning_rate": 2.8649702079069797e-05, + "loss": 0.7605, + "step": 39130 + }, + { + "epoch": 1.506641000962464, + "grad_norm": 1.3765640258789062, + "learning_rate": 2.8628517895415173e-05, + "loss": 0.7897, + "step": 39135 + }, + { + "epoch": 1.5068334937439847, + "grad_norm": 0.9701485633850098, + "learning_rate": 2.8607340238209747e-05, + "loss": 0.7784, + "step": 39140 + }, + { + "epoch": 1.5070259865255053, + "grad_norm": 1.591834306716919, + "learning_rate": 2.858616910939017e-05, + "loss": 0.837, + "step": 39145 + }, + { + "epoch": 1.507218479307026, + "grad_norm": 1.928481936454773, + "learning_rate": 2.8565004510892336e-05, + "loss": 0.8828, + "step": 39150 + }, + { + "epoch": 1.5074109720885467, + "grad_norm": 1.1933850049972534, + "learning_rate": 2.854384644465161e-05, + "loss": 0.7229, + "step": 39155 + }, + { + "epoch": 1.5076034648700674, + "grad_norm": 1.4722340106964111, + "learning_rate": 2.852269491260279e-05, + "loss": 0.7789, + "step": 39160 + }, + { + "epoch": 1.507795957651588, + "grad_norm": 1.0103994607925415, + "learning_rate": 2.8501549916680047e-05, + "loss": 0.8624, + "step": 39165 + }, + { + "epoch": 1.5079884504331087, + "grad_norm": 1.9095921516418457, + "learning_rate": 2.848041145881687e-05, + "loss": 0.9677, + "step": 39170 + }, + { + "epoch": 1.5081809432146294, + "grad_norm": 0.9631972312927246, + "learning_rate": 2.845927954094635e-05, + "loss": 0.6126, + "step": 39175 + }, + { + "epoch": 1.50837343599615, + "grad_norm": 0.9686654210090637, + "learning_rate": 2.8438154165000774e-05, + "loss": 0.7955, + "step": 39180 + }, + { + "epoch": 1.5085659287776707, + "grad_norm": 1.5519697666168213, + "learning_rate": 2.8417035332911946e-05, + "loss": 0.8444, + "step": 39185 + }, + { + 
"epoch": 1.5087584215591914, + "grad_norm": 0.9993698596954346, + "learning_rate": 2.839592304661107e-05, + "loss": 0.8372, + "step": 39190 + }, + { + "epoch": 1.5089509143407123, + "grad_norm": 2.2285327911376953, + "learning_rate": 2.8374817308028645e-05, + "loss": 0.8041, + "step": 39195 + }, + { + "epoch": 1.509143407122233, + "grad_norm": 1.6649866104125977, + "learning_rate": 2.8353718119094762e-05, + "loss": 0.8547, + "step": 39200 + }, + { + "epoch": 1.5093358999037536, + "grad_norm": 1.4915626049041748, + "learning_rate": 2.8332625481738705e-05, + "loss": 0.9186, + "step": 39205 + }, + { + "epoch": 1.5095283926852743, + "grad_norm": 0.9645226001739502, + "learning_rate": 2.8311539397889307e-05, + "loss": 0.6819, + "step": 39210 + }, + { + "epoch": 1.509720885466795, + "grad_norm": 1.2806873321533203, + "learning_rate": 2.829045986947474e-05, + "loss": 0.8868, + "step": 39215 + }, + { + "epoch": 1.5099133782483158, + "grad_norm": 1.3241313695907593, + "learning_rate": 2.82693868984226e-05, + "loss": 0.7592, + "step": 39220 + }, + { + "epoch": 1.5101058710298365, + "grad_norm": 0.9667055606842041, + "learning_rate": 2.824832048665981e-05, + "loss": 0.8439, + "step": 39225 + }, + { + "epoch": 1.5102983638113572, + "grad_norm": 1.5384224653244019, + "learning_rate": 2.822726063611285e-05, + "loss": 0.7319, + "step": 39230 + }, + { + "epoch": 1.5104908565928779, + "grad_norm": 1.0840857028961182, + "learning_rate": 2.8206207348707435e-05, + "loss": 0.7401, + "step": 39235 + }, + { + "epoch": 1.5106833493743985, + "grad_norm": 0.7421225905418396, + "learning_rate": 2.8185160626368755e-05, + "loss": 0.7408, + "step": 39240 + }, + { + "epoch": 1.5108758421559192, + "grad_norm": 1.7684946060180664, + "learning_rate": 2.8164120471021417e-05, + "loss": 0.8469, + "step": 39245 + }, + { + "epoch": 1.5110683349374399, + "grad_norm": 1.4627081155776978, + "learning_rate": 2.814308688458942e-05, + "loss": 0.8621, + "step": 39250 + }, + { + "epoch": 1.5112608277189605, + 
"grad_norm": 0.941478967666626, + "learning_rate": 2.8122059868996055e-05, + "loss": 0.7465, + "step": 39255 + }, + { + "epoch": 1.5114533205004812, + "grad_norm": 1.5532718896865845, + "learning_rate": 2.810103942616423e-05, + "loss": 0.7821, + "step": 39260 + }, + { + "epoch": 1.5116458132820019, + "grad_norm": 0.9734495282173157, + "learning_rate": 2.8080025558016033e-05, + "loss": 0.8102, + "step": 39265 + }, + { + "epoch": 1.5118383060635225, + "grad_norm": 1.1670575141906738, + "learning_rate": 2.8059018266473068e-05, + "loss": 0.8069, + "step": 39270 + }, + { + "epoch": 1.5120307988450432, + "grad_norm": 1.0015208721160889, + "learning_rate": 2.803801755345633e-05, + "loss": 0.8015, + "step": 39275 + }, + { + "epoch": 1.5122232916265639, + "grad_norm": 1.1417772769927979, + "learning_rate": 2.8017023420886202e-05, + "loss": 0.7996, + "step": 39280 + }, + { + "epoch": 1.5124157844080846, + "grad_norm": 1.6754858493804932, + "learning_rate": 2.7996035870682402e-05, + "loss": 0.7731, + "step": 39285 + }, + { + "epoch": 1.5126082771896054, + "grad_norm": 2.357158899307251, + "learning_rate": 2.797505490476415e-05, + "loss": 0.817, + "step": 39290 + }, + { + "epoch": 1.5128007699711261, + "grad_norm": 0.9653787612915039, + "learning_rate": 2.7954080525050007e-05, + "loss": 0.7779, + "step": 39295 + }, + { + "epoch": 1.5129932627526468, + "grad_norm": 0.9732620716094971, + "learning_rate": 2.7933112733457933e-05, + "loss": 0.8127, + "step": 39300 + }, + { + "epoch": 1.5131857555341675, + "grad_norm": 0.9965852499008179, + "learning_rate": 2.791215153190533e-05, + "loss": 0.9347, + "step": 39305 + }, + { + "epoch": 1.5133782483156881, + "grad_norm": 1.4929567575454712, + "learning_rate": 2.7891196922308905e-05, + "loss": 0.8412, + "step": 39310 + }, + { + "epoch": 1.513570741097209, + "grad_norm": 1.590889811515808, + "learning_rate": 2.787024890658484e-05, + "loss": 0.792, + "step": 39315 + }, + { + "epoch": 1.5137632338787297, + "grad_norm": 1.5452625751495361, + 
"learning_rate": 2.7849307486648702e-05, + "loss": 0.8094, + "step": 39320 + }, + { + "epoch": 1.5139557266602504, + "grad_norm": 1.8608287572860718, + "learning_rate": 2.782837266441546e-05, + "loss": 0.8371, + "step": 39325 + }, + { + "epoch": 1.514148219441771, + "grad_norm": 1.141189455986023, + "learning_rate": 2.780744444179939e-05, + "loss": 0.8949, + "step": 39330 + }, + { + "epoch": 1.5143407122232917, + "grad_norm": 1.3165732622146606, + "learning_rate": 2.7786522820714355e-05, + "loss": 0.8983, + "step": 39335 + }, + { + "epoch": 1.5145332050048124, + "grad_norm": 1.809031367301941, + "learning_rate": 2.776560780307341e-05, + "loss": 0.7893, + "step": 39340 + }, + { + "epoch": 1.514725697786333, + "grad_norm": 0.9109099507331848, + "learning_rate": 2.774469939078912e-05, + "loss": 0.7076, + "step": 39345 + }, + { + "epoch": 1.5149181905678537, + "grad_norm": 1.9604289531707764, + "learning_rate": 2.7723797585773426e-05, + "loss": 0.916, + "step": 39350 + }, + { + "epoch": 1.5151106833493744, + "grad_norm": 1.2984347343444824, + "learning_rate": 2.770290238993768e-05, + "loss": 0.9168, + "step": 39355 + }, + { + "epoch": 1.515303176130895, + "grad_norm": 2.4720041751861572, + "learning_rate": 2.768201380519253e-05, + "loss": 0.8866, + "step": 39360 + }, + { + "epoch": 1.5154956689124157, + "grad_norm": 0.9475586414337158, + "learning_rate": 2.7661131833448216e-05, + "loss": 0.8357, + "step": 39365 + }, + { + "epoch": 1.5156881616939364, + "grad_norm": 1.0659955739974976, + "learning_rate": 2.7640256476614157e-05, + "loss": 0.8456, + "step": 39370 + }, + { + "epoch": 1.515880654475457, + "grad_norm": 1.2105602025985718, + "learning_rate": 2.7619387736599312e-05, + "loss": 0.7651, + "step": 39375 + }, + { + "epoch": 1.5160731472569777, + "grad_norm": 1.759903907775879, + "learning_rate": 2.7598525615311975e-05, + "loss": 0.7607, + "step": 39380 + }, + { + "epoch": 1.5162656400384984, + "grad_norm": 0.6724603176116943, + "learning_rate": 
2.7577670114659892e-05, + "loss": 0.7695, + "step": 39385 + }, + { + "epoch": 1.5164581328200193, + "grad_norm": 1.3016246557235718, + "learning_rate": 2.755682123655009e-05, + "loss": 0.8609, + "step": 39390 + }, + { + "epoch": 1.51665062560154, + "grad_norm": 1.1983976364135742, + "learning_rate": 2.75359789828891e-05, + "loss": 0.8225, + "step": 39395 + }, + { + "epoch": 1.5168431183830606, + "grad_norm": 1.0239381790161133, + "learning_rate": 2.7515143355582786e-05, + "loss": 0.6805, + "step": 39400 + }, + { + "epoch": 1.5170356111645813, + "grad_norm": 1.6102731227874756, + "learning_rate": 2.7494314356536456e-05, + "loss": 0.8922, + "step": 39405 + }, + { + "epoch": 1.517228103946102, + "grad_norm": 1.7071436643600464, + "learning_rate": 2.7473491987654798e-05, + "loss": 0.9025, + "step": 39410 + }, + { + "epoch": 1.5174205967276229, + "grad_norm": 1.8176323175430298, + "learning_rate": 2.7452676250841824e-05, + "loss": 0.9699, + "step": 39415 + }, + { + "epoch": 1.5176130895091435, + "grad_norm": 2.1587765216827393, + "learning_rate": 2.7431867148001024e-05, + "loss": 1.0543, + "step": 39420 + }, + { + "epoch": 1.5178055822906642, + "grad_norm": 1.8511626720428467, + "learning_rate": 2.741106468103526e-05, + "loss": 0.7427, + "step": 39425 + }, + { + "epoch": 1.5179980750721849, + "grad_norm": 1.2232860326766968, + "learning_rate": 2.739026885184679e-05, + "loss": 0.834, + "step": 39430 + }, + { + "epoch": 1.5181905678537055, + "grad_norm": 0.9527449011802673, + "learning_rate": 2.7369479662337195e-05, + "loss": 0.8507, + "step": 39435 + }, + { + "epoch": 1.5183830606352262, + "grad_norm": 1.3037556409835815, + "learning_rate": 2.7348697114407595e-05, + "loss": 0.8492, + "step": 39440 + }, + { + "epoch": 1.5185755534167469, + "grad_norm": 0.8813561797142029, + "learning_rate": 2.732792120995832e-05, + "loss": 0.8644, + "step": 39445 + }, + { + "epoch": 1.5187680461982676, + "grad_norm": 1.0147926807403564, + "learning_rate": 2.7307151950889308e-05, + "loss": 
0.7337, + "step": 39450 + }, + { + "epoch": 1.5189605389797882, + "grad_norm": 1.2371786832809448, + "learning_rate": 2.728638933909966e-05, + "loss": 0.7856, + "step": 39455 + }, + { + "epoch": 1.519153031761309, + "grad_norm": 1.3520861864089966, + "learning_rate": 2.726563337648803e-05, + "loss": 0.8224, + "step": 39460 + }, + { + "epoch": 1.5193455245428296, + "grad_norm": 1.1333503723144531, + "learning_rate": 2.7244884064952414e-05, + "loss": 0.7827, + "step": 39465 + }, + { + "epoch": 1.5195380173243502, + "grad_norm": 1.4535999298095703, + "learning_rate": 2.7224141406390214e-05, + "loss": 0.9174, + "step": 39470 + }, + { + "epoch": 1.519730510105871, + "grad_norm": 1.9915587902069092, + "learning_rate": 2.7203405402698133e-05, + "loss": 0.9327, + "step": 39475 + }, + { + "epoch": 1.5199230028873916, + "grad_norm": 1.0400042533874512, + "learning_rate": 2.7182676055772448e-05, + "loss": 0.7821, + "step": 39480 + }, + { + "epoch": 1.5201154956689125, + "grad_norm": 1.4389541149139404, + "learning_rate": 2.7161953367508643e-05, + "loss": 0.826, + "step": 39485 + }, + { + "epoch": 1.5203079884504331, + "grad_norm": 1.9297611713409424, + "learning_rate": 2.7141237339801685e-05, + "loss": 0.9919, + "step": 39490 + }, + { + "epoch": 1.5205004812319538, + "grad_norm": 1.1149492263793945, + "learning_rate": 2.712052797454594e-05, + "loss": 0.7048, + "step": 39495 + }, + { + "epoch": 1.5206929740134745, + "grad_norm": 1.5707048177719116, + "learning_rate": 2.709982527363515e-05, + "loss": 0.8483, + "step": 39500 + }, + { + "epoch": 1.5208854667949951, + "grad_norm": 1.289869785308838, + "learning_rate": 2.7079129238962386e-05, + "loss": 0.6966, + "step": 39505 + }, + { + "epoch": 1.521077959576516, + "grad_norm": 1.144384741783142, + "learning_rate": 2.7058439872420204e-05, + "loss": 0.7713, + "step": 39510 + }, + { + "epoch": 1.5212704523580367, + "grad_norm": 0.8240516781806946, + "learning_rate": 2.7037757175900503e-05, + "loss": 0.7334, + "step": 39515 + }, + { 
+ "epoch": 1.5214629451395574, + "grad_norm": 2.318265199661255, + "learning_rate": 2.7017081151294576e-05, + "loss": 0.8006, + "step": 39520 + }, + { + "epoch": 1.521655437921078, + "grad_norm": 1.3979312181472778, + "learning_rate": 2.6996411800493137e-05, + "loss": 0.7323, + "step": 39525 + }, + { + "epoch": 1.5218479307025987, + "grad_norm": 0.9698436856269836, + "learning_rate": 2.6975749125386207e-05, + "loss": 0.952, + "step": 39530 + }, + { + "epoch": 1.5220404234841194, + "grad_norm": 1.6334707736968994, + "learning_rate": 2.6955093127863272e-05, + "loss": 0.7626, + "step": 39535 + }, + { + "epoch": 1.52223291626564, + "grad_norm": 1.1066991090774536, + "learning_rate": 2.6934443809813203e-05, + "loss": 0.7948, + "step": 39540 + }, + { + "epoch": 1.5224254090471607, + "grad_norm": 1.206701636314392, + "learning_rate": 2.6913801173124253e-05, + "loss": 0.6289, + "step": 39545 + }, + { + "epoch": 1.5226179018286814, + "grad_norm": 1.3689944744110107, + "learning_rate": 2.689316521968398e-05, + "loss": 0.7864, + "step": 39550 + }, + { + "epoch": 1.522810394610202, + "grad_norm": 2.103961944580078, + "learning_rate": 2.687253595137952e-05, + "loss": 0.6763, + "step": 39555 + }, + { + "epoch": 1.5230028873917227, + "grad_norm": 1.0298928022384644, + "learning_rate": 2.6851913370097194e-05, + "loss": 0.7179, + "step": 39560 + }, + { + "epoch": 1.5231953801732434, + "grad_norm": 1.2706328630447388, + "learning_rate": 2.6831297477722815e-05, + "loss": 0.6922, + "step": 39565 + }, + { + "epoch": 1.523387872954764, + "grad_norm": 0.9494262933731079, + "learning_rate": 2.6810688276141592e-05, + "loss": 0.8778, + "step": 39570 + }, + { + "epoch": 1.5235803657362847, + "grad_norm": 0.9296857118606567, + "learning_rate": 2.6790085767238126e-05, + "loss": 0.8307, + "step": 39575 + }, + { + "epoch": 1.5237728585178054, + "grad_norm": 1.7679259777069092, + "learning_rate": 2.6769489952896275e-05, + "loss": 0.8162, + "step": 39580 + }, + { + "epoch": 1.5239653512993263, + 
"grad_norm": 1.1842856407165527, + "learning_rate": 2.6748900834999534e-05, + "loss": 0.724, + "step": 39585 + }, + { + "epoch": 1.524157844080847, + "grad_norm": 0.9693066477775574, + "learning_rate": 2.672831841543053e-05, + "loss": 0.8326, + "step": 39590 + }, + { + "epoch": 1.5243503368623676, + "grad_norm": 1.6108996868133545, + "learning_rate": 2.6707742696071425e-05, + "loss": 0.7797, + "step": 39595 + }, + { + "epoch": 1.5245428296438883, + "grad_norm": 2.4159984588623047, + "learning_rate": 2.6687173678803734e-05, + "loss": 0.9232, + "step": 39600 + }, + { + "epoch": 1.5247353224254092, + "grad_norm": 1.5150139331817627, + "learning_rate": 2.6666611365508388e-05, + "loss": 0.7605, + "step": 39605 + }, + { + "epoch": 1.5249278152069299, + "grad_norm": 1.5143542289733887, + "learning_rate": 2.6646055758065613e-05, + "loss": 0.9481, + "step": 39610 + }, + { + "epoch": 1.5251203079884506, + "grad_norm": 0.9554632306098938, + "learning_rate": 2.662550685835511e-05, + "loss": 0.9355, + "step": 39615 + }, + { + "epoch": 1.5253128007699712, + "grad_norm": 1.936998963356018, + "learning_rate": 2.660496466825595e-05, + "loss": 0.7781, + "step": 39620 + }, + { + "epoch": 1.525505293551492, + "grad_norm": 2.4360251426696777, + "learning_rate": 2.6584429189646576e-05, + "loss": 0.8181, + "step": 39625 + }, + { + "epoch": 1.5256977863330126, + "grad_norm": 1.0304756164550781, + "learning_rate": 2.6563900424404843e-05, + "loss": 0.6611, + "step": 39630 + }, + { + "epoch": 1.5258902791145332, + "grad_norm": 1.5006756782531738, + "learning_rate": 2.6543378374407913e-05, + "loss": 0.8139, + "step": 39635 + }, + { + "epoch": 1.526082771896054, + "grad_norm": 2.475623846054077, + "learning_rate": 2.652286304153243e-05, + "loss": 0.8565, + "step": 39640 + }, + { + "epoch": 1.5262752646775746, + "grad_norm": 1.3540338277816772, + "learning_rate": 2.6502354427654375e-05, + "loss": 0.9392, + "step": 39645 + }, + { + "epoch": 1.5264677574590952, + "grad_norm": 1.5708937644958496, 
+ "learning_rate": 2.6481852534649165e-05, + "loss": 0.7956, + "step": 39650 + }, + { + "epoch": 1.526660250240616, + "grad_norm": 1.0419323444366455, + "learning_rate": 2.646135736439147e-05, + "loss": 0.7812, + "step": 39655 + }, + { + "epoch": 1.5268527430221366, + "grad_norm": 1.3944579362869263, + "learning_rate": 2.6440868918755556e-05, + "loss": 0.9234, + "step": 39660 + }, + { + "epoch": 1.5270452358036573, + "grad_norm": 1.2345746755599976, + "learning_rate": 2.642038719961486e-05, + "loss": 0.7003, + "step": 39665 + }, + { + "epoch": 1.527237728585178, + "grad_norm": 1.7590235471725464, + "learning_rate": 2.6399912208842337e-05, + "loss": 1.0322, + "step": 39670 + }, + { + "epoch": 1.5274302213666986, + "grad_norm": 1.5209788084030151, + "learning_rate": 2.637944394831028e-05, + "loss": 0.7132, + "step": 39675 + }, + { + "epoch": 1.5276227141482195, + "grad_norm": 1.2953201532363892, + "learning_rate": 2.635898241989042e-05, + "loss": 0.8862, + "step": 39680 + }, + { + "epoch": 1.5278152069297402, + "grad_norm": 0.8863399028778076, + "learning_rate": 2.6338527625453725e-05, + "loss": 0.8335, + "step": 39685 + }, + { + "epoch": 1.5280076997112608, + "grad_norm": 0.9998905062675476, + "learning_rate": 2.631807956687078e-05, + "loss": 0.8523, + "step": 39690 + }, + { + "epoch": 1.5282001924927815, + "grad_norm": 1.509939432144165, + "learning_rate": 2.62976382460113e-05, + "loss": 0.7919, + "step": 39695 + }, + { + "epoch": 1.5283926852743022, + "grad_norm": 1.2834575176239014, + "learning_rate": 2.627720366474463e-05, + "loss": 0.8921, + "step": 39700 + }, + { + "epoch": 1.528585178055823, + "grad_norm": 0.9761720299720764, + "learning_rate": 2.6256775824939283e-05, + "loss": 0.7869, + "step": 39705 + }, + { + "epoch": 1.5287776708373437, + "grad_norm": 2.2494795322418213, + "learning_rate": 2.6236354728463286e-05, + "loss": 0.765, + "step": 39710 + }, + { + "epoch": 1.5289701636188644, + "grad_norm": 1.8769516944885254, + "learning_rate": 
2.6215940377184002e-05, + "loss": 0.6916, + "step": 39715 + }, + { + "epoch": 1.529162656400385, + "grad_norm": 0.8097474575042725, + "learning_rate": 2.6195532772968235e-05, + "loss": 0.7983, + "step": 39720 + }, + { + "epoch": 1.5293551491819057, + "grad_norm": 1.8454664945602417, + "learning_rate": 2.617513191768205e-05, + "loss": 0.8183, + "step": 39725 + }, + { + "epoch": 1.5295476419634264, + "grad_norm": 1.6850463151931763, + "learning_rate": 2.615473781319101e-05, + "loss": 1.0079, + "step": 39730 + }, + { + "epoch": 1.529740134744947, + "grad_norm": 1.442956566810608, + "learning_rate": 2.6134350461360013e-05, + "loss": 0.7613, + "step": 39735 + }, + { + "epoch": 1.5299326275264677, + "grad_norm": 1.101926326751709, + "learning_rate": 2.6113969864053356e-05, + "loss": 0.7822, + "step": 39740 + }, + { + "epoch": 1.5301251203079884, + "grad_norm": 0.8331781625747681, + "learning_rate": 2.6093596023134736e-05, + "loss": 0.8167, + "step": 39745 + }, + { + "epoch": 1.530317613089509, + "grad_norm": 1.0116935968399048, + "learning_rate": 2.6073228940467142e-05, + "loss": 0.9155, + "step": 39750 + }, + { + "epoch": 1.5305101058710298, + "grad_norm": 1.2244682312011719, + "learning_rate": 2.605286861791304e-05, + "loss": 0.7738, + "step": 39755 + }, + { + "epoch": 1.5307025986525504, + "grad_norm": 1.583099126815796, + "learning_rate": 2.603251505733424e-05, + "loss": 0.8342, + "step": 39760 + }, + { + "epoch": 1.530895091434071, + "grad_norm": 2.7618401050567627, + "learning_rate": 2.6012168260591997e-05, + "loss": 0.7933, + "step": 39765 + }, + { + "epoch": 1.5310875842155918, + "grad_norm": 1.7654789686203003, + "learning_rate": 2.5991828229546766e-05, + "loss": 0.9115, + "step": 39770 + }, + { + "epoch": 1.5312800769971127, + "grad_norm": 1.1165564060211182, + "learning_rate": 2.597149496605865e-05, + "loss": 0.8808, + "step": 39775 + }, + { + "epoch": 1.5314725697786333, + "grad_norm": 2.018768310546875, + "learning_rate": 2.5951168471986896e-05, + "loss": 
0.6884, + "step": 39780 + }, + { + "epoch": 1.531665062560154, + "grad_norm": 1.567855954170227, + "learning_rate": 2.593084874919025e-05, + "loss": 0.7812, + "step": 39785 + }, + { + "epoch": 1.5318575553416747, + "grad_norm": 1.3749490976333618, + "learning_rate": 2.5910535799526834e-05, + "loss": 0.8049, + "step": 39790 + }, + { + "epoch": 1.5320500481231953, + "grad_norm": 0.9157447814941406, + "learning_rate": 2.5890229624854146e-05, + "loss": 0.7877, + "step": 39795 + }, + { + "epoch": 1.5322425409047162, + "grad_norm": 1.0359165668487549, + "learning_rate": 2.586993022702897e-05, + "loss": 0.6694, + "step": 39800 + }, + { + "epoch": 1.532435033686237, + "grad_norm": 1.5438929796218872, + "learning_rate": 2.584963760790766e-05, + "loss": 0.7098, + "step": 39805 + }, + { + "epoch": 1.5326275264677576, + "grad_norm": 1.700568675994873, + "learning_rate": 2.5829351769345765e-05, + "loss": 0.6406, + "step": 39810 + }, + { + "epoch": 1.5328200192492782, + "grad_norm": 1.7474377155303955, + "learning_rate": 2.5809072713198313e-05, + "loss": 0.7675, + "step": 39815 + }, + { + "epoch": 1.533012512030799, + "grad_norm": 1.255887746810913, + "learning_rate": 2.5788800441319693e-05, + "loss": 0.8331, + "step": 39820 + }, + { + "epoch": 1.5332050048123196, + "grad_norm": 1.3320515155792236, + "learning_rate": 2.5768534955563694e-05, + "loss": 0.685, + "step": 39825 + }, + { + "epoch": 1.5333974975938403, + "grad_norm": 1.2977993488311768, + "learning_rate": 2.574827625778341e-05, + "loss": 0.9356, + "step": 39830 + }, + { + "epoch": 1.533589990375361, + "grad_norm": 1.2286765575408936, + "learning_rate": 2.572802434983139e-05, + "loss": 0.7974, + "step": 39835 + }, + { + "epoch": 1.5337824831568816, + "grad_norm": 1.5404809713363647, + "learning_rate": 2.5707779233559537e-05, + "loss": 0.8141, + "step": 39840 + }, + { + "epoch": 1.5339749759384023, + "grad_norm": 2.166562795639038, + "learning_rate": 2.568754091081913e-05, + "loss": 0.7906, + "step": 39845 + }, + { + 
"epoch": 1.534167468719923, + "grad_norm": 2.0202114582061768, + "learning_rate": 2.5667309383460857e-05, + "loss": 0.8012, + "step": 39850 + }, + { + "epoch": 1.5343599615014436, + "grad_norm": 1.2766706943511963, + "learning_rate": 2.5647084653334707e-05, + "loss": 0.8452, + "step": 39855 + }, + { + "epoch": 1.5345524542829643, + "grad_norm": 1.7136098146438599, + "learning_rate": 2.562686672229012e-05, + "loss": 0.7912, + "step": 39860 + }, + { + "epoch": 1.534744947064485, + "grad_norm": 1.3089460134506226, + "learning_rate": 2.56066555921759e-05, + "loss": 0.7512, + "step": 39865 + }, + { + "epoch": 1.5349374398460056, + "grad_norm": 1.317051649093628, + "learning_rate": 2.558645126484024e-05, + "loss": 0.9469, + "step": 39870 + }, + { + "epoch": 1.5351299326275265, + "grad_norm": 1.0458593368530273, + "learning_rate": 2.5566253742130608e-05, + "loss": 0.8018, + "step": 39875 + }, + { + "epoch": 1.5353224254090472, + "grad_norm": 0.9164959788322449, + "learning_rate": 2.5546063025894053e-05, + "loss": 0.7815, + "step": 39880 + }, + { + "epoch": 1.5355149181905678, + "grad_norm": 1.2855099439620972, + "learning_rate": 2.5525879117976794e-05, + "loss": 0.7955, + "step": 39885 + }, + { + "epoch": 1.5357074109720885, + "grad_norm": 1.3438894748687744, + "learning_rate": 2.5505702020224542e-05, + "loss": 0.724, + "step": 39890 + }, + { + "epoch": 1.5358999037536092, + "grad_norm": 1.1192864179611206, + "learning_rate": 2.548553173448236e-05, + "loss": 0.8516, + "step": 39895 + }, + { + "epoch": 1.53609239653513, + "grad_norm": 1.1312519311904907, + "learning_rate": 2.5465368262594726e-05, + "loss": 0.8709, + "step": 39900 + }, + { + "epoch": 1.5362848893166507, + "grad_norm": 1.1640228033065796, + "learning_rate": 2.544521160640535e-05, + "loss": 0.8806, + "step": 39905 + }, + { + "epoch": 1.5364773820981714, + "grad_norm": 0.9546281695365906, + "learning_rate": 2.5425061767757553e-05, + "loss": 0.7814, + "step": 39910 + }, + { + "epoch": 1.536669874879692, + 
"grad_norm": 1.2589070796966553, + "learning_rate": 2.540491874849381e-05, + "loss": 0.8007, + "step": 39915 + }, + { + "epoch": 1.5368623676612128, + "grad_norm": 1.1835721731185913, + "learning_rate": 2.53847825504561e-05, + "loss": 0.8729, + "step": 39920 + }, + { + "epoch": 1.5370548604427334, + "grad_norm": 1.403292179107666, + "learning_rate": 2.5364653175485754e-05, + "loss": 0.8297, + "step": 39925 + }, + { + "epoch": 1.537247353224254, + "grad_norm": 1.9332057237625122, + "learning_rate": 2.534453062542348e-05, + "loss": 0.8723, + "step": 39930 + }, + { + "epoch": 1.5374398460057748, + "grad_norm": 1.8926912546157837, + "learning_rate": 2.5324414902109316e-05, + "loss": 0.732, + "step": 39935 + }, + { + "epoch": 1.5376323387872954, + "grad_norm": 1.3487472534179688, + "learning_rate": 2.5304306007382716e-05, + "loss": 0.863, + "step": 39940 + }, + { + "epoch": 1.537824831568816, + "grad_norm": 1.4540005922317505, + "learning_rate": 2.528420394308253e-05, + "loss": 0.6911, + "step": 39945 + }, + { + "epoch": 1.5380173243503368, + "grad_norm": 1.1179677248001099, + "learning_rate": 2.526410871104694e-05, + "loss": 0.779, + "step": 39950 + }, + { + "epoch": 1.5382098171318574, + "grad_norm": 1.332623839378357, + "learning_rate": 2.5244020313113558e-05, + "loss": 0.6558, + "step": 39955 + }, + { + "epoch": 1.5384023099133781, + "grad_norm": 2.073150634765625, + "learning_rate": 2.5223938751119248e-05, + "loss": 0.9246, + "step": 39960 + }, + { + "epoch": 1.5385948026948988, + "grad_norm": 1.640775203704834, + "learning_rate": 2.5203864026900448e-05, + "loss": 0.7962, + "step": 39965 + }, + { + "epoch": 1.5387872954764197, + "grad_norm": 1.4655123949050903, + "learning_rate": 2.518379614229278e-05, + "loss": 0.8892, + "step": 39970 + }, + { + "epoch": 1.5389797882579404, + "grad_norm": 0.9323534369468689, + "learning_rate": 2.5163735099131336e-05, + "loss": 0.7165, + "step": 39975 + }, + { + "epoch": 1.539172281039461, + "grad_norm": 1.2594026327133179, + 
"learning_rate": 2.5143680899250577e-05, + "loss": 0.6767, + "step": 39980 + }, + { + "epoch": 1.5393647738209817, + "grad_norm": 1.3983330726623535, + "learning_rate": 2.5123633544484347e-05, + "loss": 0.7979, + "step": 39985 + }, + { + "epoch": 1.5395572666025024, + "grad_norm": 1.1931166648864746, + "learning_rate": 2.510359303666576e-05, + "loss": 0.8501, + "step": 39990 + }, + { + "epoch": 1.5397497593840233, + "grad_norm": 1.4925061464309692, + "learning_rate": 2.5083559377627518e-05, + "loss": 0.8053, + "step": 39995 + }, + { + "epoch": 1.539942252165544, + "grad_norm": 1.4643807411193848, + "learning_rate": 2.5063532569201454e-05, + "loss": 0.765, + "step": 40000 } ], "logging_steps": 5, @@ -42014,7 +56014,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, - "total_flos": 9.370503974649938e+17, + "total_flos": 1.2478892695491133e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null