{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005980861244019139, "grad_norm": 3.412325382232666, "learning_rate": 0.0002, "loss": 3.3895, "step": 25 }, { "epoch": 0.011961722488038277, "grad_norm": 9.85770034790039, "learning_rate": 0.0002, "loss": 2.6708, "step": 50 }, { "epoch": 0.017942583732057416, "grad_norm": 1.8022898435592651, "learning_rate": 0.0002, "loss": 2.4984, "step": 75 }, { "epoch": 0.023923444976076555, "grad_norm": 1.3909275531768799, "learning_rate": 0.0002, "loss": 2.4433, "step": 100 }, { "epoch": 0.029904306220095694, "grad_norm": 1.3175278902053833, "learning_rate": 0.0002, "loss": 2.4068, "step": 125 }, { "epoch": 0.03588516746411483, "grad_norm": 2.1698503494262695, "learning_rate": 0.0002, "loss": 2.3572, "step": 150 }, { "epoch": 0.041866028708133975, "grad_norm": 3.156744956970215, "learning_rate": 0.0002, "loss": 2.3665, "step": 175 }, { "epoch": 0.04784688995215311, "grad_norm": 1.0966124534606934, "learning_rate": 0.0002, "loss": 2.3631, "step": 200 }, { "epoch": 0.05382775119617225, "grad_norm": 1.5008922815322876, "learning_rate": 0.0002, "loss": 2.3375, "step": 225 }, { "epoch": 0.05980861244019139, "grad_norm": 1.0263694524765015, "learning_rate": 0.0002, "loss": 2.3668, "step": 250 }, { "epoch": 0.06578947368421052, "grad_norm": 0.9050750136375427, "learning_rate": 0.0002, "loss": 2.2953, "step": 275 }, { "epoch": 0.07177033492822966, "grad_norm": 1.1184417009353638, "learning_rate": 0.0002, "loss": 2.3101, "step": 300 }, { "epoch": 0.07775119617224881, "grad_norm": 1.2090150117874146, "learning_rate": 0.0002, "loss": 2.3292, "step": 325 }, { "epoch": 0.08373205741626795, "grad_norm": 2.347069263458252, "learning_rate": 0.0002, "loss": 2.3108, "step": 350 }, { "epoch": 0.08971291866028708, "grad_norm": 1.3362812995910645, "learning_rate": 0.0002, "loss": 2.2901, "step": 375 }, { "epoch": 0.09569377990430622, "grad_norm": 0.906521201133728, "learning_rate": 0.0002, "loss": 2.3137, "step": 400 }, { "epoch": 0.10167464114832536, "grad_norm": 0.8491584658622742, "learning_rate": 0.0002, "loss": 2.2915, "step": 425 }, { "epoch": 0.1076555023923445, "grad_norm": 0.9403386116027832, "learning_rate": 0.0002, "loss": 2.2874, "step": 450 }, { "epoch": 0.11363636363636363, "grad_norm": 0.7675734758377075, "learning_rate": 0.0002, "loss": 2.2536, "step": 475 }, { "epoch": 0.11961722488038277, "grad_norm": 0.8333762288093567, "learning_rate": 0.0002, "loss": 2.3332, "step": 500 }, { "epoch": 0.1255980861244019, "grad_norm": 0.8489273190498352, "learning_rate": 0.0002, "loss": 2.3112, "step": 525 }, { "epoch": 0.13157894736842105, "grad_norm": 1.2032957077026367, "learning_rate": 0.0002, "loss": 2.2614, "step": 550 }, { "epoch": 0.1375598086124402, "grad_norm": 0.8014360070228577, "learning_rate": 0.0002, "loss": 2.3014, "step": 575 }, { "epoch": 0.14354066985645933, "grad_norm": 0.8756849765777588, "learning_rate": 0.0002, "loss": 2.2626, "step": 600 }, { "epoch": 0.14952153110047847, "grad_norm": 1.0479413270950317, "learning_rate": 0.0002, "loss": 2.2487, "step": 625 }, { "epoch": 0.15550239234449761, "grad_norm": 1.6525335311889648, "learning_rate": 0.0002, "loss": 2.2643, "step": 650 }, { "epoch": 0.16148325358851676, "grad_norm": 0.7974942922592163, "learning_rate": 0.0002, "loss": 2.2983, "step": 675 }, { "epoch": 0.1674641148325359, "grad_norm": 0.8229785561561584, "learning_rate": 0.0002, "loss": 2.27, "step": 700 }, { "epoch": 0.173444976076555, "grad_norm": 0.9374330639839172, "learning_rate": 0.0002, "loss": 2.2322, "step": 725 }, { "epoch": 0.17942583732057416, "grad_norm": 0.8249229788780212, "learning_rate": 0.0002, "loss": 2.3029, "step": 750 }, { "epoch": 0.1854066985645933, "grad_norm": 0.8934934735298157, "learning_rate": 0.0002, "loss": 2.2204, "step": 775 }, { "epoch": 0.19138755980861244, "grad_norm": 0.8451672196388245, "learning_rate": 0.0002, "loss": 2.2576, "step": 800 }, { "epoch": 0.19736842105263158, "grad_norm": 0.8721255660057068, "learning_rate": 0.0002, "loss": 2.2357, "step": 825 }, { "epoch": 0.20334928229665072, "grad_norm": 0.9069824814796448, "learning_rate": 0.0002, "loss": 2.2624, "step": 850 }, { "epoch": 0.20933014354066987, "grad_norm": 0.8029842972755432, "learning_rate": 0.0002, "loss": 2.2697, "step": 875 }, { "epoch": 0.215311004784689, "grad_norm": 3.0026650428771973, "learning_rate": 0.0002, "loss": 2.2058, "step": 900 }, { "epoch": 0.22129186602870812, "grad_norm": 0.8026193380355835, "learning_rate": 0.0002, "loss": 2.2537, "step": 925 }, { "epoch": 0.22727272727272727, "grad_norm": 0.770354688167572, "learning_rate": 0.0002, "loss": 2.3118, "step": 950 }, { "epoch": 0.2332535885167464, "grad_norm": 0.822100043296814, "learning_rate": 0.0002, "loss": 2.2443, "step": 975 }, { "epoch": 0.23923444976076555, "grad_norm": 0.8492611050605774, "learning_rate": 0.0002, "loss": 2.2295, "step": 1000 }, { "epoch": 0.2452153110047847, "grad_norm": 0.7530927658081055, "learning_rate": 0.0002, "loss": 2.2321, "step": 1025 }, { "epoch": 0.2511961722488038, "grad_norm": 0.7999204993247986, "learning_rate": 0.0002, "loss": 2.2462, "step": 1050 }, { "epoch": 0.25717703349282295, "grad_norm": 0.765783965587616, "learning_rate": 0.0002, "loss": 2.2362, "step": 1075 }, { "epoch": 0.2631578947368421, "grad_norm": 1.9569802284240723, "learning_rate": 0.0002, "loss": 2.2309, "step": 1100 }, { "epoch": 0.26913875598086123, "grad_norm": 0.8249408602714539, "learning_rate": 0.0002, "loss": 2.2433, "step": 1125 }, { "epoch": 0.2751196172248804, "grad_norm": 0.848108172416687, "learning_rate": 0.0002, "loss": 2.2013, "step": 1150 }, { "epoch": 0.2811004784688995, "grad_norm": 0.8488432765007019, "learning_rate": 0.0002, "loss": 2.2393, "step": 1175 }, { "epoch": 0.28708133971291866, "grad_norm": 0.7786160111427307, "learning_rate": 0.0002, "loss": 2.2247, "step": 1200 }, { "epoch": 0.2930622009569378, "grad_norm": 2.2969539165496826, "learning_rate": 0.0002, "loss": 2.2523, "step": 1225 }, { "epoch": 0.29904306220095694, "grad_norm": 0.8230640292167664, "learning_rate": 0.0002, "loss": 2.2327, "step": 1250 }, { "epoch": 0.3050239234449761, "grad_norm": 0.8330740928649902, "learning_rate": 0.0002, "loss": 2.2306, "step": 1275 }, { "epoch": 0.31100478468899523, "grad_norm": 0.8412021994590759, "learning_rate": 0.0002, "loss": 2.2299, "step": 1300 }, { "epoch": 0.31698564593301437, "grad_norm": 0.8107555508613586, "learning_rate": 0.0002, "loss": 2.264, "step": 1325 }, { "epoch": 0.3229665071770335, "grad_norm": 2.283083200454712, "learning_rate": 0.0002, "loss": 2.2085, "step": 1350 }, { "epoch": 0.32894736842105265, "grad_norm": 0.7781470417976379, "learning_rate": 0.0002, "loss": 2.2123, "step": 1375 }, { "epoch": 0.3349282296650718, "grad_norm": 0.7660220265388489, "learning_rate": 0.0002, "loss": 2.2525, "step": 1400 }, { "epoch": 0.3409090909090909, "grad_norm": 0.8373708724975586, "learning_rate": 0.0002, "loss": 2.2142, "step": 1425 }, { "epoch": 0.34688995215311, "grad_norm": 0.7727882862091064, "learning_rate": 0.0002, "loss": 2.1824, "step": 1450 }, { "epoch": 0.35287081339712917, "grad_norm": 0.9092174768447876, "learning_rate": 0.0002, "loss": 2.1939, "step": 1475 }, { "epoch": 0.3588516746411483, "grad_norm": 0.8021971583366394, "learning_rate": 0.0002, "loss": 2.2175, "step": 1500 }, { "epoch": 0.36483253588516745, "grad_norm": 0.7922872304916382, "learning_rate": 0.0002, "loss": 2.2079, "step": 1525 }, { "epoch": 0.3708133971291866, "grad_norm": 1.0842111110687256, "learning_rate": 0.0002, "loss": 2.1879, "step": 1550 }, { "epoch": 0.37679425837320574, "grad_norm": 0.9562531113624573, "learning_rate": 0.0002, "loss": 2.1896, "step": 1575 }, { "epoch": 0.3827751196172249, "grad_norm": 0.8320727348327637, "learning_rate": 0.0002, "loss": 2.2254, "step": 1600 }, { "epoch": 0.388755980861244, "grad_norm": 0.80451899766922, "learning_rate": 0.0002, "loss": 2.2144, "step": 1625 }, { "epoch": 0.39473684210526316, "grad_norm": 0.7288826704025269, "learning_rate": 0.0002, "loss": 2.2349, "step": 1650 }, { "epoch": 0.4007177033492823, "grad_norm": 0.8023431897163391, "learning_rate": 0.0002, "loss": 2.2037, "step": 1675 }, { "epoch": 0.40669856459330145, "grad_norm": 0.8532123565673828, "learning_rate": 0.0002, "loss": 2.2314, "step": 1700 }, { "epoch": 0.4126794258373206, "grad_norm": 0.7851171493530273, "learning_rate": 0.0002, "loss": 2.185, "step": 1725 }, { "epoch": 0.41866028708133973, "grad_norm": 0.8515769243240356, "learning_rate": 0.0002, "loss": 2.232, "step": 1750 }, { "epoch": 0.4246411483253589, "grad_norm": 0.782311201095581, "learning_rate": 0.0002, "loss": 2.1859, "step": 1775 }, { "epoch": 0.430622009569378, "grad_norm": 0.7590478658676147, "learning_rate": 0.0002, "loss": 2.1837, "step": 1800 }, { "epoch": 0.4366028708133971, "grad_norm": 0.7843049168586731, "learning_rate": 0.0002, "loss": 2.2069, "step": 1825 }, { "epoch": 0.44258373205741625, "grad_norm": 0.7173344492912292, "learning_rate": 0.0002, "loss": 2.1989, "step": 1850 }, { "epoch": 0.4485645933014354, "grad_norm": 0.8669169545173645, "learning_rate": 0.0002, "loss": 2.1941, "step": 1875 }, { "epoch": 0.45454545454545453, "grad_norm": 1.45564603805542, "learning_rate": 0.0002, "loss": 2.2045, "step": 1900 }, { "epoch": 0.4605263157894737, "grad_norm": 0.8243363499641418, "learning_rate": 0.0002, "loss": 2.1819, "step": 1925 }, { "epoch": 0.4665071770334928, "grad_norm": 0.8817090392112732, "learning_rate": 0.0002, "loss": 2.1533, "step": 1950 }, { "epoch": 0.47248803827751196, "grad_norm": 0.82022625207901, "learning_rate": 0.0002, "loss": 2.243, "step": 1975 }, { "epoch": 0.4784688995215311, "grad_norm": 0.8863716721534729, "learning_rate": 0.0002, "loss": 2.1638, "step": 2000 }, { "epoch": 0.48444976076555024, "grad_norm": 0.7413605451583862, "learning_rate": 0.0002, "loss": 2.2107, "step": 2025 }, { "epoch": 0.4904306220095694, "grad_norm": 0.8566731810569763, "learning_rate": 0.0002, "loss": 2.148, "step": 2050 }, { "epoch": 0.4964114832535885, "grad_norm": 0.9010487794876099, "learning_rate": 0.0002, "loss": 2.1828, "step": 2075 }, { "epoch": 0.5023923444976076, "grad_norm": 0.8197215795516968, "learning_rate": 0.0002, "loss": 2.1683, "step": 2100 }, { "epoch": 0.5083732057416268, "grad_norm": 0.9159034490585327, "learning_rate": 0.0002, "loss": 2.2269, "step": 2125 }, { "epoch": 0.5143540669856459, "grad_norm": 0.8410281538963318, "learning_rate": 0.0002, "loss": 2.2111, "step": 2150 }, { "epoch": 0.5203349282296651, "grad_norm": 1.8926668167114258, "learning_rate": 0.0002, "loss": 2.1884, "step": 2175 }, { "epoch": 0.5263157894736842, "grad_norm": 1.0006904602050781, "learning_rate": 0.0002, "loss": 2.192, "step": 2200 }, { "epoch": 0.5322966507177034, "grad_norm": 0.7826078534126282, "learning_rate": 0.0002, "loss": 2.1894, "step": 2225 }, { "epoch": 0.5382775119617225, "grad_norm": 0.8574744462966919, "learning_rate": 0.0002, "loss": 2.1854, "step": 2250 }, { "epoch": 0.5442583732057417, "grad_norm": 0.9018279314041138, "learning_rate": 0.0002, "loss": 2.1725, "step": 2275 }, { "epoch": 0.5502392344497608, "grad_norm": 1.3028662204742432, "learning_rate": 0.0002, "loss": 2.1944, "step": 2300 }, { "epoch": 0.55622009569378, "grad_norm": 0.8321689367294312, "learning_rate": 0.0002, "loss": 2.1652, "step": 2325 }, { "epoch": 0.562200956937799, "grad_norm": 0.7999281287193298, "learning_rate": 0.0002, "loss": 2.2041, "step": 2350 }, { "epoch": 0.5681818181818182, "grad_norm": 4.9575629234313965, "learning_rate": 0.0002, "loss": 2.2154, "step": 2375 }, { "epoch": 0.5741626794258373, "grad_norm": 0.7689957022666931, "learning_rate": 0.0002, "loss": 2.1695, "step": 2400 }, { "epoch": 0.5801435406698564, "grad_norm": 0.9012035131454468, "learning_rate": 0.0002, "loss": 2.1557, "step": 2425 }, { "epoch": 0.5861244019138756, "grad_norm": 0.8276737928390503, "learning_rate": 0.0002, "loss": 2.1906, "step": 2450 }, { "epoch": 0.5921052631578947, "grad_norm": 0.9128056168556213, "learning_rate": 0.0002, "loss": 2.1445, "step": 2475 }, { "epoch": 0.5980861244019139, "grad_norm": 0.8623008131980896, "learning_rate": 0.0002, "loss": 2.1852, "step": 2500 }, { "epoch": 0.604066985645933, "grad_norm": 0.7866010665893555, "learning_rate": 0.0002, "loss": 2.173, "step": 2525 }, { "epoch": 0.6100478468899522, "grad_norm": 0.8097877502441406, "learning_rate": 0.0002, "loss": 2.1324, "step": 2550 }, { "epoch": 0.6160287081339713, "grad_norm": 0.8153032660484314, "learning_rate": 0.0002, "loss": 2.1271, "step": 2575 }, { "epoch": 0.6220095693779905, "grad_norm": 0.7427578568458557, "learning_rate": 0.0002, "loss": 2.1617, "step": 2600 }, { "epoch": 0.6279904306220095, "grad_norm": 0.8749725222587585, "learning_rate": 0.0002, "loss": 2.1961, "step": 2625 }, { "epoch": 0.6339712918660287, "grad_norm": 0.7804417610168457, "learning_rate": 0.0002, "loss": 2.1782, "step": 2650 }, { "epoch": 0.6399521531100478, "grad_norm": 0.8692734837532043, "learning_rate": 0.0002, "loss": 2.1852, "step": 2675 }, { "epoch": 0.645933014354067, "grad_norm": 0.8000411987304688, "learning_rate": 0.0002, "loss": 2.1628, "step": 2700 }, { "epoch": 0.6519138755980861, "grad_norm": 0.9027504324913025, "learning_rate": 0.0002, "loss": 2.1666, "step": 2725 }, { "epoch": 0.6578947368421053, "grad_norm": 0.8674067854881287, "learning_rate": 0.0002, "loss": 2.1394, "step": 2750 }, { "epoch": 0.6638755980861244, "grad_norm": 1.3791645765304565, "learning_rate": 0.0002, "loss": 2.1626, "step": 2775 }, { "epoch": 0.6698564593301436, "grad_norm": 0.8177993297576904, "learning_rate": 0.0002, "loss": 2.1664, "step": 2800 }, { "epoch": 0.6758373205741627, "grad_norm": 0.8040952682495117, "learning_rate": 0.0002, "loss": 2.1603, "step": 2825 }, { "epoch": 0.6818181818181818, "grad_norm": 0.8698276281356812, "learning_rate": 0.0002, "loss": 2.2068, "step": 2850 }, { "epoch": 0.687799043062201, "grad_norm": 0.8038722276687622, "learning_rate": 0.0002, "loss": 2.1528, "step": 2875 }, { "epoch": 0.69377990430622, "grad_norm": 0.8705615401268005, "learning_rate": 0.0002, "loss": 2.1538, "step": 2900 }, { "epoch": 0.6997607655502392, "grad_norm": 0.9985973834991455, "learning_rate": 0.0002, "loss": 2.1686, "step": 2925 }, { "epoch": 0.7057416267942583, "grad_norm": 0.7473865747451782, "learning_rate": 0.0002, "loss": 2.2257, "step": 2950 }, { "epoch": 0.7117224880382775, "grad_norm": 0.8028366565704346, "learning_rate": 0.0002, "loss": 2.1712, "step": 2975 }, { "epoch": 0.7177033492822966, "grad_norm": 0.767857551574707, "learning_rate": 0.0002, "loss": 2.1782, "step": 3000 }, { "epoch": 0.7236842105263158, "grad_norm": 0.7830066680908203, "learning_rate": 0.0002, "loss": 2.1488, "step": 3025 }, { "epoch": 0.7296650717703349, "grad_norm": 0.8238586783409119, "learning_rate": 0.0002, "loss": 2.1588, "step": 3050 }, { "epoch": 0.7356459330143541, "grad_norm": 0.7727087140083313, "learning_rate": 0.0002, "loss": 2.1398, "step": 3075 }, { "epoch": 0.7416267942583732, "grad_norm": 0.8918077945709229, "learning_rate": 0.0002, "loss": 2.1829, "step": 3100 }, { "epoch": 0.7476076555023924, "grad_norm": 1.202504277229309, "learning_rate": 0.0002, "loss": 2.1784, "step": 3125 }, { "epoch": 0.7535885167464115, "grad_norm": 0.8316906094551086, "learning_rate": 0.0002, "loss": 2.1806, "step": 3150 }, { "epoch": 0.7595693779904307, "grad_norm": 0.7766339182853699, "learning_rate": 0.0002, "loss": 2.1577, "step": 3175 }, { "epoch": 0.7655502392344498, "grad_norm": 0.9902828931808472, "learning_rate": 0.0002, "loss": 2.1519, "step": 3200 }, { "epoch": 0.7715311004784688, "grad_norm": 0.895126461982727, "learning_rate": 0.0002, "loss": 2.1712, "step": 3225 }, { "epoch": 0.777511961722488, "grad_norm": 0.8055546879768372, "learning_rate": 0.0002, "loss": 2.1814, "step": 3250 }, { "epoch": 0.7834928229665071, "grad_norm": 0.7867780327796936, "learning_rate": 0.0002, "loss": 2.1507, "step": 3275 }, { "epoch": 0.7894736842105263, "grad_norm": 0.8065791726112366, "learning_rate": 0.0002, "loss": 2.2211, "step": 3300 }, { "epoch": 0.7954545454545454, "grad_norm": 1.5913640260696411, "learning_rate": 0.0002, "loss": 2.2257, "step": 3325 }, { "epoch": 0.8014354066985646, "grad_norm": 0.7849767208099365, "learning_rate": 0.0002, "loss": 2.1893, "step": 3350 }, { "epoch": 0.8074162679425837, "grad_norm": 0.7633355855941772, "learning_rate": 0.0002, "loss": 2.1956, "step": 3375 }, { "epoch": 0.8133971291866029, "grad_norm": 0.8164528608322144, "learning_rate": 0.0002, "loss": 2.2224, "step": 3400 }, { "epoch": 0.819377990430622, "grad_norm": 0.7906235456466675, "learning_rate": 0.0002, "loss": 2.1833, "step": 3425 }, { "epoch": 0.8253588516746412, "grad_norm": 0.8774910569190979, "learning_rate": 0.0002, "loss": 2.1471, "step": 3450 }, { "epoch": 0.8313397129186603, "grad_norm": 0.8200404644012451, "learning_rate": 0.0002, "loss": 2.1554, "step": 3475 }, { "epoch": 0.8373205741626795, "grad_norm": 0.7728098630905151, "learning_rate": 0.0002, "loss": 2.2009, "step": 3500 }, { "epoch": 0.8433014354066986, "grad_norm": 0.7523846626281738, "learning_rate": 0.0002, "loss": 2.2052, "step": 3525 }, { "epoch": 0.8492822966507177, "grad_norm": 0.8525931239128113, "learning_rate": 0.0002, "loss": 2.1811, "step": 3550 }, { "epoch": 0.8552631578947368, "grad_norm": 0.7875164747238159, "learning_rate": 0.0002, "loss": 2.1308, "step": 3575 }, { "epoch": 0.861244019138756, "grad_norm": 0.7879646420478821, "learning_rate": 0.0002, "loss": 2.1665, "step": 3600 }, { "epoch": 0.8672248803827751, "grad_norm": 0.7715153694152832, "learning_rate": 0.0002, "loss": 2.1593, "step": 3625 }, { "epoch": 0.8732057416267942, "grad_norm": 0.8685998320579529, "learning_rate": 0.0002, "loss": 2.18, "step": 3650 }, { "epoch": 0.8791866028708134, "grad_norm": 0.8396874666213989, "learning_rate": 0.0002, "loss": 2.1639, "step": 3675 }, { "epoch": 0.8851674641148325, "grad_norm": 0.8163192272186279, "learning_rate": 0.0002, "loss": 2.1742, "step": 3700 }, { "epoch": 0.8911483253588517, "grad_norm": 0.8341553211212158, "learning_rate": 0.0002, "loss": 2.1569, "step": 3725 }, { "epoch": 0.8971291866028708, "grad_norm": 0.7632786631584167, "learning_rate": 0.0002, "loss": 2.1596, "step": 3750 }, { "epoch": 0.90311004784689, "grad_norm": 0.7861719131469727, "learning_rate": 0.0002, "loss": 2.1853, "step": 3775 }, { "epoch": 0.9090909090909091, "grad_norm": 0.8243244886398315, "learning_rate": 0.0002, "loss": 2.1695, "step": 3800 }, { "epoch": 0.9150717703349283, "grad_norm": 0.760749876499176, "learning_rate": 0.0002, "loss": 2.1501, "step": 3825 }, { "epoch": 0.9210526315789473, "grad_norm": 0.9622604250907898, "learning_rate": 0.0002, "loss": 2.1256, "step": 3850 }, { "epoch": 0.9270334928229665, "grad_norm": 0.7732083797454834, "learning_rate": 0.0002, "loss": 2.1433, "step": 3875 }, { "epoch": 0.9330143540669856, "grad_norm": 0.7828539609909058, "learning_rate": 0.0002, "loss": 2.1356, "step": 3900 }, { "epoch": 0.9389952153110048, "grad_norm": 0.8860824704170227, "learning_rate": 0.0002, "loss": 2.1525, "step": 3925 }, { "epoch": 0.9449760765550239, "grad_norm": 0.8569679260253906, "learning_rate": 0.0002, "loss": 2.1501, "step": 3950 }, { "epoch": 0.9509569377990431, "grad_norm": 0.7966086864471436, "learning_rate": 0.0002, "loss": 2.1484, "step": 3975 }, { "epoch": 0.9569377990430622, "grad_norm": 0.7861948609352112, "learning_rate": 0.0002, "loss": 2.1461, "step": 4000 }, { "epoch": 0.9629186602870813, "grad_norm": 0.8073152303695679, "learning_rate": 0.0002, "loss": 2.1681, "step": 4025 }, { "epoch": 0.9688995215311005, "grad_norm": 0.8233998417854309, "learning_rate": 0.0002, "loss": 2.1513, "step": 4050 }, { "epoch": 0.9748803827751196, "grad_norm": 0.836236834526062, "learning_rate": 0.0002, "loss": 2.1665, "step": 4075 }, { "epoch": 0.9808612440191388, "grad_norm": 0.7221957445144653, "learning_rate": 0.0002, "loss": 2.1079, "step": 4100 }, { "epoch": 0.9868421052631579, "grad_norm": 0.7149819731712341, "learning_rate": 0.0002, "loss": 2.1858, "step": 4125 }, { "epoch": 0.992822966507177, "grad_norm": 0.7578993439674377, "learning_rate": 0.0002, "loss": 2.1467, "step": 4150 }, { "epoch": 0.9988038277511961, "grad_norm": 1.0370241403579712, "learning_rate": 0.0002, "loss": 2.1626, "step": 4175 } ], "logging_steps": 25, "max_steps": 4180, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.194945264893952e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }