|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4180, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005980861244019139, |
|
"grad_norm": 3.412325382232666, |
|
"learning_rate": 0.0002, |
|
"loss": 3.3895, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.011961722488038277, |
|
"grad_norm": 9.85770034790039, |
|
"learning_rate": 0.0002, |
|
"loss": 2.6708, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017942583732057416, |
|
"grad_norm": 1.8022898435592651, |
|
"learning_rate": 0.0002, |
|
"loss": 2.4984, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.023923444976076555, |
|
"grad_norm": 1.3909275531768799, |
|
"learning_rate": 0.0002, |
|
"loss": 2.4433, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.029904306220095694, |
|
"grad_norm": 1.3175278902053833, |
|
"learning_rate": 0.0002, |
|
"loss": 2.4068, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03588516746411483, |
|
"grad_norm": 2.1698503494262695, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3572, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.041866028708133975, |
|
"grad_norm": 3.156744956970215, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3665, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.04784688995215311, |
|
"grad_norm": 1.0966124534606934, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3631, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05382775119617225, |
|
"grad_norm": 1.5008922815322876, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3375, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.05980861244019139, |
|
"grad_norm": 1.0263694524765015, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3668, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06578947368421052, |
|
"grad_norm": 0.9050750136375427, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2953, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.07177033492822966, |
|
"grad_norm": 1.1184417009353638, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3101, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07775119617224881, |
|
"grad_norm": 1.2090150117874146, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3292, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.08373205741626795, |
|
"grad_norm": 2.347069263458252, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3108, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08971291866028708, |
|
"grad_norm": 1.3362812995910645, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2901, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.09569377990430622, |
|
"grad_norm": 0.906521201133728, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3137, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10167464114832536, |
|
"grad_norm": 0.8491584658622742, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2915, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1076555023923445, |
|
"grad_norm": 0.9403386116027832, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2874, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11363636363636363, |
|
"grad_norm": 0.7675734758377075, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2536, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.11961722488038277, |
|
"grad_norm": 0.8333762288093567, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3332, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1255980861244019, |
|
"grad_norm": 0.8489273190498352, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3112, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.13157894736842105, |
|
"grad_norm": 1.2032957077026367, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2614, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1375598086124402, |
|
"grad_norm": 0.8014360070228577, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3014, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.14354066985645933, |
|
"grad_norm": 0.8756849765777588, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2626, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.14952153110047847, |
|
"grad_norm": 1.0479413270950317, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2487, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.15550239234449761, |
|
"grad_norm": 1.6525335311889648, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2643, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.16148325358851676, |
|
"grad_norm": 0.7974942922592163, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2983, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.1674641148325359, |
|
"grad_norm": 0.8229785561561584, |
|
"learning_rate": 0.0002, |
|
"loss": 2.27, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.173444976076555, |
|
"grad_norm": 0.9374330639839172, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2322, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.17942583732057416, |
|
"grad_norm": 0.8249229788780212, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3029, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1854066985645933, |
|
"grad_norm": 0.8934934735298157, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2204, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.19138755980861244, |
|
"grad_norm": 0.8451672196388245, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2576, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.19736842105263158, |
|
"grad_norm": 0.8721255660057068, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2357, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.20334928229665072, |
|
"grad_norm": 0.9069824814796448, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2624, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.20933014354066987, |
|
"grad_norm": 0.8029842972755432, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2697, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.215311004784689, |
|
"grad_norm": 3.0026650428771973, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2058, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22129186602870812, |
|
"grad_norm": 0.8026193380355835, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2537, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 0.770354688167572, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3118, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.2332535885167464, |
|
"grad_norm": 0.822100043296814, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2443, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.23923444976076555, |
|
"grad_norm": 0.8492611050605774, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2295, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2452153110047847, |
|
"grad_norm": 0.7530927658081055, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2321, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.2511961722488038, |
|
"grad_norm": 0.7999204993247986, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2462, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.25717703349282295, |
|
"grad_norm": 0.765783965587616, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2362, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 1.9569802284240723, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2309, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.26913875598086123, |
|
"grad_norm": 0.8249408602714539, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2433, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.2751196172248804, |
|
"grad_norm": 0.848108172416687, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2013, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2811004784688995, |
|
"grad_norm": 0.8488432765007019, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2393, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.28708133971291866, |
|
"grad_norm": 0.7786160111427307, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2247, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2930622009569378, |
|
"grad_norm": 2.2969539165496826, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2523, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.29904306220095694, |
|
"grad_norm": 0.8230640292167664, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2327, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3050239234449761, |
|
"grad_norm": 0.8330740928649902, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2306, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.31100478468899523, |
|
"grad_norm": 0.8412021994590759, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2299, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.31698564593301437, |
|
"grad_norm": 0.8107555508613586, |
|
"learning_rate": 0.0002, |
|
"loss": 2.264, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3229665071770335, |
|
"grad_norm": 2.283083200454712, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2085, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.32894736842105265, |
|
"grad_norm": 0.7781470417976379, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2123, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.3349282296650718, |
|
"grad_norm": 0.7660220265388489, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2525, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3409090909090909, |
|
"grad_norm": 0.8373708724975586, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2142, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.34688995215311, |
|
"grad_norm": 0.7727882862091064, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1824, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.35287081339712917, |
|
"grad_norm": 0.9092174768447876, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1939, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.3588516746411483, |
|
"grad_norm": 0.8021971583366394, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2175, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.36483253588516745, |
|
"grad_norm": 0.7922872304916382, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2079, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.3708133971291866, |
|
"grad_norm": 1.0842111110687256, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1879, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.37679425837320574, |
|
"grad_norm": 0.9562531113624573, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1896, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.3827751196172249, |
|
"grad_norm": 0.8320727348327637, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2254, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.388755980861244, |
|
"grad_norm": 0.80451899766922, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2144, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.39473684210526316, |
|
"grad_norm": 0.7288826704025269, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2349, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4007177033492823, |
|
"grad_norm": 0.8023431897163391, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2037, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.40669856459330145, |
|
"grad_norm": 0.8532123565673828, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2314, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4126794258373206, |
|
"grad_norm": 0.7851171493530273, |
|
"learning_rate": 0.0002, |
|
"loss": 2.185, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.41866028708133973, |
|
"grad_norm": 0.8515769243240356, |
|
"learning_rate": 0.0002, |
|
"loss": 2.232, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4246411483253589, |
|
"grad_norm": 0.782311201095581, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1859, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.430622009569378, |
|
"grad_norm": 0.7590478658676147, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1837, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4366028708133971, |
|
"grad_norm": 0.7843049168586731, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2069, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.44258373205741625, |
|
"grad_norm": 0.7173344492912292, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1989, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4485645933014354, |
|
"grad_norm": 0.8669169545173645, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1941, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.45564603805542, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2045, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4605263157894737, |
|
"grad_norm": 0.8243363499641418, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1819, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.4665071770334928, |
|
"grad_norm": 0.8817090392112732, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1533, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.47248803827751196, |
|
"grad_norm": 0.82022625207901, |
|
"learning_rate": 0.0002, |
|
"loss": 2.243, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.4784688995215311, |
|
"grad_norm": 0.8863716721534729, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1638, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.48444976076555024, |
|
"grad_norm": 0.7413605451583862, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2107, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.4904306220095694, |
|
"grad_norm": 0.8566731810569763, |
|
"learning_rate": 0.0002, |
|
"loss": 2.148, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.4964114832535885, |
|
"grad_norm": 0.9010487794876099, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1828, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.5023923444976076, |
|
"grad_norm": 0.8197215795516968, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1683, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5083732057416268, |
|
"grad_norm": 0.9159034490585327, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2269, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.5143540669856459, |
|
"grad_norm": 0.8410281538963318, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2111, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5203349282296651, |
|
"grad_norm": 1.8926668167114258, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1884, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 1.0006904602050781, |
|
"learning_rate": 0.0002, |
|
"loss": 2.192, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5322966507177034, |
|
"grad_norm": 0.7826078534126282, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1894, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.5382775119617225, |
|
"grad_norm": 0.8574744462966919, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1854, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5442583732057417, |
|
"grad_norm": 0.9018279314041138, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1725, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.5502392344497608, |
|
"grad_norm": 1.3028662204742432, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1944, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.55622009569378, |
|
"grad_norm": 0.8321689367294312, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1652, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.562200956937799, |
|
"grad_norm": 0.7999281287193298, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2041, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5681818181818182, |
|
"grad_norm": 4.9575629234313965, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2154, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.5741626794258373, |
|
"grad_norm": 0.7689957022666931, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1695, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5801435406698564, |
|
"grad_norm": 0.9012035131454468, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1557, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.5861244019138756, |
|
"grad_norm": 0.8276737928390503, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1906, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.5921052631578947, |
|
"grad_norm": 0.9128056168556213, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1445, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.5980861244019139, |
|
"grad_norm": 0.8623008131980896, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1852, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.604066985645933, |
|
"grad_norm": 0.7866010665893555, |
|
"learning_rate": 0.0002, |
|
"loss": 2.173, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.6100478468899522, |
|
"grad_norm": 0.8097877502441406, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1324, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6160287081339713, |
|
"grad_norm": 0.8153032660484314, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1271, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.6220095693779905, |
|
"grad_norm": 0.7427578568458557, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1617, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6279904306220095, |
|
"grad_norm": 0.8749725222587585, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1961, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.6339712918660287, |
|
"grad_norm": 0.7804417610168457, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1782, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6399521531100478, |
|
"grad_norm": 0.8692734837532043, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1852, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.645933014354067, |
|
"grad_norm": 0.8000411987304688, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1628, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6519138755980861, |
|
"grad_norm": 0.9027504324913025, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1666, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.6578947368421053, |
|
"grad_norm": 0.8674067854881287, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1394, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6638755980861244, |
|
"grad_norm": 1.3791645765304565, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1626, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.6698564593301436, |
|
"grad_norm": 0.8177993297576904, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1664, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6758373205741627, |
|
"grad_norm": 0.8040952682495117, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1603, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 0.8698276281356812, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2068, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.687799043062201, |
|
"grad_norm": 0.8038722276687622, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1528, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.69377990430622, |
|
"grad_norm": 0.8705615401268005, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1538, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.6997607655502392, |
|
"grad_norm": 0.9985973834991455, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1686, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.7057416267942583, |
|
"grad_norm": 0.7473865747451782, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2257, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7117224880382775, |
|
"grad_norm": 0.8028366565704346, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1712, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.7177033492822966, |
|
"grad_norm": 0.767857551574707, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1782, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7236842105263158, |
|
"grad_norm": 0.7830066680908203, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1488, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.7296650717703349, |
|
"grad_norm": 0.8238586783409119, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1588, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7356459330143541, |
|
"grad_norm": 0.7727087140083313, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1398, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.7416267942583732, |
|
"grad_norm": 0.8918077945709229, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1829, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7476076555023924, |
|
"grad_norm": 1.202504277229309, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1784, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.7535885167464115, |
|
"grad_norm": 0.8316906094551086, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1806, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.7595693779904307, |
|
"grad_norm": 0.7766339182853699, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1577, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.7655502392344498, |
|
"grad_norm": 0.9902828931808472, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1519, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.7715311004784688, |
|
"grad_norm": 0.895126461982727, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1712, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.777511961722488, |
|
"grad_norm": 0.8055546879768372, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1814, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.7834928229665071, |
|
"grad_norm": 0.7867780327796936, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1507, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 0.8065791726112366, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2211, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7954545454545454, |
|
"grad_norm": 1.5913640260696411, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2257, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.8014354066985646, |
|
"grad_norm": 0.7849767208099365, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1893, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8074162679425837, |
|
"grad_norm": 0.7633355855941772, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1956, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.8133971291866029, |
|
"grad_norm": 0.8164528608322144, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2224, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.819377990430622, |
|
"grad_norm": 0.7906235456466675, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1833, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.8253588516746412, |
|
"grad_norm": 0.8774910569190979, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1471, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.8313397129186603, |
|
"grad_norm": 0.8200404644012451, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1554, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.8373205741626795, |
|
"grad_norm": 0.7728098630905151, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2009, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8433014354066986, |
|
"grad_norm": 0.7523846626281738, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2052, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.8492822966507177, |
|
"grad_norm": 0.8525931239128113, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1811, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.8552631578947368, |
|
"grad_norm": 0.7875164747238159, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1308, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.861244019138756, |
|
"grad_norm": 0.7879646420478821, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1665, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.8672248803827751, |
|
"grad_norm": 0.7715153694152832, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1593, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.8732057416267942, |
|
"grad_norm": 0.8685998320579529, |
|
"learning_rate": 0.0002, |
|
"loss": 2.18, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.8791866028708134, |
|
"grad_norm": 0.8396874666213989, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1639, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.8851674641148325, |
|
"grad_norm": 0.8163192272186279, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1742, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.8911483253588517, |
|
"grad_norm": 0.8341553211212158, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1569, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.8971291866028708, |
|
"grad_norm": 0.7632786631584167, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1596, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.90311004784689, |
|
"grad_norm": 0.7861719131469727, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1853, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.8243244886398315, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1695, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9150717703349283, |
|
"grad_norm": 0.760749876499176, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1501, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.9210526315789473, |
|
"grad_norm": 0.9622604250907898, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1256, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.9270334928229665, |
|
"grad_norm": 0.7732083797454834, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1433, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.9330143540669856, |
|
"grad_norm": 0.7828539609909058, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1356, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9389952153110048, |
|
"grad_norm": 0.8860824704170227, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1525, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.9449760765550239, |
|
"grad_norm": 0.8569679260253906, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1501, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.9509569377990431, |
|
"grad_norm": 0.7966086864471436, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1484, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.9569377990430622, |
|
"grad_norm": 0.7861948609352112, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1461, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9629186602870813, |
|
"grad_norm": 0.8073152303695679, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1681, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.9688995215311005, |
|
"grad_norm": 0.8233998417854309, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1513, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.9748803827751196, |
|
"grad_norm": 0.836236834526062, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1665, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.9808612440191388, |
|
"grad_norm": 0.7221957445144653, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1079, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.9868421052631579, |
|
"grad_norm": 0.7149819731712341, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1858, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.992822966507177, |
|
"grad_norm": 0.7578993439674377, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1467, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.9988038277511961, |
|
"grad_norm": 1.0370241403579712, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1626, |
|
"step": 4175 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 4180, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.194945264893952e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|