{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 64.48,
  "eval_steps": 500,
  "global_step": 4030,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.32,
      "grad_norm": 2.246424436569214,
      "learning_rate": 2.3573200992555833e-06,
      "loss": 2.826,
      "step": 20
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.9050242900848389,
      "learning_rate": 4.838709677419355e-06,
      "loss": 2.72,
      "step": 40
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.6034655570983887,
      "learning_rate": 7.320099255583126e-06,
      "loss": 2.4912,
      "step": 60
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.3487274646759033,
      "learning_rate": 9.801488833746898e-06,
      "loss": 2.0561,
      "step": 80
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.6185756921768188,
      "learning_rate": 1.2282878411910669e-05,
      "loss": 1.7744,
      "step": 100
    },
    {
      "epoch": 1.92,
      "grad_norm": 3.017139196395874,
      "learning_rate": 1.4764267990074444e-05,
      "loss": 1.8387,
      "step": 120
    },
    {
      "epoch": 2.24,
      "grad_norm": 2.2100813388824463,
      "learning_rate": 1.7245657568238215e-05,
      "loss": 1.4478,
      "step": 140
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.574629545211792,
      "learning_rate": 1.9727047146401986e-05,
      "loss": 1.285,
      "step": 160
    },
    {
      "epoch": 2.88,
      "grad_norm": 4.586638450622559,
      "learning_rate": 2.2208436724565757e-05,
      "loss": 1.2235,
      "step": 180
    },
    {
      "epoch": 3.2,
      "grad_norm": 2.7081515789031982,
      "learning_rate": 2.468982630272953e-05,
      "loss": 0.9575,
      "step": 200
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.9670729041099548,
      "learning_rate": 2.7171215880893302e-05,
      "loss": 0.7086,
      "step": 220
    },
    {
      "epoch": 3.84,
      "grad_norm": 3.229243040084839,
      "learning_rate": 2.9652605459057077e-05,
      "loss": 0.8587,
      "step": 240
    },
    {
      "epoch": 4.16,
      "grad_norm": 1.1293463706970215,
      "learning_rate": 3.2133995037220844e-05,
      "loss": 0.5978,
      "step": 260
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.7043830156326294,
      "learning_rate": 3.461538461538462e-05,
      "loss": 0.4668,
      "step": 280
    },
    {
      "epoch": 4.8,
      "grad_norm": 2.565268039703369,
      "learning_rate": 3.7096774193548386e-05,
      "loss": 0.5667,
      "step": 300
    },
    {
      "epoch": 5.12,
      "grad_norm": 1.158849835395813,
      "learning_rate": 3.957816377171216e-05,
      "loss": 0.4373,
      "step": 320
    },
    {
      "epoch": 5.44,
      "grad_norm": 2.714164972305298,
      "learning_rate": 4.205955334987593e-05,
      "loss": 0.3492,
      "step": 340
    },
    {
      "epoch": 5.76,
      "grad_norm": 2.2089672088623047,
      "learning_rate": 4.45409429280397e-05,
      "loss": 0.4018,
      "step": 360
    },
    {
      "epoch": 6.08,
      "grad_norm": 1.8179335594177246,
      "learning_rate": 4.702233250620348e-05,
      "loss": 0.279,
      "step": 380
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.4858269691467285,
      "learning_rate": 4.950372208436725e-05,
      "loss": 0.2362,
      "step": 400
    },
    {
      "epoch": 6.72,
      "grad_norm": 1.7704375982284546,
      "learning_rate": 4.99975992459978e-05,
      "loss": 0.2665,
      "step": 420
    },
    {
      "epoch": 7.04,
      "grad_norm": 1.2611212730407715,
      "learning_rate": 4.9987846973104825e-05,
      "loss": 0.2029,
      "step": 440
    },
    {
      "epoch": 7.36,
      "grad_norm": 2.994542360305786,
      "learning_rate": 4.9970596058519116e-05,
      "loss": 0.1747,
      "step": 460
    },
    {
      "epoch": 7.68,
      "grad_norm": 2.7456889152526855,
      "learning_rate": 4.994585167909436e-05,
      "loss": 0.1486,
      "step": 480
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.8236416578292847,
      "learning_rate": 4.9913621260409695e-05,
      "loss": 0.1866,
      "step": 500
    },
    {
      "epoch": 8.32,
      "grad_norm": 2.636003017425537,
      "learning_rate": 4.987391447454136e-05,
      "loss": 0.1476,
      "step": 520
    },
    {
      "epoch": 8.64,
      "grad_norm": 2.879154920578003,
      "learning_rate": 4.982674323716023e-05,
      "loss": 0.1403,
      "step": 540
    },
    {
      "epoch": 8.96,
      "grad_norm": 0.9377075433731079,
      "learning_rate": 4.977212170395598e-05,
      "loss": 0.1018,
      "step": 560
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.311233788728714,
      "learning_rate": 4.9710066266389074e-05,
      "loss": 0.0992,
      "step": 580
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.8316205739974976,
      "learning_rate": 4.964059554677187e-05,
      "loss": 0.1134,
      "step": 600
    },
    {
      "epoch": 9.92,
      "grad_norm": 2.567354679107666,
      "learning_rate": 4.956373039268022e-05,
      "loss": 0.0781,
      "step": 620
    },
    {
      "epoch": 10.24,
      "grad_norm": 0.0829504132270813,
      "learning_rate": 4.947949387069721e-05,
      "loss": 0.0892,
      "step": 640
    },
    {
      "epoch": 10.56,
      "grad_norm": 0.8588472008705139,
      "learning_rate": 4.938791125949119e-05,
      "loss": 0.0499,
      "step": 660
    },
    {
      "epoch": 10.88,
      "grad_norm": 1.2792423963546753,
      "learning_rate": 4.9289010042229765e-05,
      "loss": 0.0831,
      "step": 680
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.4728279709815979,
      "learning_rate": 4.918281989833238e-05,
      "loss": 0.0715,
      "step": 700
    },
    {
      "epoch": 11.52,
      "grad_norm": 2.5855355262756348,
      "learning_rate": 4.9069372694563756e-05,
      "loss": 0.0718,
      "step": 720
    },
    {
      "epoch": 11.84,
      "grad_norm": 0.8059779405593872,
      "learning_rate": 4.8948702475470933e-05,
      "loss": 0.0849,
      "step": 740
    },
    {
      "epoch": 12.16,
      "grad_norm": 1.2841193675994873,
      "learning_rate": 4.882084545316684e-05,
      "loss": 0.0683,
      "step": 760
    },
    {
      "epoch": 12.48,
      "grad_norm": 1.3422589302062988,
      "learning_rate": 4.868583999646329e-05,
      "loss": 0.0808,
      "step": 780
    },
    {
      "epoch": 12.8,
      "grad_norm": 1.3376965522766113,
      "learning_rate": 4.8543726619356846e-05,
      "loss": 0.0607,
      "step": 800
    },
    {
      "epoch": 13.12,
      "grad_norm": 1.008899450302124,
      "learning_rate": 4.83945479688709e-05,
      "loss": 0.062,
      "step": 820
    },
    {
      "epoch": 13.44,
      "grad_norm": 0.441413551568985,
      "learning_rate": 4.8238348812257684e-05,
      "loss": 0.0461,
      "step": 840
    },
    {
      "epoch": 13.76,
      "grad_norm": 1.296985149383545,
      "learning_rate": 4.808349953928184e-05,
      "loss": 0.0482,
      "step": 860
    },
    {
      "epoch": 14.08,
      "grad_norm": 0.035805635154247284,
      "learning_rate": 4.791374712344622e-05,
      "loss": 0.0388,
      "step": 880
    },
    {
      "epoch": 14.4,
      "grad_norm": 0.10618308186531067,
      "learning_rate": 4.7737118485753564e-05,
      "loss": 0.0251,
      "step": 900
    },
    {
      "epoch": 14.72,
      "grad_norm": 0.866423487663269,
      "learning_rate": 4.75536666309653e-05,
      "loss": 0.0515,
      "step": 920
    },
    {
      "epoch": 15.04,
      "grad_norm": 0.5916399955749512,
      "learning_rate": 4.73634466114326e-05,
      "loss": 0.0536,
      "step": 940
    },
    {
      "epoch": 15.36,
      "grad_norm": 0.1653570532798767,
      "learning_rate": 4.7166515510575676e-05,
      "loss": 0.0392,
      "step": 960
    },
    {
      "epoch": 15.68,
      "grad_norm": 0.027391331270337105,
      "learning_rate": 4.696293242575356e-05,
      "loss": 0.0369,
      "step": 980
    },
    {
      "epoch": 16.0,
      "grad_norm": 2.17256760597229,
      "learning_rate": 4.675275845052942e-05,
      "loss": 0.0651,
      "step": 1000
    },
    {
      "epoch": 16.32,
      "grad_norm": 0.8612786531448364,
      "learning_rate": 4.6536056656336947e-05,
      "loss": 0.037,
      "step": 1020
    },
    {
      "epoch": 16.64,
      "grad_norm": 4.489969253540039,
      "learning_rate": 4.631289207355313e-05,
      "loss": 0.0272,
      "step": 1040
    },
    {
      "epoch": 16.96,
      "grad_norm": 0.4311043918132782,
      "learning_rate": 4.6083331671983185e-05,
      "loss": 0.0507,
      "step": 1060
    },
    {
      "epoch": 17.28,
      "grad_norm": 0.4327545762062073,
      "learning_rate": 4.584744434076352e-05,
      "loss": 0.0274,
      "step": 1080
    },
    {
      "epoch": 17.6,
      "grad_norm": 0.12099918723106384,
      "learning_rate": 4.560530086768863e-05,
      "loss": 0.0565,
      "step": 1100
    },
    {
      "epoch": 17.92,
      "grad_norm": 0.103216253221035,
      "learning_rate": 4.535697391796832e-05,
      "loss": 0.0425,
      "step": 1120
    },
    {
      "epoch": 18.24,
      "grad_norm": 0.419209748506546,
      "learning_rate": 4.510253801242147e-05,
      "loss": 0.0273,
      "step": 1140
    },
    {
      "epoch": 18.56,
      "grad_norm": 1.3193784952163696,
      "learning_rate": 4.4842069505112984e-05,
      "loss": 0.0438,
      "step": 1160
    },
    {
      "epoch": 18.88,
      "grad_norm": 1.5185387134552002,
      "learning_rate": 4.457564656044056e-05,
      "loss": 0.0544,
      "step": 1180
    },
    {
      "epoch": 19.2,
      "grad_norm": 0.4024270474910736,
      "learning_rate": 4.430334912967824e-05,
      "loss": 0.0283,
      "step": 1200
    },
    {
      "epoch": 19.52,
      "grad_norm": 0.16141988337039948,
      "learning_rate": 4.402525892698367e-05,
      "loss": 0.0393,
      "step": 1220
    },
    {
      "epoch": 19.84,
      "grad_norm": 0.07228437811136246,
      "learning_rate": 4.374145940487641e-05,
      "loss": 0.0249,
      "step": 1240
    },
    {
      "epoch": 20.16,
      "grad_norm": 0.7919737696647644,
      "learning_rate": 4.345203572919454e-05,
      "loss": 0.0293,
      "step": 1260
    },
    {
      "epoch": 20.48,
      "grad_norm": 0.26585039496421814,
      "learning_rate": 4.315707475353706e-05,
      "loss": 0.0287,
      "step": 1280
    },
    {
      "epoch": 20.8,
      "grad_norm": 0.5761149525642395,
      "learning_rate": 4.285666499319992e-05,
      "loss": 0.0521,
      "step": 1300
    },
    {
      "epoch": 21.12,
      "grad_norm": 0.018601374700665474,
      "learning_rate": 4.25508965986133e-05,
      "loss": 0.0285,
      "step": 1320
    },
    {
      "epoch": 21.44,
      "grad_norm": 0.00528874434530735,
      "learning_rate": 4.2239861328288214e-05,
      "loss": 0.0346,
      "step": 1340
    },
    {
      "epoch": 21.76,
      "grad_norm": 0.3073647618293762,
      "learning_rate": 4.1923652521280585e-05,
      "loss": 0.022,
      "step": 1360
    },
    {
      "epoch": 22.08,
      "grad_norm": 0.42911043763160706,
      "learning_rate": 4.160236506918098e-05,
      "loss": 0.0482,
      "step": 1380
    },
    {
      "epoch": 22.4,
      "grad_norm": 0.6457176804542542,
      "learning_rate": 4.127609538763842e-05,
      "loss": 0.019,
      "step": 1400
    },
    {
      "epoch": 22.72,
      "grad_norm": 2.3716557025909424,
      "learning_rate": 4.094494138742685e-05,
      "loss": 0.0312,
      "step": 1420
    },
    {
      "epoch": 23.04,
      "grad_norm": 0.01667410507798195,
      "learning_rate": 4.0609002445063036e-05,
      "loss": 0.0377,
      "step": 1440
    },
    {
      "epoch": 23.36,
      "grad_norm": 0.6381007432937622,
      "learning_rate": 4.02683793729844e-05,
      "loss": 0.0307,
      "step": 1460
    },
    {
      "epoch": 23.68,
      "grad_norm": 0.42919328808784485,
      "learning_rate": 3.9923174389296085e-05,
      "loss": 0.0419,
      "step": 1480
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.01456019002944231,
      "learning_rate": 3.957349108709623e-05,
      "loss": 0.0223,
      "step": 1500
    },
    {
      "epoch": 24.32,
      "grad_norm": 0.31073492765426636,
      "learning_rate": 3.921943440338849e-05,
      "loss": 0.0209,
      "step": 1520
    },
    {
      "epoch": 24.64,
      "grad_norm": 0.38279736042022705,
      "learning_rate": 3.886111058759132e-05,
      "loss": 0.0491,
      "step": 1540
    },
    {
      "epoch": 24.96,
      "grad_norm": 0.30651962757110596,
      "learning_rate": 3.849862716965352e-05,
      "loss": 0.0298,
      "step": 1560
    },
    {
      "epoch": 25.28,
      "grad_norm": 0.4538489580154419,
      "learning_rate": 3.813209292778527e-05,
      "loss": 0.0319,
      "step": 1580
    },
    {
      "epoch": 25.6,
      "grad_norm": 0.11643072962760925,
      "learning_rate": 3.776161785581481e-05,
      "loss": 0.0302,
      "step": 1600
    },
    {
      "epoch": 25.92,
      "grad_norm": 0.008515519089996815,
      "learning_rate": 3.738731313018019e-05,
      "loss": 0.04,
      "step": 1620
    },
    {
      "epoch": 26.24,
      "grad_norm": 0.002214708598330617,
      "learning_rate": 3.700929107656614e-05,
      "loss": 0.0354,
      "step": 1640
    },
    {
      "epoch": 26.56,
      "grad_norm": 0.02200801856815815,
      "learning_rate": 3.662766513619611e-05,
      "loss": 0.0186,
      "step": 1660
    },
    {
      "epoch": 26.88,
      "grad_norm": 0.1882447600364685,
      "learning_rate": 3.62425498317895e-05,
      "loss": 0.022,
      "step": 1680
    },
    {
      "epoch": 27.2,
      "grad_norm": 0.004948125686496496,
      "learning_rate": 3.585406073319439e-05,
      "loss": 0.015,
      "step": 1700
    },
    {
      "epoch": 27.52,
      "grad_norm": 0.3387264013290405,
      "learning_rate": 3.546231442270596e-05,
      "loss": 0.0381,
      "step": 1720
    },
    {
      "epoch": 27.84,
      "grad_norm": 0.09048642963171005,
      "learning_rate": 3.506742846008116e-05,
      "loss": 0.0277,
      "step": 1740
    },
    {
      "epoch": 28.16,
      "grad_norm": 0.6405784487724304,
      "learning_rate": 3.4669521347259996e-05,
      "loss": 0.0423,
      "step": 1760
    },
    {
      "epoch": 28.48,
      "grad_norm": 0.16012047231197357,
      "learning_rate": 3.426871249280414e-05,
      "loss": 0.0115,
      "step": 1780
    },
    {
      "epoch": 28.8,
      "grad_norm": 0.3279825448989868,
      "learning_rate": 3.386512217606339e-05,
      "loss": 0.0275,
      "step": 1800
    },
    {
      "epoch": 29.12,
      "grad_norm": 0.005494344513863325,
      "learning_rate": 3.345887151108087e-05,
      "loss": 0.0309,
      "step": 1820
    },
    {
      "epoch": 29.44,
      "grad_norm": 0.0037028896622359753,
      "learning_rate": 3.305008241024774e-05,
      "loss": 0.0294,
      "step": 1840
    },
    {
      "epoch": 29.76,
      "grad_norm": 0.003084386931732297,
      "learning_rate": 3.2638877547718264e-05,
      "loss": 0.0213,
      "step": 1860
    },
    {
      "epoch": 30.08,
      "grad_norm": 0.0017954249633476138,
      "learning_rate": 3.222538032259643e-05,
      "loss": 0.0326,
      "step": 1880
    },
    {
      "epoch": 30.4,
      "grad_norm": 0.26840922236442566,
      "learning_rate": 3.1809714821904834e-05,
      "loss": 0.0249,
      "step": 1900
    },
    {
      "epoch": 30.72,
      "grad_norm": 0.7214370965957642,
      "learning_rate": 3.1392005783347244e-05,
      "loss": 0.0115,
      "step": 1920
    },
    {
      "epoch": 31.04,
      "grad_norm": 0.1613769233226776,
      "learning_rate": 3.0972378557875884e-05,
      "loss": 0.0322,
      "step": 1940
    },
    {
      "epoch": 31.36,
      "grad_norm": 0.18066717684268951,
      "learning_rate": 3.055095907207465e-05,
      "loss": 0.0316,
      "step": 1960
    },
    {
      "epoch": 31.68,
      "grad_norm": 0.24756371974945068,
      "learning_rate": 3.0127873790369627e-05,
      "loss": 0.0248,
      "step": 1980
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.08604203909635544,
      "learning_rate": 2.9703249677078156e-05,
      "loss": 0.0234,
      "step": 2000
    },
    {
      "epoch": 32.32,
      "grad_norm": 0.0022385423071682453,
      "learning_rate": 2.9277214158307937e-05,
      "loss": 0.0277,
      "step": 2020
    },
    {
      "epoch": 32.64,
      "grad_norm": 0.0020592950750142336,
      "learning_rate": 2.8849895083717537e-05,
      "loss": 0.0162,
      "step": 2040
    },
    {
      "epoch": 32.96,
      "grad_norm": 0.20633552968502045,
      "learning_rate": 2.842142068814977e-05,
      "loss": 0.022,
      "step": 2060
    },
    {
      "epoch": 33.28,
      "grad_norm": 0.0019172705942764878,
      "learning_rate": 2.7991919553149497e-05,
      "loss": 0.0278,
      "step": 2080
    },
    {
      "epoch": 33.6,
      "grad_norm": 0.0013098755152896047,
      "learning_rate": 2.756152056837743e-05,
      "loss": 0.0189,
      "step": 2100
    },
    {
      "epoch": 33.92,
      "grad_norm": 0.09349821507930756,
      "learning_rate": 2.7130352892931388e-05,
      "loss": 0.0228,
      "step": 2120
    },
    {
      "epoch": 34.24,
      "grad_norm": 0.0017231553792953491,
      "learning_rate": 2.669854591658679e-05,
      "loss": 0.0319,
      "step": 2140
    },
    {
      "epoch": 34.56,
      "grad_norm": 0.047173839062452316,
      "learning_rate": 2.6266229220967818e-05,
      "loss": 0.0153,
      "step": 2160
    },
    {
      "epoch": 34.88,
      "grad_norm": 0.2877206802368164,
      "learning_rate": 2.5833532540661127e-05,
      "loss": 0.0267,
      "step": 2180
    },
    {
      "epoch": 35.2,
      "grad_norm": 0.25823402404785156,
      "learning_rate": 2.540058572428356e-05,
      "loss": 0.0178,
      "step": 2200
    },
    {
      "epoch": 35.52,
      "grad_norm": 0.23003694415092468,
      "learning_rate": 2.496751869551567e-05,
      "loss": 0.0217,
      "step": 2220
    },
    {
      "epoch": 35.84,
      "grad_norm": 0.23193888366222382,
      "learning_rate": 2.453446141411273e-05,
      "loss": 0.017,
      "step": 2240
    },
    {
      "epoch": 36.16,
      "grad_norm": 0.1941184252500534,
      "learning_rate": 2.4101543836904938e-05,
      "loss": 0.0257,
      "step": 2260
    },
    {
      "epoch": 36.48,
      "grad_norm": 0.012731954455375671,
      "learning_rate": 2.3668895878798424e-05,
      "loss": 0.0237,
      "step": 2280
    },
    {
      "epoch": 36.8,
      "grad_norm": 0.18219026923179626,
      "learning_rate": 2.32366473737889e-05,
      "loss": 0.024,
      "step": 2300
    },
    {
      "epoch": 37.12,
      "grad_norm": 0.256547212600708,
      "learning_rate": 2.2804928035999594e-05,
      "loss": 0.0225,
      "step": 2320
    },
    {
      "epoch": 37.44,
      "grad_norm": 0.45314905047416687,
      "learning_rate": 2.23738674207551e-05,
      "loss": 0.0239,
      "step": 2340
    },
    {
      "epoch": 37.76,
      "grad_norm": 0.3919714689254761,
      "learning_rate": 2.1943594885702984e-05,
      "loss": 0.0235,
      "step": 2360
    },
    {
      "epoch": 38.08,
      "grad_norm": 0.0769328773021698,
      "learning_rate": 2.151423955199456e-05,
      "loss": 0.0286,
      "step": 2380
    },
    {
      "epoch": 38.4,
      "grad_norm": 0.3520802855491638,
      "learning_rate": 2.108593026553681e-05,
      "loss": 0.0323,
      "step": 2400
    },
    {
      "epoch": 38.72,
      "grad_norm": 0.3691672384738922,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.0241,
      "step": 2420
    },
    {
      "epoch": 39.04,
      "grad_norm": 0.001480752951465547,
      "learning_rate": 2.0232963609880093e-05,
      "loss": 0.0158,
      "step": 2440
    },
    {
      "epoch": 39.36,
      "grad_norm": 0.31921085715293884,
      "learning_rate": 1.9808562208765667e-05,
      "loss": 0.0241,
      "step": 2460
    },
    {
      "epoch": 39.68,
      "grad_norm": 0.20936931669712067,
      "learning_rate": 1.938571871425715e-05,
      "loss": 0.0174,
      "step": 2480
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.0011563162552192807,
      "learning_rate": 1.896456001811357e-05,
      "loss": 0.0183,
      "step": 2500
    },
    {
      "epoch": 40.32,
      "grad_norm": 0.19230084121227264,
      "learning_rate": 1.854521250650026e-05,
      "loss": 0.012,
      "step": 2520
    },
    {
      "epoch": 40.64,
      "grad_norm": 0.32013317942619324,
      "learning_rate": 1.8127802022061334e-05,
      "loss": 0.0225,
      "step": 2540
    },
    {
      "epoch": 40.96,
      "grad_norm": 0.11989307403564453,
      "learning_rate": 1.7712453826155457e-05,
      "loss": 0.0391,
      "step": 2560
    },
    {
      "epoch": 41.28,
      "grad_norm": 0.0009496643324382603,
      "learning_rate": 1.72992925612659e-05,
      "loss": 0.0229,
      "step": 2580
    },
    {
      "epoch": 41.6,
      "grad_norm": 0.0012078011641278863,
      "learning_rate": 1.688844221359645e-05,
      "loss": 0.015,
      "step": 2600
    },
    {
      "epoch": 41.92,
      "grad_norm": 0.0012093032710254192,
      "learning_rate": 1.6480026075864163e-05,
      "loss": 0.0287,
      "step": 2620
    },
    {
      "epoch": 42.24,
      "grad_norm": 0.2027181088924408,
      "learning_rate": 1.6074166710300247e-05,
      "loss": 0.0229,
      "step": 2640
    },
    {
      "epoch": 42.56,
      "grad_norm": 0.2977555990219116,
      "learning_rate": 1.567098591187021e-05,
      "loss": 0.0352,
      "step": 2660
    },
    {
      "epoch": 42.88,
      "grad_norm": 0.36129167675971985,
      "learning_rate": 1.5270604671724188e-05,
      "loss": 0.0242,
      "step": 2680
    },
    {
      "epoch": 43.2,
      "grad_norm": 0.001115540275350213,
      "learning_rate": 1.4873143140888538e-05,
      "loss": 0.0165,
      "step": 2700
    },
    {
      "epoch": 43.52,
      "grad_norm": 0.19148553907871246,
      "learning_rate": 1.4478720594209532e-05,
      "loss": 0.0274,
      "step": 2720
    },
    {
      "epoch": 43.84,
      "grad_norm": 0.057757727801799774,
      "learning_rate": 1.4087455394559984e-05,
      "loss": 0.0185,
      "step": 2740
    },
    {
      "epoch": 44.16,
      "grad_norm": 0.0009874219540506601,
      "learning_rate": 1.369946495731954e-05,
      "loss": 0.0509,
      "step": 2760
    },
    {
      "epoch": 44.48,
      "grad_norm": 0.3896861672401428,
      "learning_rate": 1.3314865715139346e-05,
      "loss": 0.027,
      "step": 2780
    },
    {
      "epoch": 44.8,
      "grad_norm": 0.19004037976264954,
      "learning_rate": 1.2933773083001517e-05,
      "loss": 0.0163,
      "step": 2800
    },
    {
      "epoch": 45.12,
      "grad_norm": 0.0009183284710161388,
      "learning_rate": 1.255630142358421e-05,
      "loss": 0.0125,
      "step": 2820
    },
    {
      "epoch": 45.44,
      "grad_norm": 0.1238480657339096,
      "learning_rate": 1.2182564012942193e-05,
      "loss": 0.0327,
      "step": 2840
    },
    {
      "epoch": 45.76,
      "grad_norm": 0.0009572324343025684,
      "learning_rate": 1.1812673006513789e-05,
      "loss": 0.0302,
      "step": 2860
    },
    {
      "epoch": 46.08,
      "grad_norm": 0.0011610776418820024,
      "learning_rate": 1.14467394054639e-05,
      "loss": 0.0209,
      "step": 2880
    },
    {
      "epoch": 46.4,
      "grad_norm": 0.04993343725800514,
      "learning_rate": 1.108487302337353e-05,
      "loss": 0.025,
      "step": 2900
    },
    {
      "epoch": 46.72,
      "grad_norm": 0.1806841343641281,
      "learning_rate": 1.0727182453285647e-05,
      "loss": 0.0284,
      "step": 2920
    },
    {
      "epoch": 47.04,
      "grad_norm": 0.0011777572799474,
      "learning_rate": 1.0373775035117305e-05,
      "loss": 0.0174,
      "step": 2940
    },
    {
      "epoch": 47.36,
      "grad_norm": 0.14497865736484528,
      "learning_rate": 1.002475682344792e-05,
      "loss": 0.0115,
      "step": 2960
    },
    {
      "epoch": 47.68,
      "grad_norm": 0.0014984839363023639,
      "learning_rate": 9.680232555693067e-06,
      "loss": 0.0238,
      "step": 2980
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.07430601865053177,
      "learning_rate": 9.340305620673778e-06,
      "loss": 0.0294,
      "step": 3000
    },
    {
      "epoch": 48.32,
      "grad_norm": 0.07801785320043564,
      "learning_rate": 9.005078027590375e-06,
      "loss": 0.0226,
      "step": 3020
    },
    {
      "epoch": 48.64,
      "grad_norm": 0.0007196432561613619,
      "learning_rate": 8.67465037541038e-06,
      "loss": 0.0196,
      "step": 3040
    },
    {
      "epoch": 48.96,
      "grad_norm": 0.0008374506141990423,
      "learning_rate": 8.34912182267959e-06,
      "loss": 0.0175,
      "step": 3060
    },
    {
      "epoch": 49.28,
      "grad_norm": 0.0010465418454259634,
      "learning_rate": 8.028590057765523e-06,
      "loss": 0.015,
      "step": 3080
    },
    {
      "epoch": 49.6,
      "grad_norm": 0.0007761380402371287,
      "learning_rate": 7.713151269541844e-06,
      "loss": 0.0221,
      "step": 3100
    },
    {
      "epoch": 49.92,
      "grad_norm": 0.0216947291046381,
      "learning_rate": 7.402900118522979e-06,
      "loss": 0.0161,
      "step": 3120
    },
    {
      "epoch": 50.24,
      "grad_norm": 0.26546710729599,
      "learning_rate": 7.097929708457282e-06,
      "loss": 0.0237,
      "step": 3140
    },
    {
      "epoch": 50.56,
      "grad_norm": 0.0011781662469729781,
      "learning_rate": 6.7983315583873695e-06,
      "loss": 0.0172,
      "step": 3160
    },
    {
      "epoch": 50.88,
      "grad_norm": 0.39518535137176514,
      "learning_rate": 6.504195575186009e-06,
      "loss": 0.0198,
      "step": 3180
    },
    {
      "epoch": 51.2,
      "grad_norm": 0.3506232500076294,
      "learning_rate": 6.215610026575916e-06,
      "loss": 0.0227,
      "step": 3200
    },
    {
      "epoch": 51.52,
      "grad_norm": 0.31244903802871704,
      "learning_rate": 5.93266151464123e-06,
      "loss": 0.0156,
      "step": 3220
    },
    {
      "epoch": 51.84,
      "grad_norm": 0.17840787768363953,
      "learning_rate": 5.655434949839061e-06,
      "loss": 0.0268,
      "step": 3240
    },
    {
      "epoch": 52.16,
      "grad_norm": 0.1670505702495575,
      "learning_rate": 5.384013525518541e-06,
      "loss": 0.0209,
      "step": 3260
    },
    {
      "epoch": 52.48,
      "grad_norm": 0.0010594127234071493,
      "learning_rate": 5.118478692955194e-06,
      "loss": 0.0202,
      "step": 3280
    },
    {
      "epoch": 52.8,
      "grad_norm": 0.0015649694250896573,
      "learning_rate": 4.858910136908123e-06,
      "loss": 0.0192,
      "step": 3300
    },
    {
      "epoch": 53.12,
      "grad_norm": 0.19762022793293,
      "learning_rate": 4.605385751707248e-06,
      "loss": 0.0205,
      "step": 3320
    },
    {
      "epoch": 53.44,
      "grad_norm": 0.2010522186756134,
      "learning_rate": 4.357981617877932e-06,
      "loss": 0.0129,
      "step": 3340
    },
    {
      "epoch": 53.76,
      "grad_norm": 0.19793441891670227,
      "learning_rate": 4.116771979309797e-06,
      "loss": 0.0258,
      "step": 3360
    },
    {
      "epoch": 54.08,
      "grad_norm": 0.2605569064617157,
      "learning_rate": 3.881829220976807e-06,
      "loss": 0.0306,
      "step": 3380
    },
    {
      "epoch": 54.4,
      "grad_norm": 0.037421807646751404,
      "learning_rate": 3.653223847215126e-06,
      "loss": 0.0198,
      "step": 3400
    },
    {
      "epoch": 54.72,
      "grad_norm": 0.0007586870342493057,
      "learning_rate": 3.4310244605653797e-06,
      "loss": 0.0257,
      "step": 3420
    },
    {
      "epoch": 55.04,
      "grad_norm": 0.27584579586982727,
      "learning_rate": 3.215297741185572e-06,
      "loss": 0.0125,
      "step": 3440
    },
    {
      "epoch": 55.36,
      "grad_norm": 0.0007228174363262951,
      "learning_rate": 3.0061084268410006e-06,
      "loss": 0.0124,
      "step": 3460
    },
    {
      "epoch": 55.68,
      "grad_norm": 0.04090801998972893,
      "learning_rate": 2.8035192934769362e-06,
      "loss": 0.023,
      "step": 3480
    },
    {
      "epoch": 56.0,
      "grad_norm": 0.3518761694431305,
      "learning_rate": 2.607591136380122e-06,
      "loss": 0.0194,
      "step": 3500
    },
    {
      "epoch": 56.32,
      "grad_norm": 0.06331823766231537,
      "learning_rate": 2.4183827519346308e-06,
      "loss": 0.0162,
      "step": 3520
    },
    {
      "epoch": 56.64,
      "grad_norm": 0.22303640842437744,
      "learning_rate": 2.235950919977545e-06,
      "loss": 0.0337,
      "step": 3540
    },
    {
      "epoch": 56.96,
      "grad_norm": 0.08465743064880371,
      "learning_rate": 2.0603503867598182e-06,
      "loss": 0.0139,
      "step": 3560
    },
    {
      "epoch": 57.28,
      "grad_norm": 0.20135080814361572,
      "learning_rate": 1.8916338485173823e-06,
      "loss": 0.0193,
      "step": 3580
    },
    {
      "epoch": 57.6,
      "grad_norm": 0.0006721566896885633,
      "learning_rate": 1.7298519356574727e-06,
      "loss": 0.0203,
      "step": 3600
    },
    {
      "epoch": 57.92,
      "grad_norm": 0.10799671709537506,
      "learning_rate": 1.5750531975648324e-06,
      "loss": 0.0212,
      "step": 3620
    },
    {
      "epoch": 58.24,
      "grad_norm": 0.0010109569411724806,
      "learning_rate": 1.4272840880324934e-06,
      "loss": 0.0173,
      "step": 3640
    },
    {
      "epoch": 58.56,
      "grad_norm": 0.0008448906592093408,
      "learning_rate": 1.286588951321363e-06,
      "loss": 0.0139,
      "step": 3660
    },
    {
      "epoch": 58.88,
      "grad_norm": 0.0010856656590476632,
      "learning_rate": 1.1530100088528867e-06,
      "loss": 0.0268,
      "step": 3680
    },
    {
      "epoch": 59.2,
      "grad_norm": 0.23958024382591248,
      "learning_rate": 1.0265873465387516e-06,
      "loss": 0.0191,
      "step": 3700
    },
    {
      "epoch": 59.52,
      "grad_norm": 0.20584586262702942,
      "learning_rate": 9.073589027514789e-07,
      "loss": 0.0168,
      "step": 3720
    },
    {
      "epoch": 59.84,
      "grad_norm": 0.031580936163663864,
      "learning_rate": 7.953604569393841e-07,
      "loss": 0.0246,
      "step": 3740
    },
    {
      "epoch": 60.16,
      "grad_norm": 0.14215555787086487,
      "learning_rate": 6.906256188895038e-07,
      "loss": 0.019,
      "step": 3760
    },
    {
      "epoch": 60.48,
      "grad_norm": 0.0012006442993879318,
      "learning_rate": 5.931858186415756e-07,
      "loss": 0.0168,
      "step": 3780
    },
    {
      "epoch": 60.8,
      "grad_norm": 0.0063135698437690735,
      "learning_rate": 5.03070297056149e-07,
      "loss": 0.0197,
      "step": 3800
    },
    {
      "epoch": 61.12,
      "grad_norm": 0.07496818155050278,
      "learning_rate": 4.203060970396383e-07,
      "loss": 0.0207,
      "step": 3820
    },
    {
      "epoch": 61.44,
      "grad_norm": 0.16551247239112854,
      "learning_rate": 3.4491805542899157e-07,
      "loss": 0.0224,
      "step": 3840
    },
    {
      "epoch": 61.76,
      "grad_norm": 0.0008456969517283142,
      "learning_rate": 2.769287955383532e-07,
      "loss": 0.0151,
      "step": 3860
    },
    {
      "epoch": 62.08,
      "grad_norm": 0.0008134017698466778,
      "learning_rate": 2.1635872037001626e-07,
      "loss": 0.0284,
      "step": 3880
    },
    {
      "epoch": 62.4,
      "grad_norm": 0.18878595530986786,
      "learning_rate": 1.6322600649162356e-07,
      "loss": 0.0217,
      "step": 3900
    },
    {
      "epoch": 62.72,
      "grad_norm": 0.0008310906123369932,
      "learning_rate": 1.1754659858156659e-07,
      "loss": 0.0103,
      "step": 3920
    },
    {
      "epoch": 63.04,
      "grad_norm": 0.38621172308921814,
      "learning_rate": 7.933420464410201e-08,
      "loss": 0.0333,
      "step": 3940
    },
    {
      "epoch": 63.36,
      "grad_norm": 0.016794312745332718,
      "learning_rate": 4.860029189569237e-08,
      "loss": 0.0231,
      "step": 3960
    },
    {
      "epoch": 63.68,
      "grad_norm": 0.16253815591335297,
      "learning_rate": 2.535408332381417e-08,
      "loss": 0.0226,
      "step": 3980
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.2387680560350418,
      "learning_rate": 9.60255491919415e-09,
      "loss": 0.0218,
      "step": 4000
    },
    {
      "epoch": 64.32,
      "grad_norm": 0.16293394565582275,
      "learning_rate": 1.3504335823810722e-09,
      "loss": 0.0219,
      "step": 4020
    },
    {
      "epoch": 64.48,
      "step": 4030,
      "total_flos": 2.3325606118844006e+17,
      "train_loss": 0.1495482857003993,
      "train_runtime": 6882.5617,
      "train_samples_per_second": 4.722,
      "train_steps_per_second": 0.586
    }
  ],
  "logging_steps": 20,
  "max_steps": 4030,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 65,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3325606118844006e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}