|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.688, |
|
"eval_steps": 500, |
|
"global_step": 1240, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 16.156723022460938, |
|
"learning_rate": 8.064516129032258e-07, |
|
"loss": 1.7136, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 8.073517799377441, |
|
"learning_rate": 2.4193548387096776e-06, |
|
"loss": 1.2809, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.825227737426758, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.8516, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.599808692932129, |
|
"learning_rate": 5.645161290322582e-06, |
|
"loss": 0.8144, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.7444440126419067, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 0.6716, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.365881323814392, |
|
"learning_rate": 8.870967741935484e-06, |
|
"loss": 0.6604, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 1.2223279476165771, |
|
"learning_rate": 1.0483870967741936e-05, |
|
"loss": 0.5284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 1.4933074712753296, |
|
"learning_rate": 1.1935483870967743e-05, |
|
"loss": 0.5844, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 1.5303218364715576, |
|
"learning_rate": 1.3548387096774194e-05, |
|
"loss": 0.604, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 1.146931529045105, |
|
"learning_rate": 1.5161290322580646e-05, |
|
"loss": 0.6135, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 1.2514358758926392, |
|
"learning_rate": 1.6774193548387098e-05, |
|
"loss": 0.5823, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 1.1219581365585327, |
|
"learning_rate": 1.838709677419355e-05, |
|
"loss": 0.5869, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 1.2538785934448242, |
|
"learning_rate": 2e-05, |
|
"loss": 0.529, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 1.4412481784820557, |
|
"learning_rate": 1.9996038016334953e-05, |
|
"loss": 0.4977, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 1.280132532119751, |
|
"learning_rate": 1.9984155204802715e-05, |
|
"loss": 0.5153, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 1.388814091682434, |
|
"learning_rate": 1.996436098130433e-05, |
|
"loss": 0.5185, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 1.2465498447418213, |
|
"learning_rate": 1.9936671030717832e-05, |
|
"loss": 0.5093, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 6.404733657836914, |
|
"learning_rate": 1.9901107294469595e-05, |
|
"loss": 0.5256, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.016, |
|
"grad_norm": 2.126950263977051, |
|
"learning_rate": 1.985769795314804e-05, |
|
"loss": 0.4664, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.176, |
|
"grad_norm": 1.7593132257461548, |
|
"learning_rate": 1.9806477404173414e-05, |
|
"loss": 0.4251, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.336, |
|
"grad_norm": 1.0036180019378662, |
|
"learning_rate": 1.9747486234541382e-05, |
|
"loss": 0.4481, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.496, |
|
"grad_norm": 1.5059189796447754, |
|
"learning_rate": 1.9680771188662044e-05, |
|
"loss": 0.4185, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.656, |
|
"grad_norm": 2.5380663871765137, |
|
"learning_rate": 1.9606385131319795e-05, |
|
"loss": 0.4336, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.816, |
|
"grad_norm": 1.0066611766815186, |
|
"learning_rate": 1.9524387005783453e-05, |
|
"loss": 0.4488, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.976, |
|
"grad_norm": 2.824939489364624, |
|
"learning_rate": 1.9434841787099804e-05, |
|
"loss": 0.4068, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"grad_norm": 1.2720246315002441, |
|
"learning_rate": 1.9337820430607594e-05, |
|
"loss": 0.3575, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"grad_norm": 3.2268316745758057, |
|
"learning_rate": 1.9233399815712737e-05, |
|
"loss": 0.3638, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"grad_norm": 1.3898473978042603, |
|
"learning_rate": 1.9121662684969337e-05, |
|
"loss": 0.3891, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 2.0828278064727783, |
|
"learning_rate": 1.9002697578514747e-05, |
|
"loss": 0.3652, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"grad_norm": 1.149625539779663, |
|
"learning_rate": 1.8876598763910666e-05, |
|
"loss": 0.4091, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"grad_norm": 1.3654099702835083, |
|
"learning_rate": 1.8743466161445823e-05, |
|
"loss": 0.3508, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 1.3550388813018799, |
|
"learning_rate": 1.8603405264959467e-05, |
|
"loss": 0.3232, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 1.4081593751907349, |
|
"learning_rate": 1.8456527058248398e-05, |
|
"loss": 0.2735, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 2.1102137565612793, |
|
"learning_rate": 1.8302947927123767e-05, |
|
"loss": 0.3039, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 2.0371267795562744, |
|
"learning_rate": 1.8142789567187327e-05, |
|
"loss": 0.3298, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.2168771028518677, |
|
"learning_rate": 1.7976178887400263e-05, |
|
"loss": 0.3153, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 1.5159831047058105, |
|
"learning_rate": 1.780324790952092e-05, |
|
"loss": 0.3051, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 6.032, |
|
"grad_norm": 1.4497977495193481, |
|
"learning_rate": 1.7624133663491204e-05, |
|
"loss": 0.2808, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.192, |
|
"grad_norm": 1.5660399198532104, |
|
"learning_rate": 1.7438978078854512e-05, |
|
"loss": 0.2225, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.352, |
|
"grad_norm": 2.0142195224761963, |
|
"learning_rate": 1.72479278722912e-05, |
|
"loss": 0.2526, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.5120000000000005, |
|
"grad_norm": 1.3689491748809814, |
|
"learning_rate": 1.7051134431360795e-05, |
|
"loss": 0.2372, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.672, |
|
"grad_norm": 2.383648633956909, |
|
"learning_rate": 1.6848753694542966e-05, |
|
"loss": 0.2611, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.832, |
|
"grad_norm": 2.9119460582733154, |
|
"learning_rate": 1.6640946027672395e-05, |
|
"loss": 0.2292, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.992, |
|
"grad_norm": 1.594862461090088, |
|
"learning_rate": 1.6427876096865394e-05, |
|
"loss": 0.2484, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.144, |
|
"grad_norm": 2.313305616378784, |
|
"learning_rate": 1.620971273803905e-05, |
|
"loss": 0.1605, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.304, |
|
"grad_norm": 1.8747406005859375, |
|
"learning_rate": 1.598662882312615e-05, |
|
"loss": 0.1881, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.464, |
|
"grad_norm": 2.306143045425415, |
|
"learning_rate": 1.5758801123092066e-05, |
|
"loss": 0.1805, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.624, |
|
"grad_norm": 2.0819358825683594, |
|
"learning_rate": 1.552641016786199e-05, |
|
"loss": 0.1365, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.784, |
|
"grad_norm": 1.9174208641052246, |
|
"learning_rate": 1.5289640103269626e-05, |
|
"loss": 0.1521, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.944, |
|
"grad_norm": 1.3912276029586792, |
|
"learning_rate": 1.5048678545140634e-05, |
|
"loss": 0.1691, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 8.096, |
|
"grad_norm": 1.8673287630081177, |
|
"learning_rate": 1.4803716430626455e-05, |
|
"loss": 0.1194, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.256, |
|
"grad_norm": 4.073314189910889, |
|
"learning_rate": 1.455494786690634e-05, |
|
"loss": 0.1136, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.416, |
|
"grad_norm": 1.3980969190597534, |
|
"learning_rate": 1.4302569977377462e-05, |
|
"loss": 0.0987, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.576, |
|
"grad_norm": 1.0765067338943481, |
|
"learning_rate": 1.404678274545496e-05, |
|
"loss": 0.125, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.736, |
|
"grad_norm": 1.532691240310669, |
|
"learning_rate": 1.3787788856105762e-05, |
|
"loss": 0.0986, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.896, |
|
"grad_norm": 1.394974708557129, |
|
"learning_rate": 1.3525793535241654e-05, |
|
"loss": 0.1131, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 9.048, |
|
"grad_norm": 1.7207937240600586, |
|
"learning_rate": 1.3261004387098951e-05, |
|
"loss": 0.0854, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 9.208, |
|
"grad_norm": 1.8617466688156128, |
|
"learning_rate": 1.2993631229733584e-05, |
|
"loss": 0.0808, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.368, |
|
"grad_norm": 1.5372612476348877, |
|
"learning_rate": 1.2723885928761934e-05, |
|
"loss": 0.0694, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.528, |
|
"grad_norm": 0.963295042514801, |
|
"learning_rate": 1.2451982229479245e-05, |
|
"loss": 0.0792, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.688, |
|
"grad_norm": 1.102954387664795, |
|
"learning_rate": 1.2178135587488515e-05, |
|
"loss": 0.0704, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.848, |
|
"grad_norm": 0.988452672958374, |
|
"learning_rate": 1.1902562997974211e-05, |
|
"loss": 0.0904, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.3999573886394501, |
|
"learning_rate": 1.1625482823755965e-05, |
|
"loss": 0.0537, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 1.1528918743133545, |
|
"learning_rate": 1.1347114622258613e-05, |
|
"loss": 0.0433, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"grad_norm": 0.7834718823432922, |
|
"learning_rate": 1.106767897153559e-05, |
|
"loss": 0.0415, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"grad_norm": 1.2862722873687744, |
|
"learning_rate": 1.078739729548362e-05, |
|
"loss": 0.0537, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"grad_norm": 1.1977423429489136, |
|
"learning_rate": 1.0506491688387128e-05, |
|
"loss": 0.0561, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 2.4095022678375244, |
|
"learning_rate": 1.0225184738931461e-05, |
|
"loss": 0.0434, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 10.96, |
|
"grad_norm": 0.9543105959892273, |
|
"learning_rate": 9.943699353824344e-06, |
|
"loss": 0.0554, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 11.112, |
|
"grad_norm": 0.7812919616699219, |
|
"learning_rate": 9.66225858116532e-06, |
|
"loss": 0.0356, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 11.272, |
|
"grad_norm": 0.16588281095027924, |
|
"learning_rate": 9.381085433703183e-06, |
|
"loss": 0.0304, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 11.432, |
|
"grad_norm": 0.9616058468818665, |
|
"learning_rate": 9.10040271212139e-06, |
|
"loss": 0.0326, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 11.592, |
|
"grad_norm": 0.7438204288482666, |
|
"learning_rate": 8.820432828491542e-06, |
|
"loss": 0.0387, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 11.752, |
|
"grad_norm": 0.6004329919815063, |
|
"learning_rate": 8.541397630034757e-06, |
|
"loss": 0.0344, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 11.912, |
|
"grad_norm": 2.2135913372039795, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.0192, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 12.064, |
|
"grad_norm": 0.8558197021484375, |
|
"learning_rate": 7.987014799113398e-06, |
|
"loss": 0.0203, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 12.224, |
|
"grad_norm": 2.0103511810302734, |
|
"learning_rate": 7.712106457792883e-06, |
|
"loss": 0.0152, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 12.384, |
|
"grad_norm": 1.6075392961502075, |
|
"learning_rate": 7.439011035840684e-06, |
|
"loss": 0.0168, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 12.544, |
|
"grad_norm": 2.277149200439453, |
|
"learning_rate": 7.16794493317696e-06, |
|
"loss": 0.0174, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 12.704, |
|
"grad_norm": 0.9537448287010193, |
|
"learning_rate": 6.899122941695894e-06, |
|
"loss": 0.0146, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 12.864, |
|
"grad_norm": 0.9526250958442688, |
|
"learning_rate": 6.632758075065288e-06, |
|
"loss": 0.0262, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 13.016, |
|
"grad_norm": 0.5227965712547302, |
|
"learning_rate": 6.369061399935255e-06, |
|
"loss": 0.0205, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 13.176, |
|
"grad_norm": 0.3879546821117401, |
|
"learning_rate": 6.108241868689675e-06, |
|
"loss": 0.0075, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 13.336, |
|
"grad_norm": 0.11712963879108429, |
|
"learning_rate": 5.8505061538730105e-06, |
|
"loss": 0.0089, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 13.496, |
|
"grad_norm": 0.2489776313304901, |
|
"learning_rate": 5.5960584844236565e-06, |
|
"loss": 0.0093, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 13.656, |
|
"grad_norm": 0.24504923820495605, |
|
"learning_rate": 5.345100483843617e-06, |
|
"loss": 0.0086, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 13.816, |
|
"grad_norm": 0.9373164176940918, |
|
"learning_rate": 5.097831010432666e-06, |
|
"loss": 0.0056, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 13.975999999999999, |
|
"grad_norm": 0.16996854543685913, |
|
"learning_rate": 4.854445999713715e-06, |
|
"loss": 0.0044, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 14.128, |
|
"grad_norm": 0.40567436814308167, |
|
"learning_rate": 4.615138309174112e-06, |
|
"loss": 0.003, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 14.288, |
|
"grad_norm": 0.05296005308628082, |
|
"learning_rate": 4.38009756544603e-06, |
|
"loss": 0.002, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 14.448, |
|
"grad_norm": 0.034155845642089844, |
|
"learning_rate": 4.149510014046922e-06, |
|
"loss": 0.0051, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 14.608, |
|
"grad_norm": 0.5557677149772644, |
|
"learning_rate": 3.923558371799194e-06, |
|
"loss": 0.0035, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 14.768, |
|
"grad_norm": 0.2064237892627716, |
|
"learning_rate": 3.7024216820459757e-06, |
|
"loss": 0.0016, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 14.928, |
|
"grad_norm": 0.2826680541038513, |
|
"learning_rate": 3.48627517277778e-06, |
|
"loss": 0.0014, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 15.08, |
|
"grad_norm": 0.06162280961871147, |
|
"learning_rate": 3.275290117782397e-06, |
|
"loss": 0.0017, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 15.24, |
|
"grad_norm": 0.08669278025627136, |
|
"learning_rate": 3.0696337009281263e-06, |
|
"loss": 0.0009, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 15.4, |
|
"grad_norm": 0.04744670167565346, |
|
"learning_rate": 2.869468883687798e-06, |
|
"loss": 0.0017, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 15.56, |
|
"grad_norm": 0.04455321654677391, |
|
"learning_rate": 2.6749542760086613e-06, |
|
"loss": 0.0012, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 15.72, |
|
"grad_norm": 0.03492380306124687, |
|
"learning_rate": 2.4862440106303664e-06, |
|
"loss": 0.0009, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 15.88, |
|
"grad_norm": 0.04152894765138626, |
|
"learning_rate": 2.303487620950677e-06, |
|
"loss": 0.0015, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 16.032, |
|
"grad_norm": 0.058656930923461914, |
|
"learning_rate": 2.126829922535718e-06, |
|
"loss": 0.0011, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 16.192, |
|
"grad_norm": 0.023427117615938187, |
|
"learning_rate": 1.9564108983685758e-06, |
|
"loss": 0.0008, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 16.352, |
|
"grad_norm": 0.02621988020837307, |
|
"learning_rate": 1.7923655879272395e-06, |
|
"loss": 0.0011, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 16.512, |
|
"grad_norm": 0.02891433797776699, |
|
"learning_rate": 1.634823980179766e-06, |
|
"loss": 0.001, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 16.672, |
|
"grad_norm": 0.044328656047582626, |
|
"learning_rate": 1.483910910581452e-06, |
|
"loss": 0.0012, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 16.832, |
|
"grad_norm": 0.05566035583615303, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 0.0008, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 16.992, |
|
"grad_norm": 0.030019577592611313, |
|
"learning_rate": 1.2024433707364002e-06, |
|
"loss": 0.0013, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 17.144, |
|
"grad_norm": 0.07299891859292984, |
|
"learning_rate": 1.0721119344486841e-06, |
|
"loss": 0.0013, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 17.304, |
|
"grad_norm": 0.0282142274081707, |
|
"learning_rate": 9.488549274967873e-07, |
|
"loss": 0.0008, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 17.464, |
|
"grad_norm": 0.028321148827672005, |
|
"learning_rate": 8.327700183303433e-07, |
|
"loss": 0.0009, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 17.624, |
|
"grad_norm": 0.03647984564304352, |
|
"learning_rate": 7.239491922521247e-07, |
|
"loss": 0.0011, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 17.784, |
|
"grad_norm": 0.029092855751514435, |
|
"learning_rate": 6.22478678529197e-07, |
|
"loss": 0.0008, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 17.944, |
|
"grad_norm": 0.028766291216015816, |
|
"learning_rate": 5.284388820651331e-07, |
|
"loss": 0.001, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 18.096, |
|
"grad_norm": 0.03702507168054581, |
|
"learning_rate": 4.41904319687424e-07, |
|
"loss": 0.0011, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 18.256, |
|
"grad_norm": 0.025985565036535263, |
|
"learning_rate": 3.629435611005916e-07, |
|
"loss": 0.0008, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 18.416, |
|
"grad_norm": 0.0245125163346529, |
|
"learning_rate": 2.916191745517749e-07, |
|
"loss": 0.001, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 18.576, |
|
"grad_norm": 0.01987411454319954, |
|
"learning_rate": 2.2798767725185856e-07, |
|
"loss": 0.0007, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 18.736, |
|
"grad_norm": 0.06840561330318451, |
|
"learning_rate": 1.7209949059142084e-07, |
|
"loss": 0.001, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 18.896, |
|
"grad_norm": 0.018267642706632614, |
|
"learning_rate": 1.2399890018698345e-07, |
|
"loss": 0.001, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 19.048, |
|
"grad_norm": 0.02801842987537384, |
|
"learning_rate": 8.372402078924092e-08, |
|
"loss": 0.0008, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 19.208, |
|
"grad_norm": 0.030942991375923157, |
|
"learning_rate": 5.1306766081048456e-08, |
|
"loss": 0.0008, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 19.368, |
|
"grad_norm": 0.030538473278284073, |
|
"learning_rate": 2.6772823389131787e-08, |
|
"loss": 0.0009, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 19.528, |
|
"grad_norm": 0.0364975668489933, |
|
"learning_rate": 1.0141633329525669e-08, |
|
"loss": 0.0012, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 19.688, |
|
"grad_norm": 0.029744159430265427, |
|
"learning_rate": 1.4263744029019422e-09, |
|
"loss": 0.0008, |
|
"step": 1240 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1240, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.817147240873984e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|