{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.998916106655105,
  "eval_steps": 500,
  "global_step": 2306,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008671146759158899,
      "grad_norm": 5.864805612767453e+19,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 1.5218,
      "step": 1
    },
    {
      "epoch": 0.004335573379579449,
      "grad_norm": 0.5226513429859231,
      "learning_rate": 6.493506493506493e-06,
      "loss": 1.36,
      "step": 5
    },
    {
      "epoch": 0.008671146759158898,
      "grad_norm": 0.7095719390962212,
      "learning_rate": 1.2987012987012986e-05,
      "loss": 1.342,
      "step": 10
    },
    {
      "epoch": 0.013006720138738348,
      "grad_norm": 0.429950690175044,
      "learning_rate": 1.9480519480519476e-05,
      "loss": 1.3738,
      "step": 15
    },
    {
      "epoch": 0.017342293518317797,
      "grad_norm": 0.25369452359038985,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 1.2215,
      "step": 20
    },
    {
      "epoch": 0.021677866897897247,
      "grad_norm": 14.366536861443981,
      "learning_rate": 3.246753246753247e-05,
      "loss": 1.2357,
      "step": 25
    },
    {
      "epoch": 0.026013440277476697,
      "grad_norm": 0.2687159676086448,
      "learning_rate": 3.896103896103895e-05,
      "loss": 1.1712,
      "step": 30
    },
    {
      "epoch": 0.030349013657056147,
      "grad_norm": 0.15608265987863726,
      "learning_rate": 4.545454545454545e-05,
      "loss": 1.1152,
      "step": 35
    },
    {
      "epoch": 0.03468458703663559,
      "grad_norm": 0.11635461961055855,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 1.1033,
      "step": 40
    },
    {
      "epoch": 0.03902016041621505,
      "grad_norm": 0.11449246594254398,
      "learning_rate": 5.8441558441558436e-05,
      "loss": 1.0896,
      "step": 45
    },
    {
      "epoch": 0.04335573379579449,
      "grad_norm": 0.11032771125077512,
      "learning_rate": 6.493506493506494e-05,
      "loss": 1.0795,
      "step": 50
    },
    {
      "epoch": 0.04769130717537394,
      "grad_norm": 0.10732618594326016,
      "learning_rate": 7.142857142857142e-05,
      "loss": 1.1107,
      "step": 55
    },
    {
      "epoch": 0.05202688055495339,
      "grad_norm": 0.11532043672084052,
      "learning_rate": 7.79220779220779e-05,
      "loss": 1.0941,
      "step": 60
    },
    {
      "epoch": 0.05636245393453284,
      "grad_norm": 0.10484657398976702,
      "learning_rate": 8.441558441558442e-05,
      "loss": 1.0636,
      "step": 65
    },
    {
      "epoch": 0.06069802731411229,
      "grad_norm": 0.11736262481626532,
      "learning_rate": 9.09090909090909e-05,
      "loss": 1.0939,
      "step": 70
    },
    {
      "epoch": 0.06503360069369174,
      "grad_norm": 0.14275729093576736,
      "learning_rate": 9.740259740259739e-05,
      "loss": 1.0876,
      "step": 75
    },
    {
      "epoch": 0.06936917407327119,
      "grad_norm": 0.15749002462194958,
      "learning_rate": 0.00010389610389610389,
      "loss": 1.073,
      "step": 80
    },
    {
      "epoch": 0.07370474745285063,
      "grad_norm": 0.19946818080973713,
      "learning_rate": 0.00011038961038961037,
      "loss": 1.0935,
      "step": 85
    },
    {
      "epoch": 0.0780403208324301,
      "grad_norm": 0.5992707412202727,
      "learning_rate": 0.00011688311688311687,
      "loss": 1.1826,
      "step": 90
    },
    {
      "epoch": 0.08237589421200954,
      "grad_norm": 0.19153297117483123,
      "learning_rate": 0.00012337662337662337,
      "loss": 1.1079,
      "step": 95
    },
    {
      "epoch": 0.08671146759158899,
      "grad_norm": 0.23132363172618897,
      "learning_rate": 0.00012987012987012987,
      "loss": 1.1509,
      "step": 100
    },
    {
      "epoch": 0.09104704097116843,
      "grad_norm": 13.74435400048776,
      "learning_rate": 0.00013636363636363634,
      "loss": 1.271,
      "step": 105
    },
    {
      "epoch": 0.09538261435074788,
      "grad_norm": 0.5226174167068346,
      "learning_rate": 0.00014285714285714284,
      "loss": 1.1902,
      "step": 110
    },
    {
      "epoch": 0.09971818773032734,
      "grad_norm": 0.3629740083051462,
      "learning_rate": 0.00014935064935064934,
      "loss": 1.1643,
      "step": 115
    },
    {
      "epoch": 0.10405376110990679,
      "grad_norm": 0.22433655763478755,
      "learning_rate": 0.0001558441558441558,
      "loss": 1.155,
      "step": 120
    },
    {
      "epoch": 0.10838933448948623,
      "grad_norm": 0.1335411736809728,
      "learning_rate": 0.0001623376623376623,
      "loss": 1.1243,
      "step": 125
    },
    {
      "epoch": 0.11272490786906568,
      "grad_norm": 0.20752640927016786,
      "learning_rate": 0.00016883116883116884,
      "loss": 1.1104,
      "step": 130
    },
    {
      "epoch": 0.11706048124864514,
      "grad_norm": 0.12168629703712475,
      "learning_rate": 0.0001753246753246753,
      "loss": 1.114,
      "step": 135
    },
    {
      "epoch": 0.12139605462822459,
      "grad_norm": 0.26232391688421475,
      "learning_rate": 0.0001818181818181818,
      "loss": 1.1204,
      "step": 140
    },
    {
      "epoch": 0.12573162800780402,
      "grad_norm": 0.16457276786988467,
      "learning_rate": 0.00018831168831168828,
      "loss": 1.1442,
      "step": 145
    },
    {
      "epoch": 0.13006720138738348,
      "grad_norm": 0.15538288562486247,
      "learning_rate": 0.00019480519480519478,
      "loss": 1.1111,
      "step": 150
    },
    {
      "epoch": 0.13440277476696294,
      "grad_norm": 0.201009562048921,
      "learning_rate": 0.0002012987012987013,
      "loss": 1.1284,
      "step": 155
    },
    {
      "epoch": 0.13873834814654237,
      "grad_norm": 0.10830435883952598,
      "learning_rate": 0.00020779220779220778,
      "loss": 1.13,
      "step": 160
    },
    {
      "epoch": 0.14307392152612183,
      "grad_norm": 0.11463870554451829,
      "learning_rate": 0.00021428571428571427,
      "loss": 1.1146,
      "step": 165
    },
    {
      "epoch": 0.14740949490570127,
      "grad_norm": 0.12316133988511195,
      "learning_rate": 0.00022077922077922075,
      "loss": 1.1238,
      "step": 170
    },
    {
      "epoch": 0.15174506828528073,
      "grad_norm": 0.19826475153806197,
      "learning_rate": 0.00022727272727272725,
      "loss": 1.2731,
      "step": 175
    },
    {
      "epoch": 0.1560806416648602,
      "grad_norm": 0.928027899524197,
      "learning_rate": 0.00023376623376623374,
      "loss": 1.1896,
      "step": 180
    },
    {
      "epoch": 0.16041621504443962,
      "grad_norm": 67.14385763593648,
      "learning_rate": 0.00024025974025974024,
      "loss": 3.7592,
      "step": 185
    },
    {
      "epoch": 0.16475178842401908,
      "grad_norm": 172.25626988972317,
      "learning_rate": 0.00024675324675324674,
      "loss": 6.1514,
      "step": 190
    },
    {
      "epoch": 0.1690873618035985,
      "grad_norm": 8.384860694083399,
      "learning_rate": 0.0002532467532467532,
      "loss": 8.2367,
      "step": 195
    },
    {
      "epoch": 0.17342293518317797,
      "grad_norm": 15.817512331978708,
      "learning_rate": 0.00025974025974025974,
      "loss": 10.8846,
      "step": 200
    },
    {
      "epoch": 0.17775850856275743,
      "grad_norm": 3.4669948773987103,
      "learning_rate": 0.0002662337662337662,
      "loss": 9.8149,
      "step": 205
    },
    {
      "epoch": 0.18209408194233687,
      "grad_norm": 6.211867615520024,
      "learning_rate": 0.0002727272727272727,
      "loss": 10.4091,
      "step": 210
    },
    {
      "epoch": 0.18642965532191633,
      "grad_norm": 8.673657491918139,
      "learning_rate": 0.0002792207792207792,
      "loss": 10.3271,
      "step": 215
    },
    {
      "epoch": 0.19076522870149576,
      "grad_norm": 2.503560404486804,
      "learning_rate": 0.0002857142857142857,
      "loss": 12.6414,
      "step": 220
    },
    {
      "epoch": 0.19510080208107522,
      "grad_norm": 3.8638747380263703,
      "learning_rate": 0.00029220779220779215,
      "loss": 8.4926,
      "step": 225
    },
    {
      "epoch": 0.19943637546065468,
      "grad_norm": 1.0101210010336394,
      "learning_rate": 0.0002987012987012987,
      "loss": 7.6948,
      "step": 230
    },
    {
      "epoch": 0.2037719488402341,
      "grad_norm": 0.8467011565643021,
      "learning_rate": 0.0002999972492985145,
      "loss": 7.5188,
      "step": 235
    },
    {
      "epoch": 0.20810752221981357,
      "grad_norm": 0.5245296390809255,
      "learning_rate": 0.0002999860747466326,
      "loss": 7.4533,
      "step": 240
    },
    {
      "epoch": 0.212443095599393,
      "grad_norm": 0.42147554363918227,
      "learning_rate": 0.0002999663050653897,
      "loss": 7.4204,
      "step": 245
    },
    {
      "epoch": 0.21677866897897247,
      "grad_norm": 0.38921879090695144,
      "learning_rate": 0.00029993794138771085,
      "loss": 7.4012,
      "step": 250
    },
    {
      "epoch": 0.22111424235855193,
      "grad_norm": 0.2418087757057217,
      "learning_rate": 0.0002999009853390101,
      "loss": 7.3999,
      "step": 255
    },
    {
      "epoch": 0.22544981573813136,
      "grad_norm": 0.35618630058079676,
      "learning_rate": 0.0002998554390370975,
      "loss": 7.3883,
      "step": 260
    },
    {
      "epoch": 0.22978538911771082,
      "grad_norm": 0.18510696993654735,
      "learning_rate": 0.0002998013050920577,
      "loss": 7.3686,
      "step": 265
    },
    {
      "epoch": 0.23412096249729028,
      "grad_norm": 0.21788844504555988,
      "learning_rate": 0.0002997385866061005,
      "loss": 7.3719,
      "step": 270
    },
    {
      "epoch": 0.2384565358768697,
      "grad_norm": 0.2255775965595111,
      "learning_rate": 0.00029966728717338294,
      "loss": 7.3634,
      "step": 275
    },
    {
      "epoch": 0.24279210925644917,
      "grad_norm": 0.21067089388376176,
      "learning_rate": 0.0002995874108798032,
      "loss": 7.3456,
      "step": 280
    },
    {
      "epoch": 0.2471276826360286,
      "grad_norm": 0.1534328867763096,
      "learning_rate": 0.00029949896230276675,
      "loss": 7.3761,
      "step": 285
    },
    {
      "epoch": 0.25146325601560804,
      "grad_norm": 0.12732733473622673,
      "learning_rate": 0.000299401946510924,
      "loss": 7.3546,
      "step": 290
    },
    {
      "epoch": 0.2557988293951875,
      "grad_norm": 0.13550982122013763,
      "learning_rate": 0.0002992963690638794,
      "loss": 7.3462,
      "step": 295
    },
    {
      "epoch": 0.26013440277476696,
      "grad_norm": 0.17601177463870957,
      "learning_rate": 0.0002991822360118736,
      "loss": 7.3682,
      "step": 300
    },
    {
      "epoch": 0.2644699761543464,
      "grad_norm": 0.14421180444911577,
      "learning_rate": 0.00029905955389543604,
      "loss": 7.3557,
      "step": 305
    },
    {
      "epoch": 0.2688055495339259,
      "grad_norm": 0.185975203761939,
      "learning_rate": 0.00029892832974501044,
      "loss": 7.356,
      "step": 310
    },
    {
      "epoch": 0.2731411229135053,
      "grad_norm": 0.14867246915893478,
      "learning_rate": 0.00029878857108055185,
      "loss": 7.3347,
      "step": 315
    },
    {
      "epoch": 0.27747669629308475,
      "grad_norm": 0.1334709517444074,
      "learning_rate": 0.00029864028591109593,
      "loss": 7.375,
      "step": 320
    },
    {
      "epoch": 0.2818122696726642,
      "grad_norm": 0.1642667927001403,
      "learning_rate": 0.00029848348273429947,
      "loss": 7.3474,
      "step": 325
    },
    {
      "epoch": 0.28614784305224367,
      "grad_norm": 0.17688345956299092,
      "learning_rate": 0.0002983181705359541,
      "loss": 7.3567,
      "step": 330
    },
    {
      "epoch": 0.2904834164318231,
      "grad_norm": 0.16088931478744917,
      "learning_rate": 0.00029814435878947076,
      "loss": 7.3632,
      "step": 335
    },
    {
      "epoch": 0.29481898981140253,
      "grad_norm": 0.19798325540549802,
      "learning_rate": 0.000297962057455337,
      "loss": 7.3831,
      "step": 340
    },
    {
      "epoch": 0.299154563190982,
      "grad_norm": 0.1608484618987419,
      "learning_rate": 0.0002977712769805465,
      "loss": 7.3528,
      "step": 345
    },
    {
      "epoch": 0.30349013657056145,
      "grad_norm": 0.1484369817347631,
      "learning_rate": 0.00029757202829799986,
      "loss": 7.3502,
      "step": 350
    },
    {
      "epoch": 0.3078257099501409,
      "grad_norm": 0.22145679449709124,
      "learning_rate": 0.0002973643228258784,
      "loss": 7.3133,
      "step": 355
    },
    {
      "epoch": 0.3121612833297204,
      "grad_norm": 0.2122717500901206,
      "learning_rate": 0.0002971481724669898,
      "loss": 7.3684,
      "step": 360
    },
    {
      "epoch": 0.3164968567092998,
      "grad_norm": 0.14900310266228523,
      "learning_rate": 0.0002969235896080861,
      "loss": 7.3474,
      "step": 365
    },
    {
      "epoch": 0.32083243008887924,
      "grad_norm": 0.17278164384767145,
      "learning_rate": 0.0002966905871191534,
      "loss": 7.3683,
      "step": 370
    },
    {
      "epoch": 0.3251680034684587,
      "grad_norm": 0.20274199845375984,
      "learning_rate": 0.0002964491783526749,
      "loss": 7.3476,
      "step": 375
    },
    {
      "epoch": 0.32950357684803816,
      "grad_norm": 0.13540893972237644,
      "learning_rate": 0.00029619937714286547,
      "loss": 7.3424,
      "step": 380
    },
    {
      "epoch": 0.3338391502276176,
      "grad_norm": 0.16792713197591935,
      "learning_rate": 0.0002959411978048787,
      "loss": 7.3629,
      "step": 385
    },
    {
      "epoch": 0.338174723607197,
      "grad_norm": 0.16598160402305692,
      "learning_rate": 0.00029567465513398694,
      "loss": 7.3435,
      "step": 390
    },
    {
      "epoch": 0.3425102969867765,
      "grad_norm": 0.15743754774094762,
      "learning_rate": 0.00029539976440473304,
      "loss": 7.3405,
      "step": 395
    },
    {
      "epoch": 0.34684587036635595,
      "grad_norm": 0.1928431563375656,
      "learning_rate": 0.00029511654137005534,
      "loss": 7.3398,
      "step": 400
    },
    {
      "epoch": 0.3511814437459354,
      "grad_norm": 0.4234041342810784,
      "learning_rate": 0.00029482500226038467,
      "loss": 7.3163,
      "step": 405
    },
    {
      "epoch": 0.35551701712551487,
      "grad_norm": 1.6964467666559244,
      "learning_rate": 0.00029452516378271446,
      "loss": 7.4424,
      "step": 410
    },
    {
      "epoch": 0.35985259050509427,
      "grad_norm": 1.1952504992783122,
      "learning_rate": 0.00029421704311964316,
      "loss": 7.3051,
      "step": 415
    },
    {
      "epoch": 0.36418816388467373,
      "grad_norm": 0.9910699747566197,
      "learning_rate": 0.0002939006579283898,
      "loss": 7.1588,
      "step": 420
    },
    {
      "epoch": 0.3685237372642532,
      "grad_norm": 0.5010649163848268,
      "learning_rate": 0.00029357602633978185,
      "loss": 7.0579,
      "step": 425
    },
    {
      "epoch": 0.37285931064383265,
      "grad_norm": 0.3223794058961508,
      "learning_rate": 0.0002932431669572163,
      "loss": 6.9952,
      "step": 430
    },
    {
      "epoch": 0.3771948840234121,
      "grad_norm": 0.39385777708948017,
      "learning_rate": 0.00029290209885559363,
      "loss": 6.9317,
      "step": 435
    },
    {
      "epoch": 0.3815304574029915,
      "grad_norm": 0.6906673045815623,
      "learning_rate": 0.00029255284158022474,
      "loss": 6.9197,
      "step": 440
    },
    {
      "epoch": 0.385866030782571,
      "grad_norm": 0.5699230517220503,
      "learning_rate": 0.00029219541514571075,
      "loss": 6.9122,
      "step": 445
    },
    {
      "epoch": 0.39020160416215044,
      "grad_norm": 0.3399949766360826,
      "learning_rate": 0.00029182984003479613,
      "loss": 6.8496,
      "step": 450
    },
    {
      "epoch": 0.3945371775417299,
      "grad_norm": 0.46915620454457757,
      "learning_rate": 0.00029145613719719484,
      "loss": 6.8021,
      "step": 455
    },
    {
      "epoch": 0.39887275092130936,
      "grad_norm": 0.7795294645969753,
      "learning_rate": 0.0002910743280483899,
      "loss": 6.7266,
      "step": 460
    },
    {
      "epoch": 0.40320832430088877,
      "grad_norm": 1.1660550286252116,
      "learning_rate": 0.00029068443446840606,
      "loss": 6.8039,
      "step": 465
    },
    {
      "epoch": 0.4075438976804682,
      "grad_norm": 1.9954850065496847,
      "learning_rate": 0.0002902864788005559,
      "loss": 6.7036,
      "step": 470
    },
    {
      "epoch": 0.4118794710600477,
      "grad_norm": 1.3144540313281778,
      "learning_rate": 0.00028988048385015955,
      "loss": 6.6625,
      "step": 475
    },
    {
      "epoch": 0.41621504443962715,
      "grad_norm": 0.5550135068925496,
      "learning_rate": 0.00028946647288323766,
      "loss": 6.5448,
      "step": 480
    },
    {
      "epoch": 0.4205506178192066,
      "grad_norm": 0.30284368817332974,
      "learning_rate": 0.0002890444696251783,
      "loss": 6.4523,
      "step": 485
    },
    {
      "epoch": 0.424886191198786,
      "grad_norm": 0.5328756605828856,
      "learning_rate": 0.0002886144982593771,
      "loss": 6.3727,
      "step": 490
    },
    {
      "epoch": 0.4292217645783655,
      "grad_norm": 1.2843902165543528,
      "learning_rate": 0.0002881765834258516,
      "loss": 6.471,
      "step": 495
    },
    {
      "epoch": 0.43355733795794493,
      "grad_norm": 0.7139034044101961,
      "learning_rate": 0.00028773075021982917,
      "loss": 6.3271,
      "step": 500
    },
    {
      "epoch": 0.4378929113375244,
      "grad_norm": 0.41272252446113744,
      "learning_rate": 0.00028727702419030883,
      "loss": 6.2754,
      "step": 505
    },
    {
      "epoch": 0.44222848471710385,
      "grad_norm": 0.24680608205710353,
      "learning_rate": 0.00028681543133859716,
      "loss": 6.1946,
      "step": 510
    },
    {
      "epoch": 0.4465640580966833,
      "grad_norm": 0.9592076263299766,
      "learning_rate": 0.0002863459981168184,
      "loss": 6.1744,
      "step": 515
    },
    {
      "epoch": 0.4508996314762627,
      "grad_norm": 2.2348898675903532,
      "learning_rate": 0.0002858687514263983,
      "loss": 6.112,
      "step": 520
    },
    {
      "epoch": 0.4552352048558422,
      "grad_norm": 0.5214216239256872,
      "learning_rate": 0.00028538371861652284,
      "loss": 6.1034,
      "step": 525
    },
    {
      "epoch": 0.45957077823542164,
      "grad_norm": 0.6212645654417902,
      "learning_rate": 0.00028489092748257066,
      "loss": 6.0164,
      "step": 530
    },
    {
      "epoch": 0.4639063516150011,
      "grad_norm": 0.7124953084880589,
      "learning_rate": 0.0002843904062645204,
      "loss": 5.986,
      "step": 535
    },
    {
      "epoch": 0.46824192499458056,
      "grad_norm": 0.3376451437228419,
      "learning_rate": 0.0002838821836453323,
      "loss": 5.9095,
      "step": 540
    },
    {
      "epoch": 0.47257749837415997,
      "grad_norm": 2.464753993204931,
      "learning_rate": 0.0002833662887493045,
      "loss": 5.9207,
      "step": 545
    },
    {
      "epoch": 0.4769130717537394,
      "grad_norm": 0.8049577940385247,
      "learning_rate": 0.00028284275114040395,
      "loss": 5.9179,
      "step": 550
    },
    {
      "epoch": 0.4812486451333189,
      "grad_norm": 1.3390472579975088,
      "learning_rate": 0.0002823116008205725,
      "loss": 5.9107,
      "step": 555
    },
    {
      "epoch": 0.48558421851289835,
      "grad_norm": 0.6905384313954396,
      "learning_rate": 0.00028177286822800713,
      "loss": 5.796,
      "step": 560
    },
    {
      "epoch": 0.4899197918924778,
      "grad_norm": 2.8514904783073898,
      "learning_rate": 0.0002812265842354162,
      "loss": 5.7603,
      "step": 565
    },
    {
      "epoch": 0.4942553652720572,
      "grad_norm": 1.5555920419031897,
      "learning_rate": 0.0002806727801482498,
      "loss": 5.9134,
      "step": 570
    },
    {
      "epoch": 0.4985909386516367,
      "grad_norm": 0.8043247332715469,
      "learning_rate": 0.000280111487702906,
      "loss": 5.8254,
      "step": 575
    },
    {
      "epoch": 0.5029265120312161,
      "grad_norm": 0.5751977544216561,
      "learning_rate": 0.0002795427390649119,
      "loss": 5.7081,
      "step": 580
    },
    {
      "epoch": 0.5072620854107955,
      "grad_norm": 0.6822440874804724,
      "learning_rate": 0.00027896656682708094,
      "loss": 5.6121,
      "step": 585
    },
    {
      "epoch": 0.511597658790375,
      "grad_norm": 0.8797162911577192,
      "learning_rate": 0.0002783830040076444,
      "loss": 5.5998,
      "step": 590
    },
    {
      "epoch": 0.5159332321699545,
      "grad_norm": 0.4760553301829561,
      "learning_rate": 0.0002777920840483596,
      "loss": 5.5739,
      "step": 595
    },
    {
      "epoch": 0.5202688055495339,
      "grad_norm": 1.2070114825643932,
      "learning_rate": 0.0002771938408125936,
      "loss": 5.539,
      "step": 600
    },
    {
      "epoch": 0.5246043789291134,
      "grad_norm": 0.3589877318117702,
      "learning_rate": 0.00027658830858338245,
      "loss": 5.5504,
      "step": 605
    },
    {
      "epoch": 0.5289399523086928,
      "grad_norm": 0.7944121659454099,
      "learning_rate": 0.0002759755220614664,
      "loss": 5.5072,
      "step": 610
    },
    {
      "epoch": 0.5332755256882723,
      "grad_norm": 0.7202458729236075,
      "learning_rate": 0.00027535551636330175,
      "loss": 5.454,
      "step": 615
    },
    {
      "epoch": 0.5376110990678518,
      "grad_norm": 2.106706884389399,
      "learning_rate": 0.0002747283270190482,
      "loss": 5.4935,
      "step": 620
    },
    {
      "epoch": 0.5419466724474312,
      "grad_norm": 1.3364284227174121,
      "learning_rate": 0.0002740939899705327,
      "loss": 5.5994,
      "step": 625
    },
    {
      "epoch": 0.5462822458270106,
      "grad_norm": 0.6407462262016212,
      "learning_rate": 0.00027345254156918976,
      "loss": 5.5447,
      "step": 630
    },
    {
      "epoch": 0.55061781920659,
      "grad_norm": 0.7271335411877718,
      "learning_rate": 0.00027280401857397854,
      "loss": 5.4461,
      "step": 635
    },
    {
      "epoch": 0.5549533925861695,
      "grad_norm": 0.9256391048561129,
      "learning_rate": 0.0002721484581492762,
      "loss": 5.3663,
      "step": 640
    },
    {
      "epoch": 0.559288965965749,
      "grad_norm": 0.7708081453999853,
      "learning_rate": 0.00027148589786274793,
      "loss": 5.3796,
      "step": 645
    },
    {
      "epoch": 0.5636245393453284,
      "grad_norm": 0.7777461095797896,
      "learning_rate": 0.00027081637568319446,
      "loss": 5.2963,
      "step": 650
    },
    {
      "epoch": 0.5679601127249079,
      "grad_norm": 0.5329435487662679,
      "learning_rate": 0.00027013992997837585,
      "loss": 5.2219,
      "step": 655
    },
    {
      "epoch": 0.5722956861044873,
      "grad_norm": 0.6524870578047196,
      "learning_rate": 0.0002694565995128132,
      "loss": 5.2601,
      "step": 660
    },
    {
      "epoch": 0.5766312594840668,
      "grad_norm": 1.2050259366493623,
      "learning_rate": 0.0002687664234455667,
      "loss": 5.2788,
      "step": 665
    },
    {
      "epoch": 0.5809668328636463,
      "grad_norm": 0.6205369626056482,
      "learning_rate": 0.00026806944132799196,
      "loss": 5.1169,
      "step": 670
    },
    {
      "epoch": 0.5853024062432257,
      "grad_norm": 0.9580684303122623,
      "learning_rate": 0.0002673656931014735,
      "loss": 5.1311,
      "step": 675
    },
    {
      "epoch": 0.5896379796228051,
      "grad_norm": 0.9227742616515185,
      "learning_rate": 0.00026665521909513545,
      "loss": 5.1194,
      "step": 680
    },
    {
      "epoch": 0.5939735530023845,
      "grad_norm": 0.35677362077396757,
      "learning_rate": 0.00026593806002353086,
      "loss": 5.0662,
      "step": 685
    },
    {
      "epoch": 0.598309126381964,
      "grad_norm": 0.5456219742352171,
      "learning_rate": 0.0002652142569843083,
      "loss": 4.9998,
      "step": 690
    },
    {
      "epoch": 0.6026446997615434,
      "grad_norm": 2.0462321590818453,
      "learning_rate": 0.0002644838514558568,
      "loss": 5.0121,
      "step": 695
    },
    {
      "epoch": 0.6069802731411229,
      "grad_norm": 1.2389697363524088,
      "learning_rate": 0.00026374688529492887,
      "loss": 4.9563,
      "step": 700
    },
    {
      "epoch": 0.6113158465207024,
      "grad_norm": 0.6691025924761106,
      "learning_rate": 0.0002630034007342416,
      "loss": 4.9738,
      "step": 705
    },
    {
      "epoch": 0.6156514199002818,
      "grad_norm": 0.6906310139155549,
      "learning_rate": 0.00026225344038005707,
      "loss": 4.9986,
      "step": 710
    },
    {
      "epoch": 0.6199869932798613,
      "grad_norm": 0.9744576847003475,
      "learning_rate": 0.00026149704720974004,
      "loss": 4.9758,
      "step": 715
    },
    {
      "epoch": 0.6243225666594407,
      "grad_norm": 1.4069257744518648,
      "learning_rate": 0.0002607342645692955,
      "loss": 4.9898,
      "step": 720
    },
    {
      "epoch": 0.6286581400390202,
      "grad_norm": 1.2113043966119588,
      "learning_rate": 0.0002599651361708846,
      "loss": 4.947,
      "step": 725
    },
    {
      "epoch": 0.6329937134185996,
      "grad_norm": 0.7232403329008882,
      "learning_rate": 0.0002591897060903197,
      "loss": 4.8734,
      "step": 730
    },
    {
      "epoch": 0.637329286798179,
      "grad_norm": 0.5694346606113075,
      "learning_rate": 0.0002584080187645384,
      "loss": 4.8135,
      "step": 735
    },
    {
      "epoch": 0.6416648601777585,
      "grad_norm": 0.4648121706598134,
      "learning_rate": 0.00025762011898905723,
      "loss": 4.8169,
      "step": 740
    },
    {
      "epoch": 0.6460004335573379,
      "grad_norm": 0.57832813623837,
      "learning_rate": 0.00025682605191540447,
      "loss": 4.7676,
      "step": 745
    },
    {
      "epoch": 0.6503360069369174,
      "grad_norm": 0.690363584833736,
      "learning_rate": 0.00025602586304853265,
      "loss": 4.7134,
      "step": 750
    },
    {
      "epoch": 0.6546715803164969,
      "grad_norm": 0.2557886503788499,
      "learning_rate": 0.000255219598244211,
      "loss": 4.7075,
      "step": 755
    },
    {
      "epoch": 0.6590071536960763,
      "grad_norm": 0.7529134213339467,
      "learning_rate": 0.00025440730370639744,
      "loss": 4.65,
      "step": 760
    },
    {
      "epoch": 0.6633427270756558,
      "grad_norm": 0.5243957618805375,
      "learning_rate": 0.00025358902598459097,
      "loss": 4.6432,
      "step": 765
    },
    {
      "epoch": 0.6676783004552352,
      "grad_norm": 0.8067461754194933,
      "learning_rate": 0.00025276481197116397,
      "loss": 4.6508,
      "step": 770
    },
    {
      "epoch": 0.6720138738348147,
      "grad_norm": 1.2265176252828778,
      "learning_rate": 0.00025193470889867505,
      "loss": 4.6586,
      "step": 775
    },
    {
      "epoch": 0.676349447214394,
      "grad_norm": 1.0124995665289962,
      "learning_rate": 0.00025109876433716236,
      "loss": 4.5788,
      "step": 780
    },
    {
      "epoch": 0.6806850205939735,
      "grad_norm": 0.9858327008465355,
      "learning_rate": 0.0002502570261914174,
      "loss": 4.5459,
      "step": 785
    },
    {
      "epoch": 0.685020593973553,
      "grad_norm": 0.675937090658665,
      "learning_rate": 0.0002494095426982399,
      "loss": 4.5489,
      "step": 790
    },
    {
      "epoch": 0.6893561673531324,
      "grad_norm": 0.9300433002492945,
      "learning_rate": 0.0002485563624236736,
      "loss": 4.55,
      "step": 795
    },
    {
      "epoch": 0.6936917407327119,
      "grad_norm": 0.5489664608896133,
      "learning_rate": 0.0002476975342602229,
      "loss": 4.4796,
      "step": 800
    },
    {
      "epoch": 0.6980273141122914,
      "grad_norm": 0.826574591896581,
      "learning_rate": 0.00024683310742405106,
      "loss": 4.4609,
      "step": 805
    },
    {
      "epoch": 0.7023628874918708,
      "grad_norm": 1.1037971961437545,
      "learning_rate": 0.00024596313145216033,
      "loss": 4.5026,
      "step": 810
    },
    {
      "epoch": 0.7066984608714503,
      "grad_norm": 0.4898602321700531,
      "learning_rate": 0.0002450876561995523,
      "loss": 4.4346,
      "step": 815
    },
    {
      "epoch": 0.7110340342510297,
      "grad_norm": 0.5951952065642176,
      "learning_rate": 0.00024420673183637146,
      "loss": 4.4397,
      "step": 820
    },
    {
      "epoch": 0.7153696076306092,
      "grad_norm": 0.8199632042459121,
      "learning_rate": 0.00024332040884503023,
      "loss": 4.4169,
      "step": 825
    },
    {
      "epoch": 0.7197051810101885,
      "grad_norm": 0.7484859334102708,
      "learning_rate": 0.00024242873801731552,
      "loss": 4.4214,
      "step": 830
    },
    {
      "epoch": 0.724040754389768,
      "grad_norm": 0.3847206228382832,
      "learning_rate": 0.0002415317704514785,
      "loss": 4.4005,
      "step": 835
    },
    {
      "epoch": 0.7283763277693475,
      "grad_norm": 0.25033596262216246,
      "learning_rate": 0.0002406295575493061,
      "loss": 4.2858,
      "step": 840
    },
    {
      "epoch": 0.7327119011489269,
      "grad_norm": 0.6631031175545227,
      "learning_rate": 0.00023972215101317545,
      "loss": 4.2667,
      "step": 845
    },
    {
      "epoch": 0.7370474745285064,
      "grad_norm": 0.9152978939845885,
      "learning_rate": 0.00023880960284309116,
      "loss": 4.2363,
      "step": 850
    },
    {
      "epoch": 0.7413830479080858,
      "grad_norm": 1.6470126913016125,
      "learning_rate": 0.000237891965333705,
      "loss": 4.2788,
      "step": 855
    },
    {
      "epoch": 0.7457186212876653,
      "grad_norm": 0.9797191599004809,
      "learning_rate": 0.00023696929107131962,
      "loss": 4.3022,
      "step": 860
    },
    {
      "epoch": 0.7500541946672448,
      "grad_norm": 0.6420083064783988,
      "learning_rate": 0.00023604163293087447,
      "loss": 4.2127,
      "step": 865
    },
    {
      "epoch": 0.7543897680468242,
      "grad_norm": 0.3430659331012604,
      "learning_rate": 0.0002351090440729163,
      "loss": 4.183,
      "step": 870
    },
    {
      "epoch": 0.7587253414264037,
      "grad_norm": 0.5456932919947513,
      "learning_rate": 0.00023417157794055233,
      "loss": 4.1664,
      "step": 875
    },
    {
      "epoch": 0.763060914805983,
      "grad_norm": 0.6297764897631516,
      "learning_rate": 0.0002332292882563877,
      "loss": 4.1577,
      "step": 880
    },
    {
      "epoch": 0.7673964881855625,
      "grad_norm": 0.7190804662247191,
      "learning_rate": 0.00023228222901944693,
      "loss": 4.1005,
      "step": 885
    },
    {
      "epoch": 0.771732061565142,
      "grad_norm": 0.7661623579650987,
      "learning_rate": 0.00023133045450207952,
      "loss": 4.1292,
      "step": 890
    },
    {
      "epoch": 0.7760676349447214,
      "grad_norm": 1.0103277074141193,
      "learning_rate": 0.00023037401924684946,
      "loss": 4.1244,
      "step": 895
    },
    {
      "epoch": 0.7804032083243009,
      "grad_norm": 0.6247976412624747,
      "learning_rate": 0.0002294129780634101,
      "loss": 4.1062,
      "step": 900
    },
    {
      "epoch": 0.7847387817038803,
      "grad_norm": 0.6772595937883702,
      "learning_rate": 0.00022844738602536275,
      "loss": 4.0618,
      "step": 905
    },
    {
      "epoch": 0.7890743550834598,
      "grad_norm": 0.6184206318420459,
      "learning_rate": 0.00022747729846710085,
      "loss": 4.0676,
      "step": 910
    },
    {
      "epoch": 0.7934099284630393,
      "grad_norm": 0.638858720650207,
      "learning_rate": 0.0002265027709806391,
      "loss": 4.0643,
      "step": 915
    },
    {
      "epoch": 0.7977455018426187,
      "grad_norm": 0.6957940244542513,
      "learning_rate": 0.00022552385941242736,
      "loss": 4.0841,
      "step": 920
    },
    {
      "epoch": 0.8020810752221982,
      "grad_norm": 0.9335514321597355,
      "learning_rate": 0.00022454061986015047,
      "loss": 4.0154,
      "step": 925
    },
    {
      "epoch": 0.8064166486017775,
      "grad_norm": 0.4531070372158566,
      "learning_rate": 0.0002235531086695137,
      "loss": 3.9897,
      "step": 930
    },
    {
      "epoch": 0.810752221981357,
      "grad_norm": 0.9518970467477127,
      "learning_rate": 0.00022256138243101337,
      "loss": 3.9785,
      "step": 935
    },
    {
      "epoch": 0.8150877953609365,
      "grad_norm": 0.7222836717307966,
      "learning_rate": 0.00022156549797669434,
      "loss": 3.9408,
      "step": 940
    },
    {
      "epoch": 0.8194233687405159,
      "grad_norm": 0.4376277363109283,
      "learning_rate": 0.00022056551237689277,
      "loss": 3.9633,
      "step": 945
    },
    {
      "epoch": 0.8237589421200954,
      "grad_norm": 0.40106634394410035,
      "learning_rate": 0.00021956148293696584,
      "loss": 3.9324,
      "step": 950
    },
    {
      "epoch": 0.8280945154996748,
      "grad_norm": 0.4672307168059042,
      "learning_rate": 0.00021855346719400787,
      "loss": 3.9066,
      "step": 955
    },
    {
      "epoch": 0.8324300888792543,
      "grad_norm": 0.9110683889580367,
      "learning_rate": 0.00021754152291355284,
      "loss": 3.8493,
      "step": 960
    },
    {
      "epoch": 0.8367656622588338,
      "grad_norm": 0.8166157046078413,
      "learning_rate": 0.0002165257080862643,
      "loss": 3.8129,
      "step": 965
    },
    {
      "epoch": 0.8411012356384132,
      "grad_norm": 0.5406920005139866,
      "learning_rate": 0.00021550608092461208,
      "loss": 3.8946,
      "step": 970
    },
    {
      "epoch": 0.8454368090179927,
      "grad_norm": 0.9346807879316044,
      "learning_rate": 0.00021448269985953634,
      "loss": 3.8407,
      "step": 975
    },
    {
      "epoch": 0.849772382397572,
      "grad_norm": 0.5449137337223598,
      "learning_rate": 0.00021345562353709905,
      "loss": 3.8459,
      "step": 980
    },
    {
      "epoch": 0.8541079557771515,
      "grad_norm": 0.5017093393188926,
      "learning_rate": 0.00021242491081512329,
      "loss": 3.8334,
      "step": 985
    },
    {
      "epoch": 0.858443529156731,
      "grad_norm": 0.3346538476585638,
      "learning_rate": 0.00021139062075982038,
      "loss": 3.7552,
      "step": 990
    },
    {
      "epoch": 0.8627791025363104,
      "grad_norm": 0.3170992203885485,
      "learning_rate": 0.00021035281264240491,
      "loss": 3.7351,
      "step": 995
    },
    {
      "epoch": 0.8671146759158899,
      "grad_norm": 0.6053179178799784,
      "learning_rate": 0.00020931154593569813,
      "loss": 3.7225,
      "step": 1000
    },
    {
      "epoch": 0.8714502492954693,
      "grad_norm": 1.3058143067536492,
      "learning_rate": 0.00020826688031072,
      "loss": 3.7079,
      "step": 1005
    },
    {
      "epoch": 0.8757858226750488,
      "grad_norm": 0.5266184524547373,
      "learning_rate": 0.00020721887563326924,
      "loss": 3.7352,
      "step": 1010
    },
    {
      "epoch": 0.8801213960546282,
      "grad_norm": 0.420302054844465,
      "learning_rate": 0.0002061675919604932,
      "loss": 3.6589,
      "step": 1015
    },
    {
      "epoch": 0.8844569694342077,
      "grad_norm": 0.7488296555492675,
      "learning_rate": 0.00020511308953744578,
      "loss": 3.6358,
      "step": 1020
    },
    {
      "epoch": 0.8887925428137872,
      "grad_norm": 0.6601885350762652,
      "learning_rate": 0.0002040554287936352,
      "loss": 3.6682,
      "step": 1025
    },
    {
      "epoch": 0.8931281161933666,
      "grad_norm": 0.3880230100654199,
      "learning_rate": 0.000202994670339561,
      "loss": 3.6391,
      "step": 1030
    },
    {
      "epoch": 0.897463689572946,
      "grad_norm": 0.3917175363168505,
      "learning_rate": 0.00020193087496324068,
      "loss": 3.6016,
      "step": 1035
    },
    {
      "epoch": 0.9017992629525254,
      "grad_norm": 0.46356273931086533,
      "learning_rate": 0.00020086410362672608,
      "loss": 3.5906,
      "step": 1040
    },
    {
      "epoch": 0.9061348363321049,
      "grad_norm": 1.1143119522475413,
      "learning_rate": 0.00019979441746261007,
      "loss": 3.6533,
      "step": 1045
    },
    {
      "epoch": 0.9104704097116844,
      "grad_norm": 0.9967538144738599,
      "learning_rate": 0.0001987218777705231,
      "loss": 3.6323,
      "step": 1050
    },
    {
      "epoch": 0.9148059830912638,
      "grad_norm": 0.44096052107987527,
      "learning_rate": 0.0001976465460136204,
      "loss": 3.5632,
      "step": 1055
    },
    {
      "epoch": 0.9191415564708433,
      "grad_norm": 0.2828790472438776,
      "learning_rate": 0.0001965684838150598,
      "loss": 3.5499,
      "step": 1060
    },
    {
      "epoch": 0.9234771298504227,
      "grad_norm": 0.3974061570448302,
      "learning_rate": 0.00019548775295447047,
      "loss": 3.5173,
      "step": 1065
    },
    {
      "epoch": 0.9278127032300022,
      "grad_norm": 0.33203900964680516,
      "learning_rate": 0.00019440441536441202,
      "loss": 3.514,
      "step": 1070
    },
    {
      "epoch": 0.9321482766095817,
      "grad_norm": 0.5937750161188399,
      "learning_rate": 0.00019331853312682613,
      "loss": 3.4923,
      "step": 1075
    },
    {
      "epoch": 0.9364838499891611,
      "grad_norm": 0.45882557948133224,
      "learning_rate": 0.00019223016846947843,
      "loss": 3.4693,
      "step": 1080
    },
    {
      "epoch": 0.9408194233687405,
      "grad_norm": 0.49337347425087247,
      "learning_rate": 0.00019113938376239247,
      "loss": 3.4604,
      "step": 1085
    },
    {
      "epoch": 0.9451549967483199,
      "grad_norm": 0.35941699726534343,
      "learning_rate": 0.00019004624151427568,
      "loss": 3.4682,
      "step": 1090
    },
    {
      "epoch": 0.9494905701278994,
      "grad_norm": 0.30818796537571425,
      "learning_rate": 0.0001889508043689372,
      "loss": 3.4252,
      "step": 1095
    },
    {
      "epoch": 0.9538261435074789,
      "grad_norm": 0.7001199109738344,
      "learning_rate": 0.00018785313510169782,
      "loss": 3.4065,
      "step": 1100
    },
    {
      "epoch": 0.9581617168870583,
      "grad_norm": 0.9768195615514486,
      "learning_rate": 0.0001867532966157929,
      "loss": 3.4084,
      "step": 1105
    },
    {
      "epoch": 0.9624972902666378,
      "grad_norm": 0.5379630797492526,
      "learning_rate": 0.0001856513519387673,
      "loss": 3.4402,
      "step": 1110
    },
    {
      "epoch": 0.9668328636462172,
      "grad_norm": 0.6853314266411482,
      "learning_rate": 0.0001845473642188637,
      "loss": 3.411,
      "step": 1115
    },
    {
      "epoch": 0.9711684370257967,
      "grad_norm": 0.28464215705198403,
      "learning_rate": 0.00018344139672140384,
      "loss": 3.396,
      "step": 1120
    },
    {
      "epoch": 0.9755040104053762,
      "grad_norm": 0.3931004534338328,
      "learning_rate": 0.00018233351282516283,
      "loss": 3.3599,
      "step": 1125
    },
    {
      "epoch": 0.9798395837849556,
      "grad_norm": 0.4836933366464829,
      "learning_rate": 0.00018122377601873733,
      "loss": 3.3365,
      "step": 1130
    },
    {
      "epoch": 0.984175157164535,
      "grad_norm": 0.4500537170978018,
      "learning_rate": 0.00018011224989690727,
      "loss": 3.3036,
      "step": 1135
    },
    {
      "epoch": 0.9885107305441144,
      "grad_norm": 0.2859298083164817,
      "learning_rate": 0.00017899899815699134,
      "loss": 3.2616,
      "step": 1140
    },
    {
      "epoch": 0.9928463039236939,
      "grad_norm": 0.7548873266247055,
      "learning_rate": 0.00017788408459519674,
      "loss": 3.2599,
      "step": 1145
    },
    {
      "epoch": 0.9971818773032733,
      "grad_norm": 0.32602384410109714,
      "learning_rate": 0.00017676757310296356,
      "loss": 3.2946,
      "step": 1150
    },
    {
      "epoch": 1.0008671146759158,
      "grad_norm": 1.0724922966955157,
      "learning_rate": 0.00017564952766330308,
      "loss": 3.2325,
      "step": 1155
    },
    {
      "epoch": 1.0052026880554954,
      "grad_norm": 0.3056819985122965,
      "learning_rate": 0.00017453001234713107,
      "loss": 3.2937,
      "step": 1160
    },
    {
      "epoch": 1.0095382614350747,
      "grad_norm": 0.36438507475227383,
      "learning_rate": 0.0001734090913095966,
      "loss": 3.2332,
      "step": 1165
    },
    {
      "epoch": 1.0138738348146543,
      "grad_norm": 0.30915725984980597,
      "learning_rate": 0.00017228682878640508,
      "loss": 3.2364,
      "step": 1170
    },
    {
      "epoch": 1.0182094081942337,
      "grad_norm": 0.33647494500052355,
      "learning_rate": 0.0001711632890901374,
      "loss": 3.2003,
      "step": 1175
    },
    {
      "epoch": 1.0225449815738132,
      "grad_norm": 0.312162857766132,
      "learning_rate": 0.00017003853660656435,
      "loss": 3.1807,
      "step": 1180
    },
    {
      "epoch": 1.0268805549533926,
      "grad_norm": 0.28432133622360134,
      "learning_rate": 0.00016891263579095698,
      "loss": 3.1668,
      "step": 1185
    },
    {
      "epoch": 1.031216128332972,
      "grad_norm": 0.27204342913825097,
      "learning_rate": 0.0001677856511643928,
      "loss": 3.1283,
      "step": 1190
    },
    {
      "epoch": 1.0355517017125515,
      "grad_norm": 0.47073942395246565,
      "learning_rate": 0.00016665764731005838,
      "loss": 3.0741,
      "step": 1195
    },
    {
      "epoch": 1.0398872750921309,
      "grad_norm": 0.48701637427424527,
      "learning_rate": 0.0001655286888695484,
      "loss": 3.079,
      "step": 1200
    },
    {
      "epoch": 1.0442228484717104,
      "grad_norm": 0.43725300587861615,
      "learning_rate": 0.0001643988405391612,
      "loss": 3.1095,
      "step": 1205
    },
    {
      "epoch": 1.0485584218512898,
      "grad_norm": 0.4519430510361337,
      "learning_rate": 0.00016326816706619136,
      "loss": 3.0779,
      "step": 1210
    },
    {
      "epoch": 1.0528939952308694,
      "grad_norm": 0.3697980433575495,
      "learning_rate": 0.00016213673324521913,
      "loss": 3.1321,
      "step": 1215
    },
    {
      "epoch": 1.0572295686104487,
      "grad_norm": 0.3772355822034261,
      "learning_rate": 0.00016100460391439749,
      "loss": 3.0517,
      "step": 1220
    },
    {
      "epoch": 1.0615651419900283,
      "grad_norm": 0.3795892280711555,
      "learning_rate": 0.0001598718439517364,
      "loss": 3.0278,
      "step": 1225
    },
    {
      "epoch": 1.0659007153696076,
      "grad_norm": 0.2889179402680778,
      "learning_rate": 0.0001587385182713849,
      "loss": 3.0402,
      "step": 1230
    },
    {
      "epoch": 1.0702362887491872,
      "grad_norm": 0.3755593221213261,
      "learning_rate": 0.0001576046918199112,
      "loss": 2.994,
      "step": 1235
    },
    {
      "epoch": 1.0745718621287665,
      "grad_norm": 0.3476317696753179,
      "learning_rate": 0.0001564704295725808,
      "loss": 3.0468,
      "step": 1240
    },
    {
      "epoch": 1.078907435508346,
      "grad_norm": 0.32852407166347947,
      "learning_rate": 0.00015533579652963288,
      "loss": 2.9539,
      "step": 1245
    },
    {
      "epoch": 1.0832430088879255,
      "grad_norm": 0.1822141394179994,
      "learning_rate": 0.00015420085771255566,
      "loss": 3.0026,
      "step": 1250
    },
    {
      "epoch": 1.0875785822675048,
      "grad_norm": 0.270319630849222,
      "learning_rate": 0.00015306567816036006,
      "loss": 2.976,
      "step": 1255
    },
    {
      "epoch": 1.0919141556470844,
      "grad_norm": 0.5149999760979279,
      "learning_rate": 0.00015193032292585247,
      "loss": 2.9326,
      "step": 1260
    },
    {
      "epoch": 1.0962497290266637,
      "grad_norm": 0.4445856562229245,
      "learning_rate": 0.00015079485707190717,
      "loss": 2.9483,
      "step": 1265
    },
    {
      "epoch": 1.1005853024062433,
      "grad_norm": 0.5660752187512902,
      "learning_rate": 0.00014965934566773753,
      "loss": 2.9209,
      "step": 1270
    },
    {
      "epoch": 1.1049208757858227,
      "grad_norm": 0.36107453692493174,
      "learning_rate": 0.00014852385378516712,
      "loss": 2.9059,
      "step": 1275
    },
    {
      "epoch": 1.1092564491654022,
      "grad_norm": 0.8307960065897146,
      "learning_rate": 0.00014738844649490106,
      "loss": 2.9135,
      "step": 1280
    },
    {
      "epoch": 1.1135920225449816,
      "grad_norm": 0.32930556448154574,
      "learning_rate": 0.0001462531888627966,
      "loss": 2.931,
      "step": 1285
    },
    {
      "epoch": 1.117927595924561,
      "grad_norm": 0.45094251331146196,
      "learning_rate": 0.00014511814594613461,
      "loss": 2.8794,
      "step": 1290
    },
    {
      "epoch": 1.1222631693041405,
      "grad_norm": 1.1023315045514466,
      "learning_rate": 0.00014398338278989167,
      "loss": 2.8964,
      "step": 1295
    },
    {
      "epoch": 1.1265987426837198,
      "grad_norm": 0.6884577647184886,
      "learning_rate": 0.00014284896442301218,
      "loss": 2.9186,
      "step": 1300
    },
    {
      "epoch": 1.1309343160632994,
      "grad_norm": 0.6853011714731221,
      "learning_rate": 0.00014171495585468195,
      "loss": 2.9093,
      "step": 1305
    },
    {
      "epoch": 1.1352698894428788,
      "grad_norm": 0.4444521764738651,
      "learning_rate": 0.000140581422070603,
      "loss": 2.8949,
      "step": 1310
    },
    {
      "epoch": 1.1396054628224583,
      "grad_norm": 0.36607451431757926,
      "learning_rate": 0.00013944842802926904,
      "loss": 2.8727,
      "step": 1315
    },
    {
      "epoch": 1.1439410362020377,
      "grad_norm": 0.20621050239222513,
      "learning_rate": 0.00013831603865824328,
      "loss": 2.8068,
      "step": 1320
    },
    {
      "epoch": 1.1482766095816173,
      "grad_norm": 0.29548591941866714,
      "learning_rate": 0.00013718431885043772,
      "loss": 2.8033,
      "step": 1325
    },
    {
      "epoch": 1.1526121829611966,
      "grad_norm": 0.3480007759133749,
      "learning_rate": 0.000136053333460394,
      "loss": 2.8303,
      "step": 1330
    },
    {
      "epoch": 1.156947756340776,
      "grad_norm": 0.2962266371943945,
      "learning_rate": 0.0001349231473005673,
      "loss": 2.7893,
      "step": 1335
    },
    {
      "epoch": 1.1612833297203555,
      "grad_norm": 0.23448406684507436,
      "learning_rate": 0.00013379382513761175,
      "loss": 2.7797,
      "step": 1340
    },
    {
      "epoch": 1.1656189030999349,
      "grad_norm": 0.3234043699614117,
      "learning_rate": 0.00013266543168866934,
      "loss": 2.7607,
      "step": 1345
    },
    {
      "epoch": 1.1699544764795144,
      "grad_norm": 0.3404947812121631,
      "learning_rate": 0.0001315380316176609,
      "loss": 2.7567,
      "step": 1350
    },
    {
      "epoch": 1.1742900498590938,
      "grad_norm": 0.6066048669028199,
      "learning_rate": 0.0001304116895315805,
      "loss": 2.7501,
      "step": 1355
    },
    {
      "epoch": 1.1786256232386734,
      "grad_norm": 0.44548332421799974,
      "learning_rate": 0.00012928646997679326,
      "loss": 2.7475,
      "step": 1360
    },
    {
      "epoch": 1.1829611966182527,
      "grad_norm": 0.3939467687702201,
      "learning_rate": 0.00012816243743533624,
      "loss": 2.7117,
      "step": 1365
    },
    {
      "epoch": 1.1872967699978323,
      "grad_norm": 0.3011307288220261,
      "learning_rate": 0.00012703965632122327,
      "loss": 2.7543,
      "step": 1370
    },
    {
      "epoch": 1.1916323433774116,
      "grad_norm": 0.22501315848677994,
      "learning_rate": 0.00012591819097675382,
      "loss": 2.7462,
      "step": 1375
    },
    {
      "epoch": 1.1959679167569912,
      "grad_norm": 0.32722976515069685,
      "learning_rate": 0.0001247981056688254,
      "loss": 2.6968,
      "step": 1380
    },
    {
      "epoch": 1.2003034901365706,
      "grad_norm": 0.1590488680202715,
      "learning_rate": 0.00012367946458525099,
      "loss": 2.7045,
      "step": 1385
    },
    {
      "epoch": 1.20463906351615,
      "grad_norm": 0.3152582769975424,
      "learning_rate": 0.00012256233183108068,
      "loss": 2.6789,
      "step": 1390
    },
    {
      "epoch": 1.2089746368957295,
      "grad_norm": 0.22965781079535302,
      "learning_rate": 0.00012144677142492789,
      "loss": 2.7101,
      "step": 1395
    },
    {
      "epoch": 1.2133102102753088,
      "grad_norm": 0.4684583447317611,
      "learning_rate": 0.00012033284729530057,
      "loss": 2.6259,
      "step": 1400
    },
    {
      "epoch": 1.2176457836548884,
      "grad_norm": 0.22363991138693204,
      "learning_rate": 0.00011922062327693832,
      "loss": 2.6717,
      "step": 1405
    },
    {
      "epoch": 1.2219813570344678,
      "grad_norm": 0.33497001761272166,
      "learning_rate": 0.00011811016310715355,
      "loss": 2.6517,
      "step": 1410
    },
    {
      "epoch": 1.2263169304140473,
      "grad_norm": 0.20548859646502277,
      "learning_rate": 0.00011700153042217931,
      "loss": 2.6677,
      "step": 1415
    },
    {
      "epoch": 1.2306525037936267,
      "grad_norm": 0.2652083246967948,
      "learning_rate": 0.00011589478875352255,
      "loss": 2.6543,
      "step": 1420
    },
    {
      "epoch": 1.2349880771732062,
      "grad_norm": 0.2529645672839692,
      "learning_rate": 0.00011479000152432319,
      "loss": 2.6205,
      "step": 1425
    },
    {
      "epoch": 1.2393236505527856,
      "grad_norm": 0.2601375038926653,
      "learning_rate": 0.0001136872320457197,
      "loss": 2.6102,
      "step": 1430
    },
    {
      "epoch": 1.2436592239323652,
      "grad_norm": 0.19250161537810334,
      "learning_rate": 0.00011258654351322107,
      "loss": 2.631,
      "step": 1435
    },
    {
      "epoch": 1.2479947973119445,
      "grad_norm": 0.35217150816617415,
      "learning_rate": 0.00011148799900308509,
      "loss": 2.6013,
      "step": 1440
    },
    {
      "epoch": 1.2523303706915239,
      "grad_norm": 0.8877189979150436,
      "learning_rate": 0.00011039166146870383,
      "loss": 2.6335,
      "step": 1445
    },
    {
      "epoch": 1.2566659440711034,
      "grad_norm": 0.6027781279490154,
      "learning_rate": 0.00010929759373699613,
      "loss": 2.6011,
      "step": 1450
    },
    {
      "epoch": 1.2610015174506828,
      "grad_norm": 0.29684477637886336,
      "learning_rate": 0.00010820585850480696,
      "loss": 2.6083,
      "step": 1455
    },
    {
      "epoch": 1.2653370908302624,
      "grad_norm": 0.22866123103951785,
      "learning_rate": 0.00010711651833531463,
      "loss": 2.6249,
      "step": 1460
    },
    {
      "epoch": 1.2696726642098417,
      "grad_norm": 0.2570183068878416,
      "learning_rate": 0.00010602963565444577,
      "loss": 2.5858,
      "step": 1465
    },
    {
      "epoch": 1.2740082375894213,
      "grad_norm": 0.31391058369721064,
      "learning_rate": 0.00010494527274729748,
      "loss": 2.5606,
      "step": 1470
    },
    {
      "epoch": 1.2783438109690006,
      "grad_norm": 0.3203458432202096,
      "learning_rate": 0.00010386349175456825,
      "loss": 2.5637,
      "step": 1475
    },
    {
      "epoch": 1.28267938434858,
      "grad_norm": 0.30834075039079006,
      "learning_rate": 0.00010278435466899714,
      "loss": 2.6011,
      "step": 1480
    },
    {
      "epoch": 1.2870149577281595,
      "grad_norm": 0.22399943173892975,
      "learning_rate": 0.00010170792333181084,
      "loss": 2.5288,
      "step": 1485
    },
    {
      "epoch": 1.2913505311077391,
      "grad_norm": 0.3319547375893817,
      "learning_rate": 0.00010063425942917974,
      "loss": 2.5375,
      "step": 1490
    },
    {
      "epoch": 1.2956861044873185,
      "grad_norm": 0.3315527934802732,
      "learning_rate": 9.956342448868354e-05,
      "loss": 2.5274,
      "step": 1495
    },
    {
      "epoch": 1.3000216778668978,
      "grad_norm": 0.3580645974138616,
      "learning_rate": 9.849547987578457e-05,
      "loss": 2.5585,
      "step": 1500
    },
    {
      "epoch": 1.3043572512464774,
      "grad_norm": 0.2780635499992022,
      "learning_rate": 9.743048679031163e-05,
      "loss": 2.5291,
      "step": 1505
    },
    {
      "epoch": 1.3086928246260567,
      "grad_norm": 0.40375989395133016,
      "learning_rate": 9.636850626295282e-05,
      "loss": 2.517,
      "step": 1510
    },
    {
      "epoch": 1.3130283980056363,
      "grad_norm": 0.3527800248976021,
      "learning_rate": 9.530959915175796e-05,
      "loss": 2.5277,
      "step": 1515
    },
    {
      "epoch": 1.3173639713852157,
      "grad_norm": 0.23822673694395258,
      "learning_rate": 9.425382613865107e-05,
      "loss": 2.5014,
      "step": 1520
    },
    {
      "epoch": 1.3216995447647952,
      "grad_norm": 0.1747138201174964,
      "learning_rate": 9.32012477259531e-05,
      "loss": 2.4866,
      "step": 1525
    },
    {
      "epoch": 1.3260351181443746,
      "grad_norm": 0.1954668050935581,
      "learning_rate": 9.215192423291463e-05,
      "loss": 2.5021,
      "step": 1530
    },
    {
      "epoch": 1.330370691523954,
      "grad_norm": 0.19057547870092642,
      "learning_rate": 9.110591579225906e-05,
      "loss": 2.5044,
      "step": 1535
    },
    {
      "epoch": 1.3347062649035335,
      "grad_norm": 0.2745708957809259,
      "learning_rate": 9.006328234673701e-05,
      "loss": 2.5073,
      "step": 1540
    },
    {
      "epoch": 1.339041838283113,
      "grad_norm": 0.20139794408374664,
      "learning_rate": 8.90240836456909e-05,
      "loss": 2.5033,
      "step": 1545
    },
    {
      "epoch": 1.3433774116626924,
      "grad_norm": 0.201183788959332,
      "learning_rate": 8.798837924163098e-05,
      "loss": 2.4782,
      "step": 1550
    },
    {
      "epoch": 1.3477129850422718,
      "grad_norm": 0.3799911989499351,
      "learning_rate": 8.695622848682291e-05,
      "loss": 2.4951,
      "step": 1555
    },
    {
      "epoch": 1.3520485584218513,
      "grad_norm": 0.1990146960863876,
      "learning_rate": 8.592769052988607e-05,
      "loss": 2.4901,
      "step": 1560
    },
    {
      "epoch": 1.3563841318014307,
      "grad_norm": 0.34068891992062217,
      "learning_rate": 8.490282431240416e-05,
      "loss": 2.4522,
      "step": 1565
    },
    {
      "epoch": 1.3607197051810103,
      "grad_norm": 0.2291955529635031,
      "learning_rate": 8.388168856554777e-05,
      "loss": 2.4203,
      "step": 1570
    },
    {
      "epoch": 1.3650552785605896,
      "grad_norm": 0.3248337694954652,
      "learning_rate": 8.286434180670822e-05,
      "loss": 2.4868,
      "step": 1575
    },
    {
      "epoch": 1.3693908519401692,
      "grad_norm": 0.2990054213661462,
      "learning_rate": 8.185084233614444e-05,
      "loss": 2.4363,
      "step": 1580
    },
    {
      "epoch": 1.3737264253197485,
      "grad_norm": 0.233583205490779,
      "learning_rate": 8.084124823364204e-05,
      "loss": 2.4807,
      "step": 1585
    },
    {
      "epoch": 1.3780619986993279,
      "grad_norm": 0.2518607909224173,
      "learning_rate": 7.983561735518474e-05,
      "loss": 2.4358,
      "step": 1590
    },
    {
      "epoch": 1.3823975720789075,
      "grad_norm": 0.19984297609600038,
      "learning_rate": 7.883400732963913e-05,
      "loss": 2.478,
      "step": 1595
    },
    {
      "epoch": 1.3867331454584868,
      "grad_norm": 0.1648080690690456,
      "learning_rate": 7.783647555545217e-05,
      "loss": 2.442,
      "step": 1600
    },
    {
      "epoch": 1.3910687188380664,
      "grad_norm": 0.17610648998979445,
      "learning_rate": 7.684307919736158e-05,
      "loss": 2.41,
      "step": 1605
    },
    {
      "epoch": 1.3954042922176457,
      "grad_norm": 0.14818666921811272,
      "learning_rate": 7.585387518312028e-05,
      "loss": 2.4206,
      "step": 1610
    },
    {
      "epoch": 1.3997398655972253,
      "grad_norm": 0.2001322045633342,
      "learning_rate": 7.486892020023406e-05,
      "loss": 2.3821,
      "step": 1615
    },
    {
      "epoch": 1.4040754389768046,
      "grad_norm": 0.2200377253653553,
      "learning_rate": 7.388827069271276e-05,
      "loss": 2.4257,
      "step": 1620
    },
    {
      "epoch": 1.408411012356384,
      "grad_norm": 0.1844230292041018,
      "learning_rate": 7.291198285783602e-05,
      "loss": 2.4135,
      "step": 1625
    },
    {
      "epoch": 1.4127465857359636,
      "grad_norm": 0.1929745133150067,
      "learning_rate": 7.194011264293254e-05,
      "loss": 2.3777,
      "step": 1630
    },
    {
      "epoch": 1.4170821591155431,
      "grad_norm": 0.2915968829982784,
      "learning_rate": 7.097271574217421e-05,
      "loss": 2.4181,
      "step": 1635
    },
    {
      "epoch": 1.4214177324951225,
      "grad_norm": 0.269578671519446,
      "learning_rate": 7.000984759338422e-05,
      "loss": 2.3788,
      "step": 1640
    },
    {
      "epoch": 1.4257533058747018,
      "grad_norm": 0.19512593934978045,
      "learning_rate": 6.905156337486045e-05,
      "loss": 2.391,
      "step": 1645
    },
    {
      "epoch": 1.4300888792542814,
      "grad_norm": 0.2593023962861574,
      "learning_rate": 6.809791800221313e-05,
      "loss": 2.3963,
      "step": 1650
    },
    {
      "epoch": 1.4344244526338608,
      "grad_norm": 0.14901754063689115,
      "learning_rate": 6.714896612521794e-05,
      "loss": 2.3976,
      "step": 1655
    },
    {
      "epoch": 1.4387600260134403,
      "grad_norm": 0.302187906502475,
      "learning_rate": 6.620476212468424e-05,
      "loss": 2.4194,
      "step": 1660
    },
    {
      "epoch": 1.4430955993930197,
      "grad_norm": 0.24180384264466487,
      "learning_rate": 6.526536010933874e-05,
      "loss": 2.4295,
      "step": 1665
    },
    {
      "epoch": 1.4474311727725993,
      "grad_norm": 0.22169225675339646,
      "learning_rate": 6.433081391272467e-05,
      "loss": 2.3976,
      "step": 1670
    },
    {
      "epoch": 1.4517667461521786,
      "grad_norm": 0.2799481789977155,
      "learning_rate": 6.340117709011693e-05,
      "loss": 2.392,
      "step": 1675
    },
    {
      "epoch": 1.456102319531758,
      "grad_norm": 0.28103864837065845,
      "learning_rate": 6.247650291545287e-05,
      "loss": 2.3708,
      "step": 1680
    },
    {
      "epoch": 1.4604378929113375,
      "grad_norm": 0.24593789213337805,
      "learning_rate": 6.155684437827931e-05,
      "loss": 2.4043,
      "step": 1685
    },
    {
      "epoch": 1.464773466290917,
      "grad_norm": 0.19968974812237159,
      "learning_rate": 6.064225418071632e-05,
      "loss": 2.3784,
      "step": 1690
    },
    {
      "epoch": 1.4691090396704964,
      "grad_norm": 0.2665198920246072,
      "learning_rate": 5.9732784734436554e-05,
      "loss": 2.387,
      "step": 1695
    },
    {
      "epoch": 1.4734446130500758,
      "grad_norm": 0.33954513870533093,
      "learning_rate": 5.882848815766189e-05,
      "loss": 2.3659,
      "step": 1700
    },
    {
      "epoch": 1.4777801864296554,
      "grad_norm": 0.3174285529770626,
      "learning_rate": 5.792941627217707e-05,
      "loss": 2.3703,
      "step": 1705
    },
    {
      "epoch": 1.4821157598092347,
      "grad_norm": 0.21514280209891976,
      "learning_rate": 5.703562060035951e-05,
      "loss": 2.3311,
      "step": 1710
    },
    {
      "epoch": 1.4864513331888143,
      "grad_norm": 0.26026170716900454,
      "learning_rate": 5.614715236222702e-05,
      "loss": 2.3534,
      "step": 1715
    },
    {
      "epoch": 1.4907869065683936,
      "grad_norm": 0.2072827021307388,
      "learning_rate": 5.52640624725026e-05,
      "loss": 2.362,
      "step": 1720
    },
    {
      "epoch": 1.4951224799479732,
      "grad_norm": 0.18636459137372047,
      "learning_rate": 5.4386401537696536e-05,
      "loss": 2.367,
      "step": 1725
    },
    {
      "epoch": 1.4994580533275526,
      "grad_norm": 0.2616321440338591,
      "learning_rate": 5.3514219853206464e-05,
      "loss": 2.3517,
      "step": 1730
    },
    {
      "epoch": 1.503793626707132,
      "grad_norm": 0.17660365833901004,
      "learning_rate": 5.264756740043511e-05,
      "loss": 2.3366,
      "step": 1735
    },
    {
      "epoch": 1.5081292000867115,
      "grad_norm": 0.14325753486785228,
      "learning_rate": 5.178649384392603e-05,
      "loss": 2.3628,
      "step": 1740
    },
    {
      "epoch": 1.512464773466291,
      "grad_norm": 0.16306886697377787,
      "learning_rate": 5.093104852851749e-05,
      "loss": 2.3403,
      "step": 1745
    },
    {
      "epoch": 1.5168003468458704,
      "grad_norm": 0.17105852766007526,
      "learning_rate": 5.008128047651488e-05,
      "loss": 2.3193,
      "step": 1750
    },
    {
      "epoch": 1.5211359202254497,
      "grad_norm": 0.20749830784985143,
      "learning_rate": 4.923723838488117e-05,
      "loss": 2.3519,
      "step": 1755
    },
    {
      "epoch": 1.5254714936050293,
      "grad_norm": 0.3157660095405772,
      "learning_rate": 4.839897062244638e-05,
      "loss": 2.3197,
      "step": 1760
    },
    {
      "epoch": 1.5298070669846087,
      "grad_norm": 0.18219178823449922,
      "learning_rate": 4.756652522713599e-05,
      "loss": 2.3279,
      "step": 1765
    },
    {
      "epoch": 1.534142640364188,
      "grad_norm": 0.12524649727104128,
      "learning_rate": 4.673994990321752e-05,
      "loss": 2.3019,
      "step": 1770
    },
    {
      "epoch": 1.5384782137437676,
      "grad_norm": 0.1636956452865002,
      "learning_rate": 4.591929201856727e-05,
      "loss": 2.2859,
      "step": 1775
    },
    {
      "epoch": 1.5428137871233472,
      "grad_norm": 0.21967450962811522,
      "learning_rate": 4.5104598601955805e-05,
      "loss": 2.3095,
      "step": 1780
    },
    {
      "epoch": 1.5471493605029265,
      "grad_norm": 0.19863296073280773,
      "learning_rate": 4.4295916340352625e-05,
      "loss": 2.2826,
      "step": 1785
    },
    {
      "epoch": 1.5514849338825059,
      "grad_norm": 0.15602204927659435,
      "learning_rate": 4.349329157625088e-05,
      "loss": 2.3522,
      "step": 1790
    },
    {
      "epoch": 1.5558205072620854,
      "grad_norm": 0.15284396668890557,
      "learning_rate": 4.269677030501184e-05,
      "loss": 2.3546,
      "step": 1795
    },
    {
      "epoch": 1.560156080641665,
      "grad_norm": 0.16000621000672735,
      "learning_rate": 4.1906398172228704e-05,
      "loss": 2.3456,
      "step": 1800
    },
    {
      "epoch": 1.5644916540212443,
      "grad_norm": 0.5401953444913549,
      "learning_rate": 4.112222047111111e-05,
      "loss": 2.3475,
      "step": 1805
    },
    {
      "epoch": 1.5688272274008237,
      "grad_norm": 0.18490516183802747,
      "learning_rate": 4.034428213988946e-05,
      "loss": 2.3064,
      "step": 1810
    },
    {
      "epoch": 1.5731628007804033,
      "grad_norm": 0.2648705938773556,
      "learning_rate": 3.957262775923969e-05,
      "loss": 2.3087,
      "step": 1815
    },
    {
      "epoch": 1.5774983741599826,
      "grad_norm": 0.20317358607574193,
      "learning_rate": 3.8807301549728435e-05,
      "loss": 2.292,
      "step": 1820
    },
    {
      "epoch": 1.581833947539562,
      "grad_norm": 0.18781915869958946,
      "learning_rate": 3.804834736927918e-05,
      "loss": 2.3321,
      "step": 1825
    },
{ |
|
"epoch": 1.5861695209191415, |
|
"grad_norm": 0.22963664052911378, |
|
"learning_rate": 3.7295808710658594e-05, |
|
"loss": 2.3105, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.5905050942987211, |
|
"grad_norm": 0.16547609467569466, |
|
"learning_rate": 3.654972869898435e-05, |
|
"loss": 2.3441, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.5948406676783005, |
|
"grad_norm": 0.15450897850919312, |
|
"learning_rate": 3.581015008925367e-05, |
|
"loss": 2.2963, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.5991762410578798, |
|
"grad_norm": 0.2368783078996937, |
|
"learning_rate": 3.507711526389331e-05, |
|
"loss": 2.2701, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.6035118144374594, |
|
"grad_norm": 0.23275669114568637, |
|
"learning_rate": 3.4350666230330684e-05, |
|
"loss": 2.3027, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.607847387817039, |
|
"grad_norm": 0.23123161162094558, |
|
"learning_rate": 3.363084461858659e-05, |
|
"loss": 2.3271, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.6121829611966183, |
|
"grad_norm": 0.19114205349654592, |
|
"learning_rate": 3.291769167888971e-05, |
|
"loss": 2.3085, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.6165185345761977, |
|
"grad_norm": 0.19601739480919383, |
|
"learning_rate": 3.221124827931248e-05, |
|
"loss": 2.297, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.6208541079557772, |
|
"grad_norm": 0.14221216745928258, |
|
"learning_rate": 3.151155490342917e-05, |
|
"loss": 2.2855, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.6251896813353566, |
|
"grad_norm": 0.1584865862659709, |
|
"learning_rate": 3.081865164799613e-05, |
|
"loss": 2.2614, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.629525254714936, |
|
"grad_norm": 0.1414839158508025, |
|
"learning_rate": 3.0132578220653648e-05, |
|
"loss": 2.2795, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.6338608280945155, |
|
"grad_norm": 0.1254646077472122, |
|
"learning_rate": 2.9453373937650664e-05, |
|
"loss": 2.2965, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.638196401474095, |
|
"grad_norm": 0.16882648715438078, |
|
"learning_rate": 2.8781077721591828e-05, |
|
"loss": 2.3278, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.6425319748536744, |
|
"grad_norm": 0.18395517918844081, |
|
"learning_rate": 2.811572809920669e-05, |
|
"loss": 2.2801, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.6468675482332538, |
|
"grad_norm": 0.1309521536420322, |
|
"learning_rate": 2.7457363199142062e-05, |
|
"loss": 2.2852, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.6512031216128333, |
|
"grad_norm": 0.12473429715301326, |
|
"learning_rate": 2.680602074977708e-05, |
|
"loss": 2.259, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.655538694992413, |
|
"grad_norm": 0.1312047736460762, |
|
"learning_rate": 2.6161738077060924e-05, |
|
"loss": 2.2868, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.659874268371992, |
|
"grad_norm": 0.14567444941618224, |
|
"learning_rate": 2.552455210237398e-05, |
|
"loss": 2.2633, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.6642098417515716, |
|
"grad_norm": 0.13883443962415717, |
|
"learning_rate": 2.4894499340411968e-05, |
|
"loss": 2.2541, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.6685454151311512, |
|
"grad_norm": 0.15369994803840648, |
|
"learning_rate": 2.427161589709337e-05, |
|
"loss": 2.2996, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.6728809885107305, |
|
"grad_norm": 0.21447874396163935, |
|
"learning_rate": 2.365593746749041e-05, |
|
"loss": 2.2679, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.6772165618903099, |
|
"grad_norm": 0.18513858979140632, |
|
"learning_rate": 2.3047499333783558e-05, |
|
"loss": 2.2658, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.6815521352698894, |
|
"grad_norm": 0.18391090938011884, |
|
"learning_rate": 2.244633636323946e-05, |
|
"loss": 2.2907, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.685887708649469, |
|
"grad_norm": 0.16891631837991977, |
|
"learning_rate": 2.1852483006212978e-05, |
|
"loss": 2.2478, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.6902232820290484, |
|
"grad_norm": 0.1518091020243819, |
|
"learning_rate": 2.126597329417293e-05, |
|
"loss": 2.2473, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.6945588554086277, |
|
"grad_norm": 0.17161647974253239, |
|
"learning_rate": 2.068684083775185e-05, |
|
"loss": 2.2537, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.6988944287882073, |
|
"grad_norm": 0.14187218785526223, |
|
"learning_rate": 2.0115118824819914e-05, |
|
"loss": 2.2616, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.7032300021677866, |
|
"grad_norm": 0.11929928350263867, |
|
"learning_rate": 1.9550840018583153e-05, |
|
"loss": 2.2694, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.707565575547366, |
|
"grad_norm": 0.1245533165549466, |
|
"learning_rate": 1.899403675570576e-05, |
|
"loss": 2.2595, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.7119011489269456, |
|
"grad_norm": 0.11773193996812116, |
|
"learning_rate": 1.844474094445705e-05, |
|
"loss": 2.2604, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.7162367223065251, |
|
"grad_norm": 0.1419568607634185, |
|
"learning_rate": 1.7902984062883053e-05, |
|
"loss": 2.2311, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.7205722956861045, |
|
"grad_norm": 0.16983875746382313, |
|
"learning_rate": 1.736879715700243e-05, |
|
"loss": 2.2403, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.7249078690656838, |
|
"grad_norm": 0.12131945112331936, |
|
"learning_rate": 1.684221083902746e-05, |
|
"loss": 2.2474, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.7292434424452634, |
|
"grad_norm": 0.44071395908424116, |
|
"learning_rate": 1.6323255285609722e-05, |
|
"loss": 2.2337, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.733579015824843, |
|
"grad_norm": 0.23334817695713936, |
|
"learning_rate": 1.5811960236110855e-05, |
|
"loss": 2.2489, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.7379145892044223, |
|
"grad_norm": 0.2111669535853138, |
|
"learning_rate": 1.530835499089821e-05, |
|
"loss": 2.2269, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.7422501625840017, |
|
"grad_norm": 0.18179903995116767, |
|
"learning_rate": 1.4812468409665884e-05, |
|
"loss": 2.2706, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.7465857359635812, |
|
"grad_norm": 0.12994276828633272, |
|
"learning_rate": 1.432432890978074e-05, |
|
"loss": 2.2688, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.7509213093431606, |
|
"grad_norm": 0.20415942563901335, |
|
"learning_rate": 1.3843964464654018e-05, |
|
"loss": 2.2725, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.75525688272274, |
|
"grad_norm": 0.13423972154083932, |
|
"learning_rate": 1.3371402602138242e-05, |
|
"loss": 2.2614, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.7595924561023195, |
|
"grad_norm": 0.21592839787669263, |
|
"learning_rate": 1.2906670402949703e-05, |
|
"loss": 2.2278, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.763928029481899, |
|
"grad_norm": 0.1195430112602107, |
|
"learning_rate": 1.2449794499116567e-05, |
|
"loss": 2.2434, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.7682636028614784, |
|
"grad_norm": 0.1200861082736604, |
|
"learning_rate": 1.200080107245278e-05, |
|
"loss": 2.2547, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.7725991762410578, |
|
"grad_norm": 0.11672129617620956, |
|
"learning_rate": 1.1559715853057516e-05, |
|
"loss": 2.2196, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.7769347496206374, |
|
"grad_norm": 0.11729537916907379, |
|
"learning_rate": 1.1126564117840819e-05, |
|
"loss": 2.2613, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.781270323000217, |
|
"grad_norm": 0.10450327625927365, |
|
"learning_rate": 1.0701370689075094e-05, |
|
"loss": 2.244, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.7856058963797963, |
|
"grad_norm": 0.13649256070362986, |
|
"learning_rate": 1.0284159932972524e-05, |
|
"loss": 2.2222, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.7899414697593756, |
|
"grad_norm": 0.11456259112458032, |
|
"learning_rate": 9.87495575828875e-06, |
|
"loss": 2.2401, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.7942770431389552, |
|
"grad_norm": 0.12074570004540981, |
|
"learning_rate": 9.473781614952918e-06, |
|
"loss": 2.2401, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.7986126165185345, |
|
"grad_norm": 0.12686179535370362, |
|
"learning_rate": 9.080660492723663e-06, |
|
"loss": 2.2295, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.802948189898114, |
|
"grad_norm": 0.1300450179470363, |
|
"learning_rate": 8.695614919871679e-06, |
|
"loss": 2.2569, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.8072837632776935, |
|
"grad_norm": 0.11937907719387332, |
|
"learning_rate": 8.31866696188887e-06, |
|
"loss": 2.2294, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.811619336657273, |
|
"grad_norm": 0.11764431526486717, |
|
"learning_rate": 7.949838220223664e-06, |
|
"loss": 2.217, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.8159549100368524, |
|
"grad_norm": 0.1234907525541496, |
|
"learning_rate": 7.589149831043212e-06, |
|
"loss": 2.217, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.8202904834164317, |
|
"grad_norm": 0.1517422441513512, |
|
"learning_rate": 7.236622464022151e-06, |
|
"loss": 2.2453, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.8246260567960113, |
|
"grad_norm": 0.33341778139897915, |
|
"learning_rate": 6.892276321158058e-06, |
|
"loss": 2.2356, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.8289616301755909, |
|
"grad_norm": 0.11642132397770094, |
|
"learning_rate": 6.556131135613818e-06, |
|
"loss": 2.2423, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.83329720355517, |
|
"grad_norm": 0.1407587899536127, |
|
"learning_rate": 6.2282061705868025e-06, |
|
"loss": 2.203, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.8376327769347496, |
|
"grad_norm": 0.13035967664280043, |
|
"learning_rate": 5.908520218204832e-06, |
|
"loss": 2.1993, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.8419683503143292, |
|
"grad_norm": 0.09453247400951749, |
|
"learning_rate": 5.597091598449438e-06, |
|
"loss": 2.228, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.8463039236939085, |
|
"grad_norm": 0.12255843625223094, |
|
"learning_rate": 5.293938158105904e-06, |
|
"loss": 2.2373, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.8506394970734878, |
|
"grad_norm": 0.10290208504992138, |
|
"learning_rate": 4.999077269740581e-06, |
|
"loss": 2.1896, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.8549750704530674, |
|
"grad_norm": 0.10229541966122209, |
|
"learning_rate": 4.712525830705338e-06, |
|
"loss": 2.2202, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.859310643832647, |
|
"grad_norm": 0.1058407353264288, |
|
"learning_rate": 4.4343002621692155e-06, |
|
"loss": 2.2105, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.8636462172122263, |
|
"grad_norm": 0.12091080320995722, |
|
"learning_rate": 4.164416508177398e-06, |
|
"loss": 2.2192, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.8679817905918057, |
|
"grad_norm": 0.11922564089206397, |
|
"learning_rate": 3.902890034737527e-06, |
|
"loss": 2.2558, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.8723173639713853, |
|
"grad_norm": 0.09787488554546948, |
|
"learning_rate": 3.649735828933409e-06, |
|
"loss": 2.1973, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.8766529373509646, |
|
"grad_norm": 0.12237194203335397, |
|
"learning_rate": 3.4049683980661214e-06, |
|
"loss": 2.2213, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.880988510730544, |
|
"grad_norm": 0.09606403673967055, |
|
"learning_rate": 3.168601768822726e-06, |
|
"loss": 2.1992, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.8853240841101235, |
|
"grad_norm": 0.09953286517322664, |
|
"learning_rate": 2.940649486472396e-06, |
|
"loss": 2.2528, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.889659657489703, |
|
"grad_norm": 0.10835068342395951, |
|
"learning_rate": 2.72112461409022e-06, |
|
"loss": 2.2531, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.8939952308692825, |
|
"grad_norm": 0.10266855095126044, |
|
"learning_rate": 2.510039731808533e-06, |
|
"loss": 2.269, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.8983308042488618, |
|
"grad_norm": 0.10341900472056524, |
|
"learning_rate": 2.3074069360961623e-06, |
|
"loss": 2.2062, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.9026663776284414, |
|
"grad_norm": 0.10842551877471102, |
|
"learning_rate": 2.1132378390650463e-06, |
|
"loss": 2.2534, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.907001951008021, |
|
"grad_norm": 0.10059763675561238, |
|
"learning_rate": 1.9275435678048845e-06, |
|
"loss": 2.2473, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.9113375243876003, |
|
"grad_norm": 0.10768116633333848, |
|
"learning_rate": 1.7503347637454479e-06, |
|
"loss": 2.2552, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.9156730977671796, |
|
"grad_norm": 0.09414809031667026, |
|
"learning_rate": 1.5816215820467992e-06, |
|
"loss": 2.2367, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.9200086711467592, |
|
"grad_norm": 0.10690488271519041, |
|
"learning_rate": 1.4214136910172925e-06, |
|
"loss": 2.2253, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.9243442445263386, |
|
"grad_norm": 0.10872884844513397, |
|
"learning_rate": 1.2697202715595822e-06, |
|
"loss": 2.2289, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.928679817905918, |
|
"grad_norm": 0.1254620474939429, |
|
"learning_rate": 1.126550016644412e-06, |
|
"loss": 2.2164, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.9330153912854975, |
|
"grad_norm": 0.1025587696285709, |
|
"learning_rate": 9.919111308125449e-07, |
|
"loss": 2.2039, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.937350964665077, |
|
"grad_norm": 0.09872117993630683, |
|
"learning_rate": 8.65811329704541e-07, |
|
"loss": 2.2492, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.9416865380446564, |
|
"grad_norm": 0.1085639607949179, |
|
"learning_rate": 7.482578396185934e-07, |
|
"loss": 2.2449, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.9460221114242358, |
|
"grad_norm": 0.09973275488637519, |
|
"learning_rate": 6.392573970964432e-07, |
|
"loss": 2.2074, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.9503576848038153, |
|
"grad_norm": 0.11503330375288269, |
|
"learning_rate": 5.388162485373548e-07, |
|
"loss": 2.2473, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.954693258183395, |
|
"grad_norm": 0.11928896227233268, |
|
"learning_rate": 4.4694014984010264e-07, |
|
"loss": 2.2128, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.9590288315629742, |
|
"grad_norm": 0.08668925207942389, |
|
"learning_rate": 3.6363436607313446e-07, |
|
"loss": 2.2183, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.9633644049425536, |
|
"grad_norm": 0.0953840596497321, |
|
"learning_rate": 2.889036711729298e-07, |
|
"loss": 2.2397, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.9676999783221332, |
|
"grad_norm": 0.10418411546023475, |
|
"learning_rate": 2.2275234767030193e-07, |
|
"loss": 2.2146, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.9720355517017125, |
|
"grad_norm": 0.09712806974011044, |
|
"learning_rate": 1.6518418644507758e-07, |
|
"loss": 2.2166, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.9763711250812919, |
|
"grad_norm": 0.12331447239619826, |
|
"learning_rate": 1.1620248650878739e-07, |
|
"loss": 2.2371, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.9807066984608714, |
|
"grad_norm": 0.09998286807181965, |
|
"learning_rate": 7.581005481566704e-08, |
|
"loss": 2.2271, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.985042271840451, |
|
"grad_norm": 0.09866409081999304, |
|
"learning_rate": 4.4009206101786043e-08, |
|
"loss": 2.2148, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.9893778452200304, |
|
"grad_norm": 0.10987887336557695, |
|
"learning_rate": 2.0801762752387097e-08, |
|
"loss": 2.2046, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.9937134185996097, |
|
"grad_norm": 0.10003495023127681, |
|
"learning_rate": 6.189054697436357e-09, |
|
"loss": 2.1954, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.9980489919791893, |
|
"grad_norm": 0.09752064885234579, |
|
"learning_rate": 1.7191933545102067e-10, |
|
"loss": 2.2409, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.998916106655105, |
|
"step": 2306, |
|
"total_flos": 1.542232840692197e+19, |
|
"train_loss": 3.6333630210094006, |
|
"train_runtime": 27233.3224, |
|
"train_samples_per_second": 2.71, |
|
"train_steps_per_second": 0.085 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2306, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.542232840692197e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|