|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.9963586709148835, |
|
"eval_steps": 500, |
|
"global_step": 4392, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018206645425580335, |
|
"grad_norm": 15.35500484249196, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0839, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03641329085116067, |
|
"grad_norm": 2.618926418245599, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9627, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05461993627674101, |
|
"grad_norm": 2.080636783209041, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9308, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07282658170232134, |
|
"grad_norm": 1.436134442652753, |
|
"learning_rate": 5e-06, |
|
"loss": 0.897, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09103322712790168, |
|
"grad_norm": 1.581598449271548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8783, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10923987255348203, |
|
"grad_norm": 1.3764881165967209, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8594, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12744651797906237, |
|
"grad_norm": 0.8508264971874785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8476, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14565316340464268, |
|
"grad_norm": 0.9826078435096594, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8351, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16385980883022302, |
|
"grad_norm": 0.7948332220104163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8261, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18206645425580337, |
|
"grad_norm": 0.7335767518254585, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8185, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2002730996813837, |
|
"grad_norm": 1.0534204936405038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8204, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21847974510696405, |
|
"grad_norm": 0.847321277840478, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8108, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.6210895897293146, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8084, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25489303595812474, |
|
"grad_norm": 0.5781940572538745, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8103, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27309968138370505, |
|
"grad_norm": 0.9063984151725509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8077, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.29130632680928537, |
|
"grad_norm": 0.6571941221537146, |
|
"learning_rate": 5e-06, |
|
"loss": 0.799, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.30951297223486574, |
|
"grad_norm": 0.5952826448438556, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7999, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32771961766044605, |
|
"grad_norm": 1.4601461457218337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8004, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3459262630860264, |
|
"grad_norm": 1.3162465370008012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7973, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36413290851160673, |
|
"grad_norm": 1.2972378846813408, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7959, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.38233955393718705, |
|
"grad_norm": 0.9152791184028909, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7892, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4005461993627674, |
|
"grad_norm": 0.823555803312372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7932, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41875284478834773, |
|
"grad_norm": 0.7145315436546384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7848, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4369594902139281, |
|
"grad_norm": 0.5776875953876701, |
|
"learning_rate": 5e-06, |
|
"loss": 0.791, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4551661356395084, |
|
"grad_norm": 0.6846063811362885, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7875, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.7232578149621245, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7843, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4915794264906691, |
|
"grad_norm": 0.6962364276167124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7839, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5097860719162495, |
|
"grad_norm": 0.8881236192032435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.779, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5279927173418297, |
|
"grad_norm": 0.6192460064199938, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7821, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5461993627674101, |
|
"grad_norm": 0.7429540921229376, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7771, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5644060081929905, |
|
"grad_norm": 0.5678182946639223, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7837, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5826126536185707, |
|
"grad_norm": 0.7927883549153515, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7795, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6008192990441511, |
|
"grad_norm": 0.6630873675584549, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7788, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6190259444697315, |
|
"grad_norm": 0.8821281227865143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7817, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6372325898953118, |
|
"grad_norm": 0.5862391322505137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7818, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6554392353208921, |
|
"grad_norm": 0.7933124674797363, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7714, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6736458807464725, |
|
"grad_norm": 0.5912540694609474, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7732, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6918525261720528, |
|
"grad_norm": 0.5222298817170612, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7697, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 0.6154935564611252, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7716, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7282658170232135, |
|
"grad_norm": 0.7305900161607543, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7708, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7464724624487938, |
|
"grad_norm": 0.7798387910594561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7724, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7646791078743741, |
|
"grad_norm": 1.100346410427369, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7716, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7828857532999545, |
|
"grad_norm": 0.657718484418182, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7666, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8010923987255348, |
|
"grad_norm": 0.6103104885939142, |
|
"learning_rate": 5e-06, |
|
"loss": 0.765, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8192990441511152, |
|
"grad_norm": 0.6884438533501283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7699, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8375056895766955, |
|
"grad_norm": 0.6270481654771743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.767, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8557123350022758, |
|
"grad_norm": 0.5861612234613632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7645, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8739189804278562, |
|
"grad_norm": 0.586876291200088, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7686, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8921256258534365, |
|
"grad_norm": 0.7466830111479166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7688, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9103322712790168, |
|
"grad_norm": 0.6792188281050604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7685, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9285389167045972, |
|
"grad_norm": 0.6846050616906328, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7616, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 0.7324962552104101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.762, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9649522075557578, |
|
"grad_norm": 0.6893904439545421, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7664, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9831588529813382, |
|
"grad_norm": 0.6529796330088168, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7631, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9995448338643604, |
|
"eval_loss": 0.7638739347457886, |
|
"eval_runtime": 379.9866, |
|
"eval_samples_per_second": 38.951, |
|
"eval_steps_per_second": 0.611, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.0013654984069185, |
|
"grad_norm": 1.2061555179450747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.812, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.019572143832499, |
|
"grad_norm": 0.8129440249983992, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7222, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0377787892580792, |
|
"grad_norm": 0.6593013104808815, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7205, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.0559854346836595, |
|
"grad_norm": 0.6107941470930411, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7205, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.07419208010924, |
|
"grad_norm": 0.8276670919993867, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7144, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.0923987255348202, |
|
"grad_norm": 0.6253185542660747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7136, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1106053709604005, |
|
"grad_norm": 0.6108900775704884, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7189, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.128812016385981, |
|
"grad_norm": 0.8988785705551109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7122, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1470186618115612, |
|
"grad_norm": 0.616453203309773, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7138, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1652253072371415, |
|
"grad_norm": 0.8816478964333492, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7174, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 0.6581015672880766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7183, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2016385980883022, |
|
"grad_norm": 0.6253002121195789, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7169, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2198452435138827, |
|
"grad_norm": 0.7483871477416271, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7136, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.238051888939463, |
|
"grad_norm": 0.6941445361796307, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7114, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.2562585343650432, |
|
"grad_norm": 0.6143667364149711, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7185, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2744651797906235, |
|
"grad_norm": 0.6592963882014911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7166, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.292671825216204, |
|
"grad_norm": 0.6189597729803721, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7158, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3108784706417842, |
|
"grad_norm": 0.6920980650430207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7144, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3290851160673647, |
|
"grad_norm": 0.49530512173626284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.718, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.347291761492945, |
|
"grad_norm": 0.8064053486625922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7131, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.3654984069185252, |
|
"grad_norm": 0.5821998941102782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7136, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3837050523441057, |
|
"grad_norm": 0.5735380427187717, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7154, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.401911697769686, |
|
"grad_norm": 0.5961452096313207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7148, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.4201183431952662, |
|
"grad_norm": 0.6364200555389256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7152, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4383249886208467, |
|
"grad_norm": 0.618866609217087, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7139, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.456531634046427, |
|
"grad_norm": 0.5153453443480923, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7169, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4747382794720072, |
|
"grad_norm": 0.5921562610346923, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7157, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.4929449248975877, |
|
"grad_norm": 0.6864315768684789, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7186, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.511151570323168, |
|
"grad_norm": 0.6718756873067311, |
|
"learning_rate": 5e-06, |
|
"loss": 0.713, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.5293582157487484, |
|
"grad_norm": 0.5292103286821733, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7121, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5475648611743287, |
|
"grad_norm": 0.5807337157950755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7161, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.565771506599909, |
|
"grad_norm": 0.5725693174193143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7144, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.5839781520254892, |
|
"grad_norm": 0.6693290134206946, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7137, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6021847974510697, |
|
"grad_norm": 0.5960183936149324, |
|
"learning_rate": 5e-06, |
|
"loss": 0.718, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.62039144287665, |
|
"grad_norm": 0.6717348604009659, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7107, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6385980883022304, |
|
"grad_norm": 0.6147573815012755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7101, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6568047337278107, |
|
"grad_norm": 0.5451936467703803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7128, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.675011379153391, |
|
"grad_norm": 0.7294513986677604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7104, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.6932180245789712, |
|
"grad_norm": 0.6756274493055726, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7115, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.7114246700045517, |
|
"grad_norm": 0.6503407532587065, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7105, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.7296313154301322, |
|
"grad_norm": 0.5369254794383076, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7116, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7478379608557124, |
|
"grad_norm": 0.6658976879506388, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7108, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7660446062812927, |
|
"grad_norm": 0.5443862825931207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7107, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.784251251706873, |
|
"grad_norm": 0.6571290419201206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7118, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8024578971324532, |
|
"grad_norm": 0.543057907363159, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7109, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.8206645425580337, |
|
"grad_norm": 0.5863888559339765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7128, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8388711879836142, |
|
"grad_norm": 0.7512763814183313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7129, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8570778334091944, |
|
"grad_norm": 0.6875839517913319, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7163, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.8752844788347747, |
|
"grad_norm": 0.5392039305234562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.712, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.893491124260355, |
|
"grad_norm": 0.5715846033792595, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7106, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9116977696859354, |
|
"grad_norm": 0.554967637717086, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7135, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.9299044151115157, |
|
"grad_norm": 0.6064174196129051, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7113, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9481110605370962, |
|
"grad_norm": 0.57876695036443, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7133, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.9663177059626764, |
|
"grad_norm": 0.6168643227515411, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7107, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.9845243513882567, |
|
"grad_norm": 0.5918567965220687, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7148, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.9990896677287209, |
|
"eval_loss": 0.7494105696678162, |
|
"eval_runtime": 371.303, |
|
"eval_samples_per_second": 39.862, |
|
"eval_steps_per_second": 0.625, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 2.002730996813837, |
|
"grad_norm": 0.7112677344136977, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7534, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.020937642239417, |
|
"grad_norm": 0.7657125714853397, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6594, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.039144287664998, |
|
"grad_norm": 0.7269679004826854, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6665, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.057350933090578, |
|
"grad_norm": 0.5908930298865153, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6641, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.0755575785161584, |
|
"grad_norm": 0.6685407405530368, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6611, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.0937642239417387, |
|
"grad_norm": 0.6835975141817704, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6631, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.111970869367319, |
|
"grad_norm": 0.6561709037357517, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6631, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.1301775147928996, |
|
"grad_norm": 0.9447854836936921, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6641, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.14838416021848, |
|
"grad_norm": 0.6554564933004847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6644, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.16659080564406, |
|
"grad_norm": 1.0127997383891072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6711, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.1847974510696404, |
|
"grad_norm": 0.7206567370516284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6668, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2030040964952207, |
|
"grad_norm": 0.655937928590562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6717, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.221210741920801, |
|
"grad_norm": 0.8360483477674778, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6689, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.2394173873463816, |
|
"grad_norm": 0.6092424114015346, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6628, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.257624032771962, |
|
"grad_norm": 0.6406882124202334, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6651, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.275830678197542, |
|
"grad_norm": 0.7614678401383996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6677, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.2940373236231224, |
|
"grad_norm": 0.7284934278440872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6649, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.3122439690487027, |
|
"grad_norm": 0.6736018159497253, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6661, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.330450614474283, |
|
"grad_norm": 0.682991318579073, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6706, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.3486572598998636, |
|
"grad_norm": 0.6067823760231827, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6642, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.366863905325444, |
|
"grad_norm": 0.6016632433423775, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6675, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.385070550751024, |
|
"grad_norm": 0.6784572170158043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6658, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.4032771961766044, |
|
"grad_norm": 0.6804813510854688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6693, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.4214838416021847, |
|
"grad_norm": 0.6667510974726526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6667, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.4396904870277654, |
|
"grad_norm": 0.5712701093393895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6693, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.4578971324533456, |
|
"grad_norm": 0.5495878729178928, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6691, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.476103777878926, |
|
"grad_norm": 0.6863248077276337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6697, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.494310423304506, |
|
"grad_norm": 0.6281725692693549, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6659, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.5125170687300864, |
|
"grad_norm": 0.584114706811986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6689, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.5307237141556667, |
|
"grad_norm": 0.8069687501187531, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6713, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.548930359581247, |
|
"grad_norm": 0.5837408246005996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6702, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.5671370050068276, |
|
"grad_norm": 0.6602631718487662, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6729, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.585343650432408, |
|
"grad_norm": 0.5751520967539265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6727, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.603550295857988, |
|
"grad_norm": 0.5753704561331634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6638, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.6217569412835684, |
|
"grad_norm": 0.5947582650600889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6687, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.6399635867091487, |
|
"grad_norm": 0.6421455226025955, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6673, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.6581702321347294, |
|
"grad_norm": 0.625971391993565, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6723, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.6763768775603096, |
|
"grad_norm": 0.6230286833589651, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6697, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.69458352298589, |
|
"grad_norm": 0.6704340829719386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6726, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.71279016841147, |
|
"grad_norm": 0.7115047592117129, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6697, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.7309968138370504, |
|
"grad_norm": 0.5348550201712026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6713, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.749203459262631, |
|
"grad_norm": 0.6018887550964896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6713, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.7674101046882114, |
|
"grad_norm": 0.5829651029728877, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6711, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.7856167501137916, |
|
"grad_norm": 0.7257452058424095, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6679, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.803823395539372, |
|
"grad_norm": 0.7324462033376344, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6705, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.822030040964952, |
|
"grad_norm": 0.6518477868742856, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6714, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.8402366863905324, |
|
"grad_norm": 0.5915504419734868, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6678, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.8584433318161127, |
|
"grad_norm": 0.5596845389686367, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6738, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.8766499772416934, |
|
"grad_norm": 0.6211271364874157, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6682, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.8948566226672736, |
|
"grad_norm": 0.6029083750419233, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6689, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.913063268092854, |
|
"grad_norm": 0.6997154043188799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6654, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.931269913518434, |
|
"grad_norm": 0.6561810265940334, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6648, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.9494765589440144, |
|
"grad_norm": 0.5949195869956836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6674, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.967683204369595, |
|
"grad_norm": 0.6572460988263176, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6748, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.9858898497951754, |
|
"grad_norm": 0.6466132865038215, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6707, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.9986345015930813, |
|
"eval_loss": 0.7486923933029175, |
|
"eval_runtime": 371.6394, |
|
"eval_samples_per_second": 39.826, |
|
"eval_steps_per_second": 0.624, |
|
"step": 1647 |
|
}, |
|
{ |
|
"epoch": 3.0040964952207556, |
|
"grad_norm": 0.9066558799698319, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7123, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.022303140646336, |
|
"grad_norm": 0.6985831692717679, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6192, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.040509786071916, |
|
"grad_norm": 0.7297757031418619, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6192, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.0587164314974964, |
|
"grad_norm": 0.7384505423128972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6218, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.7273583338941372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6211, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.0951297223486574, |
|
"grad_norm": 0.7471474278740236, |
|
"learning_rate": 5e-06, |
|
"loss": 0.621, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.1133363677742376, |
|
"grad_norm": 0.6193936409655777, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6201, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.131543013199818, |
|
"grad_norm": 0.6710258448973723, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6224, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.149749658625398, |
|
"grad_norm": 0.7409460472770332, |
|
"learning_rate": 5e-06, |
|
"loss": 0.625, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.1679563040509784, |
|
"grad_norm": 0.709501536492518, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6204, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.186162949476559, |
|
"grad_norm": 0.7649819943694469, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6163, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.2043695949021394, |
|
"grad_norm": 0.7173580779194709, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6234, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.2225762403277196, |
|
"grad_norm": 0.6457493603144049, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6213, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.2407828857533, |
|
"grad_norm": 0.6074440281419403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6229, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.25898953117888, |
|
"grad_norm": 0.7025871444184026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.621, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.277196176604461, |
|
"grad_norm": 0.7659855454008233, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6244, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.295402822030041, |
|
"grad_norm": 0.6532860177479579, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6229, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.3136094674556213, |
|
"grad_norm": 0.5432923070386573, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6226, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.3318161128812016, |
|
"grad_norm": 0.6927506071213173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6241, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.350022758306782, |
|
"grad_norm": 0.6319912049544428, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6234, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.368229403732362, |
|
"grad_norm": 0.6730493262615627, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6239, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.386436049157943, |
|
"grad_norm": 0.624938299600225, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6225, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.404642694583523, |
|
"grad_norm": 0.623354310700811, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6255, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.4228493400091033, |
|
"grad_norm": 0.6329249624990303, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6253, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.4410559854346836, |
|
"grad_norm": 0.691830816858052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6313, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.459262630860264, |
|
"grad_norm": 0.6032142068639882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6279, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.477469276285844, |
|
"grad_norm": 0.6620277244742816, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6279, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.495675921711425, |
|
"grad_norm": 0.6334218656408422, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6299, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.513882567137005, |
|
"grad_norm": 0.7375169169002789, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6281, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.5320892125625853, |
|
"grad_norm": 0.6448512236031347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6275, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.5502958579881656, |
|
"grad_norm": 0.5665149421787481, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6299, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.568502503413746, |
|
"grad_norm": 0.5999159680891181, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6286, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.5867091488393266, |
|
"grad_norm": 0.6281107227517486, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6248, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.604915794264907, |
|
"grad_norm": 0.6353135942879086, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6281, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.623122439690487, |
|
"grad_norm": 0.6193631021536377, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6278, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.6413290851160673, |
|
"grad_norm": 0.5918570154763652, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6296, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.6595357305416476, |
|
"grad_norm": 0.6010286514232586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6312, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.6777423759672283, |
|
"grad_norm": 0.556597304136066, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6282, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.695949021392808, |
|
"grad_norm": 0.7271994139802581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6282, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.714155666818389, |
|
"grad_norm": 0.6169419004473696, |
|
"learning_rate": 5e-06, |
|
"loss": 0.633, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.732362312243969, |
|
"grad_norm": 0.6025566845682745, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6299, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.7505689576695493, |
|
"grad_norm": 0.5736503199358731, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6307, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.7687756030951296, |
|
"grad_norm": 0.6112656378611477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6314, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.78698224852071, |
|
"grad_norm": 0.683407900900997, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6292, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.8051888939462906, |
|
"grad_norm": 0.5823312600994301, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6302, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.823395539371871, |
|
"grad_norm": 0.7722774266919119, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6329, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.841602184797451, |
|
"grad_norm": 0.5962312514475476, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6336, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.8598088302230313, |
|
"grad_norm": 0.5704378657470786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6343, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.8780154756486116, |
|
"grad_norm": 0.6221717567681257, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6323, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.8962221210741923, |
|
"grad_norm": 0.8715901410739767, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6289, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.9144287664997726, |
|
"grad_norm": 0.7264743862880063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6319, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.932635411925353, |
|
"grad_norm": 0.5646894741460851, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6326, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.950842057350933, |
|
"grad_norm": 0.6303177348076553, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6347, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.9690487027765133, |
|
"grad_norm": 0.5882842156900018, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6255, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.987255348202094, |
|
"grad_norm": 0.7637770706292029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6324, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7583181262016296, |
|
"eval_runtime": 379.7995, |
|
"eval_samples_per_second": 38.971, |
|
"eval_steps_per_second": 0.611, |
|
"step": 2197 |
|
}, |
|
{ |
|
"epoch": 4.005461993627674, |
|
"grad_norm": 1.3302589404721668, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6642, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.023668639053255, |
|
"grad_norm": 1.0479601737495694, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5747, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.041875284478834, |
|
"grad_norm": 0.8236224363789262, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5739, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.060081929904415, |
|
"grad_norm": 0.6514312376533014, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5717, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.078288575329996, |
|
"grad_norm": 0.6470033460056318, |
|
"learning_rate": 5e-06, |
|
"loss": 0.577, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.096495220755576, |
|
"grad_norm": 0.6842183198102436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5772, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.114701866181156, |
|
"grad_norm": 0.7773595484089195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5787, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.132908511606736, |
|
"grad_norm": 0.8512328139098053, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5782, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.151115157032317, |
|
"grad_norm": 0.7556248928216761, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5794, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.1693218024578975, |
|
"grad_norm": 0.649345136898707, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5806, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.187528447883477, |
|
"grad_norm": 0.7218760078584716, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5797, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.205735093309058, |
|
"grad_norm": 0.6931763868077586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5767, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.223941738734638, |
|
"grad_norm": 0.764188364334872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5778, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.2421483841602186, |
|
"grad_norm": 0.7472509039059302, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5786, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.260355029585799, |
|
"grad_norm": 0.7681897599903893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5824, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.278561675011379, |
|
"grad_norm": 0.6853235525321896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.578, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.29676832043696, |
|
"grad_norm": 0.6371627957254833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5767, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.31497496586254, |
|
"grad_norm": 0.62518793171934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5828, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.33318161128812, |
|
"grad_norm": 0.6835225300328861, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5845, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.3513882567137, |
|
"grad_norm": 0.6635739669645049, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5831, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.369594902139281, |
|
"grad_norm": 0.6905352305966937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5838, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.3878015475648615, |
|
"grad_norm": 0.675578669245968, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5838, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.406008192990441, |
|
"grad_norm": 0.6472341173499976, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5873, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.424214838416022, |
|
"grad_norm": 0.6312793186052579, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5862, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.442421483841602, |
|
"grad_norm": 0.6451118193568525, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5825, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.4606281292671826, |
|
"grad_norm": 0.6570443368465694, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5854, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.478834774692763, |
|
"grad_norm": 0.658937290588797, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5866, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.497041420118343, |
|
"grad_norm": 0.7077581907222236, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5869, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.515248065543924, |
|
"grad_norm": 0.6792893518112693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5873, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.533454710969504, |
|
"grad_norm": 0.7399002369000202, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5832, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.551661356395084, |
|
"grad_norm": 0.7316128957109573, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5878, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.569868001820664, |
|
"grad_norm": 0.709723130607844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5844, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.588074647246245, |
|
"grad_norm": 0.6462822194584583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5883, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.6062812926718255, |
|
"grad_norm": 0.7572060951703973, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5914, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.624487938097405, |
|
"grad_norm": 0.7698033683902142, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5882, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.642694583522986, |
|
"grad_norm": 0.7913214081907007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5865, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.660901228948566, |
|
"grad_norm": 0.652908829617188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5885, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.6791078743741465, |
|
"grad_norm": 0.7196290964249576, |
|
"learning_rate": 5e-06, |
|
"loss": 0.585, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.697314519799727, |
|
"grad_norm": 0.6963685189277059, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5874, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.715521165225307, |
|
"grad_norm": 0.6213064117674314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5886, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.733727810650888, |
|
"grad_norm": 0.6931094000594287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5931, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.751934456076468, |
|
"grad_norm": 0.6421262946164188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5877, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.770141101502048, |
|
"grad_norm": 0.7592504356723478, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5901, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.788347746927629, |
|
"grad_norm": 0.5755276719477025, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5877, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.806554392353209, |
|
"grad_norm": 0.7579206553363019, |
|
"learning_rate": 5e-06, |
|
"loss": 0.593, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.8247610377787895, |
|
"grad_norm": 0.7227783750503406, |
|
"learning_rate": 5e-06, |
|
"loss": 0.592, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.842967683204369, |
|
"grad_norm": 0.6143720314418375, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5923, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.86117432862995, |
|
"grad_norm": 0.684711113595212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5906, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.879380974055531, |
|
"grad_norm": 0.6041365723044803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5943, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.8975876194811105, |
|
"grad_norm": 0.6690754978083606, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5909, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.915794264906691, |
|
"grad_norm": 0.6408949623705275, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5938, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.934000910332271, |
|
"grad_norm": 0.6310380336524642, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5896, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.952207555757852, |
|
"grad_norm": 0.7658719466797836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5925, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.970414201183432, |
|
"grad_norm": 0.6764346949045434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.587, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.988620846609012, |
|
"grad_norm": 0.6578929361436391, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5947, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.999544833864361, |
|
"eval_loss": 0.7756755352020264, |
|
"eval_runtime": 379.7908, |
|
"eval_samples_per_second": 38.971, |
|
"eval_steps_per_second": 0.611, |
|
"step": 2746 |
|
}, |
|
{ |
|
"epoch": 5.006827492034593, |
|
"grad_norm": 1.6971405469705954, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6175, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.025034137460173, |
|
"grad_norm": 1.1249548156787617, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5284, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.0432407828857535, |
|
"grad_norm": 0.8687787258109687, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5308, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.061447428311333, |
|
"grad_norm": 0.8717275105088685, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5312, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.079654073736914, |
|
"grad_norm": 0.8465418731492865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5304, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.097860719162495, |
|
"grad_norm": 0.7886283884740087, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5279, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.1160673645880745, |
|
"grad_norm": 0.8398942783728177, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5363, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 5.134274010013655, |
|
"grad_norm": 0.769923514886696, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5338, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.152480655439235, |
|
"grad_norm": 0.7868390424124061, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5321, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 5.170687300864816, |
|
"grad_norm": 0.7950518134725912, |
|
"learning_rate": 5e-06, |
|
"loss": 0.534, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 5.188893946290396, |
|
"grad_norm": 0.9279901061572416, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5363, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.207100591715976, |
|
"grad_norm": 0.7594304483347795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.534, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 5.225307237141557, |
|
"grad_norm": 0.8252531851052891, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5349, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.243513882567137, |
|
"grad_norm": 0.7626524916862092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.534, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.2617205279927175, |
|
"grad_norm": 0.78634458156834, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5368, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.279927173418297, |
|
"grad_norm": 0.7800860821607554, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5348, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.298133818843878, |
|
"grad_norm": 0.7198214581579875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.539, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.316340464269459, |
|
"grad_norm": 0.8018455337446776, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5354, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.3345471096950385, |
|
"grad_norm": 0.6375711217756587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.538, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.352753755120619, |
|
"grad_norm": 0.7615279567325839, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5388, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.370960400546199, |
|
"grad_norm": 0.8634503306148038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5387, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.38916704597178, |
|
"grad_norm": 0.728328161040519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5392, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.4073736913973605, |
|
"grad_norm": 0.6714611489114068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5393, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.42558033682294, |
|
"grad_norm": 0.864160433406934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5406, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.443786982248521, |
|
"grad_norm": 0.7597928888037461, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5368, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.461993627674101, |
|
"grad_norm": 0.7636646866418205, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5403, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.4802002730996815, |
|
"grad_norm": 0.7478936763176718, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5411, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 5.498406918525261, |
|
"grad_norm": 0.7764354445181351, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5411, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 5.516613563950842, |
|
"grad_norm": 0.8232735346509414, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5433, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 5.534820209376423, |
|
"grad_norm": 0.7742332026679888, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5459, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.5530268548020025, |
|
"grad_norm": 0.8806327502433587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5456, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.571233500227583, |
|
"grad_norm": 0.8857449291804894, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5434, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.589440145653163, |
|
"grad_norm": 0.9161045429573774, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5381, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.607646791078744, |
|
"grad_norm": 0.7948317610189037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5424, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.6258534365043245, |
|
"grad_norm": 0.7867485103330927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5434, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.644060081929904, |
|
"grad_norm": 0.7569175444017539, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5456, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.662266727355485, |
|
"grad_norm": 0.7425402095430522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5437, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.680473372781065, |
|
"grad_norm": 0.8566468508292944, |
|
"learning_rate": 5e-06, |
|
"loss": 0.543, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.6986800182066455, |
|
"grad_norm": 0.7291267040452126, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5456, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.716886663632225, |
|
"grad_norm": 0.9339173337499906, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5488, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.735093309057806, |
|
"grad_norm": 0.668254133197116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5461, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.753299954483387, |
|
"grad_norm": 0.733779005468029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.548, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.7715065999089665, |
|
"grad_norm": 0.6817898908805843, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5471, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.789713245334547, |
|
"grad_norm": 0.7061838303703719, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5435, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.807919890760127, |
|
"grad_norm": 0.6788561279223211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5485, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 5.826126536185708, |
|
"grad_norm": 0.743542468839561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5442, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.8443331816112885, |
|
"grad_norm": 0.7126719114866883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.545, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 5.862539827036868, |
|
"grad_norm": 0.7700757305885421, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5476, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.880746472462449, |
|
"grad_norm": 0.8580615971421329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5496, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 5.898953117888029, |
|
"grad_norm": 0.6651944277970823, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5467, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.9171597633136095, |
|
"grad_norm": 0.7351660383841082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5498, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.93536640873919, |
|
"grad_norm": 0.7432057163838226, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5457, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.95357305416477, |
|
"grad_norm": 0.7880258422896124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5484, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 5.971779699590351, |
|
"grad_norm": 0.7829267364761494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5522, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.9899863450159305, |
|
"grad_norm": 0.755975814943294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5504, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 5.999089667728721, |
|
"eval_loss": 0.8121561408042908, |
|
"eval_runtime": 385.6914, |
|
"eval_samples_per_second": 38.375, |
|
"eval_steps_per_second": 0.602, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 6.008192990441511, |
|
"grad_norm": 1.229518253610835, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5616, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.026399635867092, |
|
"grad_norm": 0.9690039493999913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4806, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 6.044606281292672, |
|
"grad_norm": 0.975061524739041, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4807, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.0628129267182524, |
|
"grad_norm": 0.8693905009394387, |
|
"learning_rate": 5e-06, |
|
"loss": 0.48, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.081019572143832, |
|
"grad_norm": 0.8533904059239436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4814, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.099226217569413, |
|
"grad_norm": 0.9743084553071109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4832, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.117432862994993, |
|
"grad_norm": 0.8428587686314428, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4836, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.1356395084205735, |
|
"grad_norm": 0.8485721574906323, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4842, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.9049634307737825, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4871, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.172052799271734, |
|
"grad_norm": 0.8485250729351949, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4891, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.190259444697315, |
|
"grad_norm": 0.8677063446883995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4863, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.2084660901228945, |
|
"grad_norm": 0.8067384460711394, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4847, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.226672735548475, |
|
"grad_norm": 0.8491856381367863, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4872, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.244879380974056, |
|
"grad_norm": 0.8434795569738153, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4893, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.263086026399636, |
|
"grad_norm": 0.877180400082389, |
|
"learning_rate": 5e-06, |
|
"loss": 0.488, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.2812926718252164, |
|
"grad_norm": 0.7544909031927692, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4926, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.299499317250796, |
|
"grad_norm": 0.7786685673980743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4903, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.317705962676377, |
|
"grad_norm": 0.8813259287698524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4928, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.335912608101957, |
|
"grad_norm": 0.8143534255862093, |
|
"learning_rate": 5e-06, |
|
"loss": 0.489, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.3541192535275375, |
|
"grad_norm": 0.813093240140628, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4895, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 6.372325898953118, |
|
"grad_norm": 0.8234651442993571, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4874, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.390532544378698, |
|
"grad_norm": 0.7583974230123129, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4894, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 6.408739189804279, |
|
"grad_norm": 0.8045709791368885, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4924, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 6.4269458352298585, |
|
"grad_norm": 0.8021037971072071, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4958, |
|
"step": 3530 |
|
    },
    {
      "epoch": 6.445152480655439,
      "grad_norm": 0.8860947638978545,
      "learning_rate": 5e-06,
      "loss": 0.4936,
      "step": 3540
    },
    {
      "epoch": 6.46335912608102,
      "grad_norm": 0.778005059457562,
      "learning_rate": 5e-06,
      "loss": 0.4904,
      "step": 3550
    },
    {
      "epoch": 6.4815657715066,
      "grad_norm": 0.8879225211432363,
      "learning_rate": 5e-06,
      "loss": 0.4996,
      "step": 3560
    },
    {
      "epoch": 6.49977241693218,
      "grad_norm": 0.86115853799121,
      "learning_rate": 5e-06,
      "loss": 0.4958,
      "step": 3570
    },
    {
      "epoch": 6.51797906235776,
      "grad_norm": 0.821923783763917,
      "learning_rate": 5e-06,
      "loss": 0.4998,
      "step": 3580
    },
    {
      "epoch": 6.536185707783341,
      "grad_norm": 0.7488573715925901,
      "learning_rate": 5e-06,
      "loss": 0.4942,
      "step": 3590
    },
    {
      "epoch": 6.554392353208922,
      "grad_norm": 0.766478109512254,
      "learning_rate": 5e-06,
      "loss": 0.4927,
      "step": 3600
    },
    {
      "epoch": 6.5725989986345015,
      "grad_norm": 0.9993393195281365,
      "learning_rate": 5e-06,
      "loss": 0.5009,
      "step": 3610
    },
    {
      "epoch": 6.590805644060082,
      "grad_norm": 1.0105814992493984,
      "learning_rate": 5e-06,
      "loss": 0.4902,
      "step": 3620
    },
    {
      "epoch": 6.609012289485662,
      "grad_norm": 0.918270960348232,
      "learning_rate": 5e-06,
      "loss": 0.5004,
      "step": 3630
    },
    {
      "epoch": 6.627218934911243,
      "grad_norm": 0.7543985269097522,
      "learning_rate": 5e-06,
      "loss": 0.4972,
      "step": 3640
    },
    {
      "epoch": 6.645425580336823,
      "grad_norm": 0.9591362111868698,
      "learning_rate": 5e-06,
      "loss": 0.4978,
      "step": 3650
    },
    {
      "epoch": 6.663632225762403,
      "grad_norm": 0.8732535087129343,
      "learning_rate": 5e-06,
      "loss": 0.4962,
      "step": 3660
    },
    {
      "epoch": 6.681838871187984,
      "grad_norm": 0.8907840024813226,
      "learning_rate": 5e-06,
      "loss": 0.4984,
      "step": 3670
    },
    {
      "epoch": 6.700045516613564,
      "grad_norm": 0.8081950411988741,
      "learning_rate": 5e-06,
      "loss": 0.4995,
      "step": 3680
    },
    {
      "epoch": 6.718252162039144,
      "grad_norm": 0.7957053619070132,
      "learning_rate": 5e-06,
      "loss": 0.5007,
      "step": 3690
    },
    {
      "epoch": 6.736458807464724,
      "grad_norm": 1.0396693696127743,
      "learning_rate": 5e-06,
      "loss": 0.5,
      "step": 3700
    },
    {
      "epoch": 6.754665452890305,
      "grad_norm": 0.8276731947592968,
      "learning_rate": 5e-06,
      "loss": 0.5003,
      "step": 3710
    },
    {
      "epoch": 6.772872098315886,
      "grad_norm": 0.740304498205183,
      "learning_rate": 5e-06,
      "loss": 0.498,
      "step": 3720
    },
    {
      "epoch": 6.7910787437414655,
      "grad_norm": 0.7463156387752775,
      "learning_rate": 5e-06,
      "loss": 0.4994,
      "step": 3730
    },
    {
      "epoch": 6.809285389167046,
      "grad_norm": 0.8016121539442179,
      "learning_rate": 5e-06,
      "loss": 0.4997,
      "step": 3740
    },
    {
      "epoch": 6.827492034592626,
      "grad_norm": 0.8418329142580222,
      "learning_rate": 5e-06,
      "loss": 0.4982,
      "step": 3750
    },
    {
      "epoch": 6.845698680018207,
      "grad_norm": 0.8007293534037732,
      "learning_rate": 5e-06,
      "loss": 0.5008,
      "step": 3760
    },
    {
      "epoch": 6.8639053254437865,
      "grad_norm": 0.7962992985732563,
      "learning_rate": 5e-06,
      "loss": 0.4983,
      "step": 3770
    },
    {
      "epoch": 6.882111970869367,
      "grad_norm": 0.8192800444422219,
      "learning_rate": 5e-06,
      "loss": 0.5012,
      "step": 3780
    },
    {
      "epoch": 6.900318616294948,
      "grad_norm": 0.7933819169487161,
      "learning_rate": 5e-06,
      "loss": 0.4993,
      "step": 3790
    },
    {
      "epoch": 6.918525261720528,
      "grad_norm": 0.7719557878224939,
      "learning_rate": 5e-06,
      "loss": 0.4969,
      "step": 3800
    },
    {
      "epoch": 6.936731907146108,
      "grad_norm": 0.7794726223761498,
      "learning_rate": 5e-06,
      "loss": 0.4997,
      "step": 3810
    },
    {
      "epoch": 6.954938552571688,
      "grad_norm": 0.7490795619365134,
      "learning_rate": 5e-06,
      "loss": 0.4995,
      "step": 3820
    },
    {
      "epoch": 6.973145197997269,
      "grad_norm": 0.8534935347128176,
      "learning_rate": 5e-06,
      "loss": 0.5045,
      "step": 3830
    },
    {
      "epoch": 6.99135184342285,
      "grad_norm": 0.8266159513619395,
      "learning_rate": 5e-06,
      "loss": 0.5002,
      "step": 3840
    },
    {
      "epoch": 6.998634501593082,
      "eval_loss": 0.8599892854690552,
      "eval_runtime": 382.211,
      "eval_samples_per_second": 38.725,
      "eval_steps_per_second": 0.607,
      "step": 3844
    },
    {
      "epoch": 7.0095584888484295,
      "grad_norm": 1.4929469131394506,
      "learning_rate": 5e-06,
      "loss": 0.507,
      "step": 3850
    },
    {
      "epoch": 7.02776513427401,
      "grad_norm": 0.9806727302100092,
      "learning_rate": 5e-06,
      "loss": 0.4258,
      "step": 3860
    },
    {
      "epoch": 7.04597177969959,
      "grad_norm": 1.0186437284049983,
      "learning_rate": 5e-06,
      "loss": 0.4304,
      "step": 3870
    },
    {
      "epoch": 7.064178425125171,
      "grad_norm": 0.8346902224337589,
      "learning_rate": 5e-06,
      "loss": 0.4315,
      "step": 3880
    },
    {
      "epoch": 7.082385070550751,
      "grad_norm": 0.8895988414802425,
      "learning_rate": 5e-06,
      "loss": 0.4321,
      "step": 3890
    },
    {
      "epoch": 7.100591715976331,
      "grad_norm": 0.8968273584319876,
      "learning_rate": 5e-06,
      "loss": 0.4316,
      "step": 3900
    },
    {
      "epoch": 7.118798361401912,
      "grad_norm": 1.0430199362279293,
      "learning_rate": 5e-06,
      "loss": 0.4352,
      "step": 3910
    },
    {
      "epoch": 7.137005006827492,
      "grad_norm": 0.9925445246056604,
      "learning_rate": 5e-06,
      "loss": 0.4335,
      "step": 3920
    },
    {
      "epoch": 7.155211652253072,
      "grad_norm": 0.9236834017922617,
      "learning_rate": 5e-06,
      "loss": 0.4344,
      "step": 3930
    },
    {
      "epoch": 7.173418297678653,
      "grad_norm": 0.9006887934639467,
      "learning_rate": 5e-06,
      "loss": 0.4354,
      "step": 3940
    },
    {
      "epoch": 7.191624943104233,
      "grad_norm": 1.021746418559959,
      "learning_rate": 5e-06,
      "loss": 0.4337,
      "step": 3950
    },
    {
      "epoch": 7.209831588529814,
      "grad_norm": 0.8820115554192361,
      "learning_rate": 5e-06,
      "loss": 0.4356,
      "step": 3960
    },
    {
      "epoch": 7.2280382339553935,
      "grad_norm": 0.9450373403984087,
      "learning_rate": 5e-06,
      "loss": 0.4346,
      "step": 3970
    },
    {
      "epoch": 7.246244879380974,
      "grad_norm": 0.9599720579833242,
      "learning_rate": 5e-06,
      "loss": 0.437,
      "step": 3980
    },
    {
      "epoch": 7.264451524806554,
      "grad_norm": 0.944022553759221,
      "learning_rate": 5e-06,
      "loss": 0.4373,
      "step": 3990
    },
    {
      "epoch": 7.282658170232135,
      "grad_norm": 1.1470515929421334,
      "learning_rate": 5e-06,
      "loss": 0.437,
      "step": 4000
    },
    {
      "epoch": 7.300864815657715,
      "grad_norm": 0.9229924188228826,
      "learning_rate": 5e-06,
      "loss": 0.4412,
      "step": 4010
    },
    {
      "epoch": 7.319071461083295,
      "grad_norm": 1.1004139942729736,
      "learning_rate": 5e-06,
      "loss": 0.4404,
      "step": 4020
    },
    {
      "epoch": 7.337278106508876,
      "grad_norm": 0.9234895336641952,
      "learning_rate": 5e-06,
      "loss": 0.4413,
      "step": 4030
    },
    {
      "epoch": 7.355484751934456,
      "grad_norm": 0.9191069378971881,
      "learning_rate": 5e-06,
      "loss": 0.4374,
      "step": 4040
    },
    {
      "epoch": 7.373691397360036,
      "grad_norm": 0.9565597630772228,
      "learning_rate": 5e-06,
      "loss": 0.4427,
      "step": 4050
    },
    {
      "epoch": 7.391898042785617,
      "grad_norm": 0.8991823052622302,
      "learning_rate": 5e-06,
      "loss": 0.4408,
      "step": 4060
    },
    {
      "epoch": 7.410104688211197,
      "grad_norm": 0.9463545273318638,
      "learning_rate": 5e-06,
      "loss": 0.4428,
      "step": 4070
    },
    {
      "epoch": 7.428311333636778,
      "grad_norm": 0.8995180480466028,
      "learning_rate": 5e-06,
      "loss": 0.439,
      "step": 4080
    },
    {
      "epoch": 7.4465179790623575,
      "grad_norm": 0.8848211155155944,
      "learning_rate": 5e-06,
      "loss": 0.4408,
      "step": 4090
    },
    {
      "epoch": 7.464724624487938,
      "grad_norm": 0.9344565804246009,
      "learning_rate": 5e-06,
      "loss": 0.4426,
      "step": 4100
    },
    {
      "epoch": 7.482931269913518,
      "grad_norm": 0.9397047554338785,
      "learning_rate": 5e-06,
      "loss": 0.4422,
      "step": 4110
    },
    {
      "epoch": 7.501137915339099,
      "grad_norm": 1.0544561679078621,
      "learning_rate": 5e-06,
      "loss": 0.4433,
      "step": 4120
    },
    {
      "epoch": 7.519344560764679,
      "grad_norm": 1.018669421540764,
      "learning_rate": 5e-06,
      "loss": 0.4425,
      "step": 4130
    },
    {
      "epoch": 7.537551206190259,
      "grad_norm": 0.884634669314439,
      "learning_rate": 5e-06,
      "loss": 0.4447,
      "step": 4140
    },
    {
      "epoch": 7.55575785161584,
      "grad_norm": 0.8973368938963501,
      "learning_rate": 5e-06,
      "loss": 0.4414,
      "step": 4150
    },
    {
      "epoch": 7.57396449704142,
      "grad_norm": 1.0298222416647689,
      "learning_rate": 5e-06,
      "loss": 0.4475,
      "step": 4160
    },
    {
      "epoch": 7.592171142467,
      "grad_norm": 0.9090837637474475,
      "learning_rate": 5e-06,
      "loss": 0.4427,
      "step": 4170
    },
    {
      "epoch": 7.610377787892581,
      "grad_norm": 0.9114871413052325,
      "learning_rate": 5e-06,
      "loss": 0.4464,
      "step": 4180
    },
    {
      "epoch": 7.628584433318161,
      "grad_norm": 0.8765155235059887,
      "learning_rate": 5e-06,
      "loss": 0.4459,
      "step": 4190
    },
    {
      "epoch": 7.646791078743742,
      "grad_norm": 0.8572465195574555,
      "learning_rate": 5e-06,
      "loss": 0.4466,
      "step": 4200
    },
    {
      "epoch": 7.6649977241693215,
      "grad_norm": 0.8619953574827645,
      "learning_rate": 5e-06,
      "loss": 0.4449,
      "step": 4210
    },
    {
      "epoch": 7.683204369594902,
      "grad_norm": 0.9047350844927557,
      "learning_rate": 5e-06,
      "loss": 0.4491,
      "step": 4220
    },
    {
      "epoch": 7.701411015020483,
      "grad_norm": 0.8935472556665444,
      "learning_rate": 5e-06,
      "loss": 0.4483,
      "step": 4230
    },
    {
      "epoch": 7.719617660446063,
      "grad_norm": 0.8653782388568667,
      "learning_rate": 5e-06,
      "loss": 0.4475,
      "step": 4240
    },
    {
      "epoch": 7.737824305871643,
      "grad_norm": 0.881016800265456,
      "learning_rate": 5e-06,
      "loss": 0.451,
      "step": 4250
    },
    {
      "epoch": 7.756030951297223,
      "grad_norm": 0.9020813961697224,
      "learning_rate": 5e-06,
      "loss": 0.448,
      "step": 4260
    },
    {
      "epoch": 7.774237596722804,
      "grad_norm": 0.9716168118712624,
      "learning_rate": 5e-06,
      "loss": 0.4523,
      "step": 4270
    },
    {
      "epoch": 7.792444242148385,
      "grad_norm": 0.9179534064504127,
      "learning_rate": 5e-06,
      "loss": 0.4515,
      "step": 4280
    },
    {
      "epoch": 7.810650887573964,
      "grad_norm": 0.9614798912890371,
      "learning_rate": 5e-06,
      "loss": 0.448,
      "step": 4290
    },
    {
      "epoch": 7.828857532999545,
      "grad_norm": 0.9044673066961919,
      "learning_rate": 5e-06,
      "loss": 0.452,
      "step": 4300
    },
    {
      "epoch": 7.847064178425125,
      "grad_norm": 0.9051177289315938,
      "learning_rate": 5e-06,
      "loss": 0.4502,
      "step": 4310
    },
    {
      "epoch": 7.865270823850706,
      "grad_norm": 0.9751136269675025,
      "learning_rate": 5e-06,
      "loss": 0.4508,
      "step": 4320
    },
    {
      "epoch": 7.883477469276286,
      "grad_norm": 0.9479807771925007,
      "learning_rate": 5e-06,
      "loss": 0.4481,
      "step": 4330
    },
    {
      "epoch": 7.901684114701866,
      "grad_norm": 1.0019388262723266,
      "learning_rate": 5e-06,
      "loss": 0.4508,
      "step": 4340
    },
    {
      "epoch": 7.919890760127447,
      "grad_norm": 0.8485770165242006,
      "learning_rate": 5e-06,
      "loss": 0.4488,
      "step": 4350
    },
    {
      "epoch": 7.938097405553027,
      "grad_norm": 0.9440844580758145,
      "learning_rate": 5e-06,
      "loss": 0.453,
      "step": 4360
    },
    {
      "epoch": 7.956304050978607,
      "grad_norm": 0.9450193155443942,
      "learning_rate": 5e-06,
      "loss": 0.4519,
      "step": 4370
    },
    {
      "epoch": 7.974510696404187,
      "grad_norm": 0.9124151727103043,
      "learning_rate": 5e-06,
      "loss": 0.4542,
      "step": 4380
    },
    {
      "epoch": 7.992717341829768,
      "grad_norm": 0.9474262270163373,
      "learning_rate": 5e-06,
      "loss": 0.4505,
      "step": 4390
    },
    {
      "epoch": 7.9963586709148835,
      "eval_loss": 0.9232881665229797,
      "eval_runtime": 369.711,
      "eval_samples_per_second": 40.034,
      "eval_steps_per_second": 0.628,
      "step": 4392
    },
    {
      "epoch": 7.9963586709148835,
      "step": 4392,
      "total_flos": 7356554202316800.0,
      "train_loss": 0.6099752168050664,
      "train_runtime": 144891.7905,
      "train_samples_per_second": 15.527,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 4392,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7356554202316800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}