{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.9963586709148835, "eval_steps": 500, "global_step": 4392, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018206645425580335, "grad_norm": 15.35500484249196, "learning_rate": 5e-06, "loss": 1.0839, "step": 10 }, { "epoch": 0.03641329085116067, "grad_norm": 2.618926418245599, "learning_rate": 5e-06, "loss": 0.9627, "step": 20 }, { "epoch": 0.05461993627674101, "grad_norm": 2.080636783209041, "learning_rate": 5e-06, "loss": 0.9308, "step": 30 }, { "epoch": 0.07282658170232134, "grad_norm": 1.436134442652753, "learning_rate": 5e-06, "loss": 0.897, "step": 40 }, { "epoch": 0.09103322712790168, "grad_norm": 1.581598449271548, "learning_rate": 5e-06, "loss": 0.8783, "step": 50 }, { "epoch": 0.10923987255348203, "grad_norm": 1.3764881165967209, "learning_rate": 5e-06, "loss": 0.8594, "step": 60 }, { "epoch": 0.12744651797906237, "grad_norm": 0.8508264971874785, "learning_rate": 5e-06, "loss": 0.8476, "step": 70 }, { "epoch": 0.14565316340464268, "grad_norm": 0.9826078435096594, "learning_rate": 5e-06, "loss": 0.8351, "step": 80 }, { "epoch": 0.16385980883022302, "grad_norm": 0.7948332220104163, "learning_rate": 5e-06, "loss": 0.8261, "step": 90 }, { "epoch": 0.18206645425580337, "grad_norm": 0.7335767518254585, "learning_rate": 5e-06, "loss": 0.8185, "step": 100 }, { "epoch": 0.2002730996813837, "grad_norm": 1.0534204936405038, "learning_rate": 5e-06, "loss": 0.8204, "step": 110 }, { "epoch": 0.21847974510696405, "grad_norm": 0.847321277840478, "learning_rate": 5e-06, "loss": 0.8108, "step": 120 }, { "epoch": 0.23668639053254437, "grad_norm": 0.6210895897293146, "learning_rate": 5e-06, "loss": 0.8084, "step": 130 }, { "epoch": 0.25489303595812474, "grad_norm": 0.5781940572538745, "learning_rate": 5e-06, "loss": 0.8103, "step": 140 }, { "epoch": 0.27309968138370505, "grad_norm": 0.9063984151725509, "learning_rate": 5e-06, "loss": 0.8077, "step": 150 }, { "epoch": 0.29130632680928537, "grad_norm": 0.6571941221537146, "learning_rate": 5e-06, "loss": 0.799, "step": 160 }, { "epoch": 0.30951297223486574, "grad_norm": 0.5952826448438556, "learning_rate": 5e-06, "loss": 0.7999, "step": 170 }, { "epoch": 0.32771961766044605, "grad_norm": 1.4601461457218337, "learning_rate": 5e-06, "loss": 0.8004, "step": 180 }, { "epoch": 0.3459262630860264, "grad_norm": 1.3162465370008012, "learning_rate": 5e-06, "loss": 0.7973, "step": 190 }, { "epoch": 0.36413290851160673, "grad_norm": 1.2972378846813408, "learning_rate": 5e-06, "loss": 0.7959, "step": 200 }, { "epoch": 0.38233955393718705, "grad_norm": 0.9152791184028909, "learning_rate": 5e-06, "loss": 0.7892, "step": 210 }, { "epoch": 0.4005461993627674, "grad_norm": 0.823555803312372, "learning_rate": 5e-06, "loss": 0.7932, "step": 220 }, { "epoch": 0.41875284478834773, "grad_norm": 0.7145315436546384, "learning_rate": 5e-06, "loss": 0.7848, "step": 230 }, { "epoch": 0.4369594902139281, "grad_norm": 0.5776875953876701, "learning_rate": 5e-06, "loss": 0.791, "step": 240 }, { "epoch": 0.4551661356395084, "grad_norm": 0.6846063811362885, "learning_rate": 5e-06, "loss": 0.7875, "step": 250 }, { "epoch": 0.47337278106508873, "grad_norm": 0.7232578149621245, "learning_rate": 5e-06, "loss": 0.7843, "step": 260 }, { "epoch": 0.4915794264906691, "grad_norm": 0.6962364276167124, "learning_rate": 5e-06, "loss": 0.7839, "step": 270 }, { "epoch": 0.5097860719162495, "grad_norm": 0.8881236192032435, "learning_rate": 5e-06, "loss": 0.779, "step": 280 }, { "epoch": 0.5279927173418297, "grad_norm": 0.6192460064199938, "learning_rate": 5e-06, "loss": 0.7821, "step": 290 }, { "epoch": 0.5461993627674101, "grad_norm": 0.7429540921229376, "learning_rate": 5e-06, "loss": 0.7771, "step": 300 }, { "epoch": 0.5644060081929905, "grad_norm": 0.5678182946639223, "learning_rate": 5e-06, "loss": 0.7837, "step": 310 }, { "epoch": 0.5826126536185707, "grad_norm": 0.7927883549153515, "learning_rate": 5e-06, "loss": 0.7795, "step": 320 }, { "epoch": 0.6008192990441511, "grad_norm": 0.6630873675584549, "learning_rate": 5e-06, "loss": 0.7788, "step": 330 }, { "epoch": 0.6190259444697315, "grad_norm": 0.8821281227865143, "learning_rate": 5e-06, "loss": 0.7817, "step": 340 }, { "epoch": 0.6372325898953118, "grad_norm": 0.5862391322505137, "learning_rate": 5e-06, "loss": 0.7818, "step": 350 }, { "epoch": 0.6554392353208921, "grad_norm": 0.7933124674797363, "learning_rate": 5e-06, "loss": 0.7714, "step": 360 }, { "epoch": 0.6736458807464725, "grad_norm": 0.5912540694609474, "learning_rate": 5e-06, "loss": 0.7732, "step": 370 }, { "epoch": 0.6918525261720528, "grad_norm": 0.5222298817170612, "learning_rate": 5e-06, "loss": 0.7697, "step": 380 }, { "epoch": 0.7100591715976331, "grad_norm": 0.6154935564611252, "learning_rate": 5e-06, "loss": 0.7716, "step": 390 }, { "epoch": 0.7282658170232135, "grad_norm": 0.7305900161607543, "learning_rate": 5e-06, "loss": 0.7708, "step": 400 }, { "epoch": 0.7464724624487938, "grad_norm": 0.7798387910594561, "learning_rate": 5e-06, "loss": 0.7724, "step": 410 }, { "epoch": 0.7646791078743741, "grad_norm": 1.100346410427369, "learning_rate": 5e-06, "loss": 0.7716, "step": 420 }, { "epoch": 0.7828857532999545, "grad_norm": 0.657718484418182, "learning_rate": 5e-06, "loss": 0.7666, "step": 430 }, { "epoch": 0.8010923987255348, "grad_norm": 0.6103104885939142, "learning_rate": 5e-06, "loss": 0.765, "step": 440 }, { "epoch": 0.8192990441511152, "grad_norm": 0.6884438533501283, "learning_rate": 5e-06, "loss": 0.7699, "step": 450 }, { "epoch": 0.8375056895766955, "grad_norm": 0.6270481654771743, "learning_rate": 5e-06, "loss": 0.767, "step": 460 }, { "epoch": 0.8557123350022758, "grad_norm": 0.5861612234613632, "learning_rate": 5e-06, "loss": 0.7645, "step": 470 }, { "epoch": 0.8739189804278562, "grad_norm": 0.586876291200088, "learning_rate": 5e-06, "loss": 0.7686, "step": 480 }, { "epoch": 0.8921256258534365, "grad_norm": 0.7466830111479166, "learning_rate": 5e-06, "loss": 0.7688, "step": 490 }, { "epoch": 0.9103322712790168, "grad_norm": 0.6792188281050604, "learning_rate": 5e-06, "loss": 0.7685, "step": 500 }, { "epoch": 0.9285389167045972, "grad_norm": 0.6846050616906328, "learning_rate": 5e-06, "loss": 0.7616, "step": 510 }, { "epoch": 0.9467455621301775, "grad_norm": 0.7324962552104101, "learning_rate": 5e-06, "loss": 0.762, "step": 520 }, { "epoch": 0.9649522075557578, "grad_norm": 0.6893904439545421, "learning_rate": 5e-06, "loss": 0.7664, "step": 530 }, { "epoch": 0.9831588529813382, "grad_norm": 0.6529796330088168, "learning_rate": 5e-06, "loss": 0.7631, "step": 540 }, { "epoch": 0.9995448338643604, "eval_loss": 0.7638739347457886, "eval_runtime": 379.9866, "eval_samples_per_second": 38.951, "eval_steps_per_second": 0.611, "step": 549 }, { "epoch": 1.0013654984069185, "grad_norm": 1.2061555179450747, "learning_rate": 5e-06, "loss": 0.812, "step": 550 }, { "epoch": 1.019572143832499, "grad_norm": 0.8129440249983992, "learning_rate": 5e-06, "loss": 0.7222, "step": 560 }, { "epoch": 1.0377787892580792, "grad_norm": 0.6593013104808815, "learning_rate": 5e-06, "loss": 0.7205, "step": 570 }, { "epoch": 1.0559854346836595, "grad_norm": 0.6107941470930411, "learning_rate": 5e-06, "loss": 0.7205, "step": 580 }, { "epoch": 1.07419208010924, "grad_norm": 0.8276670919993867, "learning_rate": 5e-06, "loss": 0.7144, "step": 590 }, { "epoch": 1.0923987255348202, "grad_norm": 0.6253185542660747, "learning_rate": 5e-06, "loss": 0.7136, "step": 600 }, { "epoch": 1.1106053709604005, "grad_norm": 0.6108900775704884, "learning_rate": 5e-06, "loss": 0.7189, "step": 610 }, { "epoch": 1.128812016385981, "grad_norm": 0.8988785705551109, "learning_rate": 5e-06, "loss": 0.7122, "step": 620 }, { "epoch": 1.1470186618115612, "grad_norm": 0.616453203309773, "learning_rate": 5e-06, "loss": 0.7138, "step": 630 }, { "epoch": 1.1652253072371415, "grad_norm": 0.8816478964333492, "learning_rate": 5e-06, "loss": 0.7174, "step": 640 }, { "epoch": 1.183431952662722, "grad_norm": 0.6581015672880766, "learning_rate": 5e-06, "loss": 0.7183, "step": 650 }, { "epoch": 1.2016385980883022, "grad_norm": 0.6253002121195789, "learning_rate": 5e-06, "loss": 0.7169, "step": 660 }, { "epoch": 1.2198452435138827, "grad_norm": 0.7483871477416271, "learning_rate": 5e-06, "loss": 0.7136, "step": 670 }, { "epoch": 1.238051888939463, "grad_norm": 0.6941445361796307, "learning_rate": 5e-06, "loss": 0.7114, "step": 680 }, { "epoch": 1.2562585343650432, "grad_norm": 0.6143667364149711, "learning_rate": 5e-06, "loss": 0.7185, "step": 690 }, { "epoch": 1.2744651797906235, "grad_norm": 0.6592963882014911, "learning_rate": 5e-06, "loss": 0.7166, "step": 700 }, { "epoch": 1.292671825216204, "grad_norm": 0.6189597729803721, "learning_rate": 5e-06, "loss": 0.7158, "step": 710 }, { "epoch": 1.3108784706417842, "grad_norm": 0.6920980650430207, "learning_rate": 5e-06, "loss": 0.7144, "step": 720 }, { "epoch": 1.3290851160673647, "grad_norm": 0.49530512173626284, "learning_rate": 5e-06, "loss": 0.718, "step": 730 }, { "epoch": 1.347291761492945, "grad_norm": 0.8064053486625922, "learning_rate": 5e-06, "loss": 0.7131, "step": 740 }, { "epoch": 1.3654984069185252, "grad_norm": 0.5821998941102782, "learning_rate": 5e-06, "loss": 0.7136, "step": 750 }, { "epoch": 1.3837050523441057, "grad_norm": 0.5735380427187717, "learning_rate": 5e-06, "loss": 0.7154, "step": 760 }, { "epoch": 1.401911697769686, "grad_norm": 0.5961452096313207, "learning_rate": 5e-06, "loss": 0.7148, "step": 770 }, { "epoch": 1.4201183431952662, "grad_norm": 0.6364200555389256, "learning_rate": 5e-06, "loss": 0.7152, "step": 780 }, { "epoch": 1.4383249886208467, "grad_norm": 0.618866609217087, "learning_rate": 5e-06, "loss": 0.7139, "step": 790 }, { "epoch": 1.456531634046427, "grad_norm": 0.5153453443480923, "learning_rate": 5e-06, "loss": 0.7169, "step": 800 }, { "epoch": 1.4747382794720072, "grad_norm": 0.5921562610346923, "learning_rate": 5e-06, "loss": 0.7157, "step": 810 }, { "epoch": 1.4929449248975877, "grad_norm": 0.6864315768684789, "learning_rate": 5e-06, "loss": 0.7186, "step": 820 }, { "epoch": 1.511151570323168, "grad_norm": 0.6718756873067311, "learning_rate": 5e-06, "loss": 0.713, "step": 830 }, { "epoch": 1.5293582157487484, "grad_norm": 0.5292103286821733, "learning_rate": 5e-06, "loss": 0.7121, "step": 840 }, { "epoch": 1.5475648611743287, "grad_norm": 0.5807337157950755, "learning_rate": 5e-06, "loss": 0.7161, "step": 850 }, { "epoch": 1.565771506599909, "grad_norm": 0.5725693174193143, "learning_rate": 5e-06, "loss": 0.7144, "step": 860 }, { "epoch": 1.5839781520254892, "grad_norm": 0.6693290134206946, "learning_rate": 5e-06, "loss": 0.7137, "step": 870 }, { "epoch": 1.6021847974510697, "grad_norm": 0.5960183936149324, "learning_rate": 5e-06, "loss": 0.718, "step": 880 }, { "epoch": 1.62039144287665, "grad_norm": 0.6717348604009659, "learning_rate": 5e-06, "loss": 0.7107, "step": 890 }, { "epoch": 1.6385980883022304, "grad_norm": 0.6147573815012755, "learning_rate": 5e-06, "loss": 0.7101, "step": 900 }, { "epoch": 1.6568047337278107, "grad_norm": 0.5451936467703803, "learning_rate": 5e-06, "loss": 0.7128, "step": 910 }, { "epoch": 1.675011379153391, "grad_norm": 0.7294513986677604, "learning_rate": 5e-06, "loss": 0.7104, "step": 920 }, { "epoch": 1.6932180245789712, "grad_norm": 0.6756274493055726, "learning_rate": 5e-06, "loss": 0.7115, "step": 930 }, { "epoch": 1.7114246700045517, "grad_norm": 0.6503407532587065, "learning_rate": 5e-06, "loss": 0.7105, "step": 940 }, { "epoch": 1.7296313154301322, "grad_norm": 0.5369254794383076, "learning_rate": 5e-06, "loss": 0.7116, "step": 950 }, { "epoch": 1.7478379608557124, "grad_norm": 0.6658976879506388, "learning_rate": 5e-06, "loss": 0.7108, "step": 960 }, { "epoch": 1.7660446062812927, "grad_norm": 0.5443862825931207, "learning_rate": 5e-06, "loss": 0.7107, "step": 970 }, { "epoch": 1.784251251706873, "grad_norm": 0.6571290419201206, "learning_rate": 5e-06, "loss": 0.7118, "step": 980 }, { "epoch": 1.8024578971324532, "grad_norm": 0.543057907363159, "learning_rate": 5e-06, "loss": 0.7109, "step": 990 }, { "epoch": 1.8206645425580337, "grad_norm": 0.5863888559339765, "learning_rate": 5e-06, "loss": 0.7128, "step": 1000 }, { "epoch": 1.8388711879836142, "grad_norm": 0.7512763814183313, "learning_rate": 5e-06, "loss": 0.7129, "step": 1010 }, { "epoch": 1.8570778334091944, "grad_norm": 0.6875839517913319, "learning_rate": 5e-06, "loss": 0.7163, "step": 1020 }, { "epoch": 1.8752844788347747, "grad_norm": 0.5392039305234562, "learning_rate": 5e-06, "loss": 0.712, "step": 1030 }, { "epoch": 1.893491124260355, "grad_norm": 0.5715846033792595, "learning_rate": 5e-06, "loss": 0.7106, "step": 1040 }, { "epoch": 1.9116977696859354, "grad_norm": 0.554967637717086, "learning_rate": 5e-06, "loss": 0.7135, "step": 1050 }, { "epoch": 1.9299044151115157, "grad_norm": 0.6064174196129051, "learning_rate": 5e-06, "loss": 0.7113, "step": 1060 }, { "epoch": 1.9481110605370962, "grad_norm": 0.57876695036443, "learning_rate": 5e-06, "loss": 0.7133, "step": 1070 }, { "epoch": 1.9663177059626764, "grad_norm": 0.6168643227515411, "learning_rate": 5e-06, "loss": 0.7107, "step": 1080 }, { "epoch": 1.9845243513882567, "grad_norm": 0.5918567965220687, "learning_rate": 5e-06, "loss": 0.7148, "step": 1090 }, { "epoch": 1.9990896677287209, "eval_loss": 0.7494105696678162, "eval_runtime": 371.303, "eval_samples_per_second": 39.862, "eval_steps_per_second": 0.625, "step": 1098 }, { "epoch": 2.002730996813837, "grad_norm": 0.7112677344136977, "learning_rate": 5e-06, "loss": 0.7534, "step": 1100 }, { "epoch": 2.020937642239417, "grad_norm": 0.7657125714853397, "learning_rate": 5e-06, "loss": 0.6594, "step": 1110 }, { "epoch": 2.039144287664998, "grad_norm": 0.7269679004826854, "learning_rate": 5e-06, "loss": 0.6665, "step": 1120 }, { "epoch": 2.057350933090578, "grad_norm": 0.5908930298865153, "learning_rate": 5e-06, "loss": 0.6641, "step": 1130 }, { "epoch": 2.0755575785161584, "grad_norm": 0.6685407405530368, "learning_rate": 5e-06, "loss": 0.6611, "step": 1140 }, { "epoch": 2.0937642239417387, "grad_norm": 0.6835975141817704, "learning_rate": 5e-06, "loss": 0.6631, "step": 1150 }, { "epoch": 2.111970869367319, "grad_norm": 0.6561709037357517, "learning_rate": 5e-06, "loss": 0.6631, "step": 1160 }, { "epoch": 2.1301775147928996, "grad_norm": 0.9447854836936921, "learning_rate": 5e-06, "loss": 0.6641, "step": 1170 }, { "epoch": 2.14838416021848, "grad_norm": 0.6554564933004847, "learning_rate": 5e-06, "loss": 0.6644, "step": 1180 }, { "epoch": 2.16659080564406, "grad_norm": 1.0127997383891072, "learning_rate": 5e-06, "loss": 0.6711, "step": 1190 }, { "epoch": 2.1847974510696404, "grad_norm": 0.7206567370516284, "learning_rate": 5e-06, "loss": 0.6668, "step": 1200 }, { "epoch": 2.2030040964952207, "grad_norm": 0.655937928590562, "learning_rate": 5e-06, "loss": 0.6717, "step": 1210 }, { "epoch": 2.221210741920801, "grad_norm": 0.8360483477674778, "learning_rate": 5e-06, "loss": 0.6689, "step": 1220 }, { "epoch": 2.2394173873463816, "grad_norm": 0.6092424114015346, "learning_rate": 5e-06, "loss": 0.6628, "step": 1230 }, { "epoch": 2.257624032771962, "grad_norm": 0.6406882124202334, "learning_rate": 5e-06, "loss": 0.6651, "step": 1240 }, { "epoch": 2.275830678197542, "grad_norm": 0.7614678401383996, "learning_rate": 5e-06, "loss": 0.6677, "step": 1250 }, { "epoch": 2.2940373236231224, "grad_norm": 0.7284934278440872, "learning_rate": 5e-06, "loss": 0.6649, "step": 1260 }, { "epoch": 2.3122439690487027, "grad_norm": 0.6736018159497253, "learning_rate": 5e-06, "loss": 0.6661, "step": 1270 }, { "epoch": 2.330450614474283, "grad_norm": 0.682991318579073, "learning_rate": 5e-06, "loss": 0.6706, "step": 1280 }, { "epoch": 2.3486572598998636, "grad_norm": 0.6067823760231827, "learning_rate": 5e-06, "loss": 0.6642, "step": 1290 }, { "epoch": 2.366863905325444, "grad_norm": 0.6016632433423775, "learning_rate": 5e-06, "loss": 0.6675, "step": 1300 }, { "epoch": 2.385070550751024, "grad_norm": 0.6784572170158043, "learning_rate": 5e-06, "loss": 0.6658, "step": 1310 }, { "epoch": 2.4032771961766044, "grad_norm": 0.6804813510854688, "learning_rate": 5e-06, "loss": 0.6693, "step": 1320 }, { "epoch": 2.4214838416021847, "grad_norm": 0.6667510974726526, "learning_rate": 5e-06, "loss": 0.6667, "step": 1330 }, { "epoch": 2.4396904870277654, "grad_norm": 0.5712701093393895, "learning_rate": 5e-06, "loss": 0.6693, "step": 1340 }, { "epoch": 2.4578971324533456, "grad_norm": 0.5495878729178928, "learning_rate": 5e-06, "loss": 0.6691, "step": 1350 }, { "epoch": 2.476103777878926, "grad_norm": 0.6863248077276337, "learning_rate": 5e-06, "loss": 0.6697, "step": 1360 }, { "epoch": 2.494310423304506, "grad_norm": 0.6281725692693549, "learning_rate": 5e-06, "loss": 0.6659, "step": 1370 }, { "epoch": 2.5125170687300864, "grad_norm": 0.584114706811986, "learning_rate": 5e-06, "loss": 0.6689, "step": 1380 }, { "epoch": 2.5307237141556667, "grad_norm": 0.8069687501187531, "learning_rate": 5e-06, "loss": 0.6713, "step": 1390 }, { "epoch": 2.548930359581247, "grad_norm": 0.5837408246005996, "learning_rate": 5e-06, "loss": 0.6702, "step": 1400 }, { "epoch": 2.5671370050068276, "grad_norm": 0.6602631718487662, "learning_rate": 5e-06, "loss": 0.6729, "step": 1410 }, { "epoch": 2.585343650432408, "grad_norm": 0.5751520967539265, "learning_rate": 5e-06, "loss": 0.6727, "step": 1420 }, { "epoch": 2.603550295857988, "grad_norm": 0.5753704561331634, "learning_rate": 5e-06, "loss": 0.6638, "step": 1430 }, { "epoch": 2.6217569412835684, "grad_norm": 0.5947582650600889, "learning_rate": 5e-06, "loss": 0.6687, "step": 1440 }, { "epoch": 2.6399635867091487, "grad_norm": 0.6421455226025955, "learning_rate": 5e-06, "loss": 0.6673, "step": 1450 }, { "epoch": 2.6581702321347294, "grad_norm": 0.625971391993565, "learning_rate": 5e-06, "loss": 0.6723, "step": 1460 }, { "epoch": 2.6763768775603096, "grad_norm": 0.6230286833589651, "learning_rate": 5e-06, "loss": 0.6697, "step": 1470 }, { "epoch": 2.69458352298589, "grad_norm": 0.6704340829719386, "learning_rate": 5e-06, "loss": 0.6726, "step": 1480 }, { "epoch": 2.71279016841147, "grad_norm": 0.7115047592117129, "learning_rate": 5e-06, "loss": 0.6697, "step": 1490 }, { "epoch": 2.7309968138370504, "grad_norm": 0.5348550201712026, "learning_rate": 5e-06, "loss": 0.6713, "step": 1500 }, { "epoch": 2.749203459262631, "grad_norm": 0.6018887550964896, "learning_rate": 5e-06, "loss": 0.6713, "step": 1510 }, { "epoch": 2.7674101046882114, "grad_norm": 0.5829651029728877, "learning_rate": 5e-06, "loss": 0.6711, "step": 1520 }, { "epoch": 2.7856167501137916, "grad_norm": 0.7257452058424095, "learning_rate": 5e-06, "loss": 0.6679, "step": 1530 }, { "epoch": 2.803823395539372, "grad_norm": 0.7324462033376344, "learning_rate": 5e-06, "loss": 0.6705, "step": 1540 }, { "epoch": 2.822030040964952, "grad_norm": 0.6518477868742856, "learning_rate": 5e-06, "loss": 0.6714, "step": 1550 }, { "epoch": 2.8402366863905324, "grad_norm": 0.5915504419734868, "learning_rate": 5e-06, "loss": 0.6678, "step": 1560 }, { "epoch": 2.8584433318161127, "grad_norm": 0.5596845389686367, "learning_rate": 5e-06, "loss": 0.6738, "step": 1570 }, { "epoch": 2.8766499772416934, "grad_norm": 0.6211271364874157, "learning_rate": 5e-06, "loss": 0.6682, "step": 1580 }, { "epoch": 2.8948566226672736, "grad_norm": 0.6029083750419233, "learning_rate": 5e-06, "loss": 0.6689, "step": 1590 }, { "epoch": 2.913063268092854, "grad_norm": 0.6997154043188799, "learning_rate": 5e-06, "loss": 0.6654, "step": 1600 }, { "epoch": 2.931269913518434, "grad_norm": 0.6561810265940334, "learning_rate": 5e-06, "loss": 0.6648, "step": 1610 }, { "epoch": 2.9494765589440144, "grad_norm": 0.5949195869956836, "learning_rate": 5e-06, "loss": 0.6674, "step": 1620 }, { "epoch": 2.967683204369595, "grad_norm": 0.6572460988263176, "learning_rate": 5e-06, "loss": 0.6748, "step": 1630 }, { "epoch": 2.9858898497951754, "grad_norm": 0.6466132865038215, "learning_rate": 5e-06, "loss": 0.6707, "step": 1640 }, { "epoch": 2.9986345015930813, "eval_loss": 0.7486923933029175, "eval_runtime": 371.6394, "eval_samples_per_second": 39.826, "eval_steps_per_second": 0.624, "step": 1647 }, { "epoch": 3.0040964952207556, "grad_norm": 0.9066558799698319, "learning_rate": 5e-06, "loss": 0.7123, "step": 1650 }, { "epoch": 3.022303140646336, "grad_norm": 0.6985831692717679, "learning_rate": 5e-06, "loss": 0.6192, "step": 1660 }, { "epoch": 3.040509786071916, "grad_norm": 0.7297757031418619, "learning_rate": 5e-06, "loss": 0.6192, "step": 1670 }, { "epoch": 3.0587164314974964, "grad_norm": 0.7384505423128972, "learning_rate": 5e-06, "loss": 0.6218, "step": 1680 }, { "epoch": 3.076923076923077, "grad_norm": 0.7273583338941372, "learning_rate": 5e-06, "loss": 0.6211, "step": 1690 }, { "epoch": 3.0951297223486574, "grad_norm": 0.7471474278740236, "learning_rate": 5e-06, "loss": 0.621, "step": 1700 }, { "epoch": 3.1133363677742376, "grad_norm": 0.6193936409655777, "learning_rate": 5e-06, "loss": 0.6201, "step": 1710 }, { "epoch": 3.131543013199818, "grad_norm": 0.6710258448973723, "learning_rate": 5e-06, "loss": 0.6224, "step": 1720 }, { "epoch": 3.149749658625398, "grad_norm": 0.7409460472770332, "learning_rate": 5e-06, "loss": 0.625, "step": 1730 }, { "epoch": 3.1679563040509784, "grad_norm": 0.709501536492518, "learning_rate": 5e-06, "loss": 0.6204, "step": 1740 }, { "epoch": 3.186162949476559, "grad_norm": 0.7649819943694469, "learning_rate": 5e-06, "loss": 0.6163, "step": 1750 }, { "epoch": 3.2043695949021394, "grad_norm": 0.7173580779194709, "learning_rate": 5e-06, "loss": 0.6234, "step": 1760 }, { "epoch": 3.2225762403277196, "grad_norm": 0.6457493603144049, "learning_rate": 5e-06, "loss": 0.6213, "step": 1770 }, { "epoch": 3.2407828857533, "grad_norm": 0.6074440281419403, "learning_rate": 5e-06, "loss": 0.6229, "step": 1780 }, { "epoch": 3.25898953117888, "grad_norm": 0.7025871444184026, "learning_rate": 5e-06, "loss": 0.621, "step": 1790 }, { "epoch": 3.277196176604461, "grad_norm": 0.7659855454008233, "learning_rate": 5e-06, "loss": 0.6244, "step": 1800 }, { "epoch": 3.295402822030041, "grad_norm": 0.6532860177479579, "learning_rate": 5e-06, "loss": 0.6229, "step": 1810 }, { "epoch": 3.3136094674556213, "grad_norm": 0.5432923070386573, "learning_rate": 5e-06, "loss": 0.6226, "step": 1820 }, { "epoch": 3.3318161128812016, "grad_norm": 0.6927506071213173, "learning_rate": 5e-06, "loss": 0.6241, "step": 1830 }, { "epoch": 3.350022758306782, "grad_norm": 0.6319912049544428, "learning_rate": 5e-06, "loss": 0.6234, "step": 1840 }, { "epoch": 3.368229403732362, "grad_norm": 0.6730493262615627, "learning_rate": 5e-06, "loss": 0.6239, "step": 1850 }, { "epoch": 3.386436049157943, "grad_norm": 0.624938299600225, "learning_rate": 5e-06, "loss": 0.6225, "step": 1860 }, { "epoch": 3.404642694583523, "grad_norm": 0.623354310700811, "learning_rate": 5e-06, "loss": 0.6255, "step": 1870 }, { "epoch": 3.4228493400091033, "grad_norm": 0.6329249624990303, "learning_rate": 5e-06, "loss": 0.6253, "step": 1880 }, { "epoch": 3.4410559854346836, "grad_norm": 0.691830816858052, "learning_rate": 5e-06, "loss": 0.6313, "step": 1890 }, { "epoch": 3.459262630860264, "grad_norm": 0.6032142068639882, "learning_rate": 5e-06, "loss": 0.6279, "step": 1900 }, { "epoch": 3.477469276285844, "grad_norm": 0.6620277244742816, "learning_rate": 5e-06, "loss": 0.6279, "step": 1910 }, { "epoch": 3.495675921711425, "grad_norm": 0.6334218656408422, "learning_rate": 5e-06, "loss": 0.6299, "step": 1920 }, { "epoch": 3.513882567137005, "grad_norm": 0.7375169169002789, "learning_rate": 5e-06, "loss": 0.6281, "step": 1930 }, { "epoch": 3.5320892125625853, "grad_norm": 0.6448512236031347, "learning_rate": 5e-06, "loss": 0.6275, "step": 1940 }, { "epoch": 3.5502958579881656, "grad_norm": 0.5665149421787481, "learning_rate": 5e-06, "loss": 0.6299, "step": 1950 }, { "epoch": 3.568502503413746, "grad_norm": 0.5999159680891181, "learning_rate": 5e-06, "loss": 0.6286, "step": 1960 }, { "epoch": 3.5867091488393266, "grad_norm": 0.6281107227517486, "learning_rate": 5e-06, "loss": 0.6248, "step": 1970 }, { "epoch": 3.604915794264907, "grad_norm": 0.6353135942879086, "learning_rate": 5e-06, "loss": 0.6281, "step": 1980 }, { "epoch": 3.623122439690487, "grad_norm": 0.6193631021536377, "learning_rate": 5e-06, "loss": 0.6278, "step": 1990 }, { "epoch": 3.6413290851160673, "grad_norm": 0.5918570154763652, "learning_rate": 5e-06, "loss": 0.6296, "step": 2000 }, { "epoch": 3.6595357305416476, "grad_norm": 0.6010286514232586, "learning_rate": 5e-06, "loss": 0.6312, "step": 2010 }, { "epoch": 3.6777423759672283, "grad_norm": 0.556597304136066, "learning_rate": 5e-06, "loss": 0.6282, "step": 2020 }, { "epoch": 3.695949021392808, "grad_norm": 0.7271994139802581, "learning_rate": 5e-06, "loss": 0.6282, "step": 2030 }, { "epoch": 3.714155666818389, "grad_norm": 0.6169419004473696, "learning_rate": 5e-06, "loss": 0.633, "step": 2040 }, { "epoch": 3.732362312243969, "grad_norm": 0.6025566845682745, "learning_rate": 5e-06, "loss": 0.6299, "step": 2050 }, { "epoch": 3.7505689576695493, "grad_norm": 0.5736503199358731, "learning_rate": 5e-06, "loss": 0.6307, "step": 2060 }, { "epoch": 3.7687756030951296, "grad_norm": 0.6112656378611477, "learning_rate": 5e-06, "loss": 0.6314, "step": 2070 }, { "epoch": 3.78698224852071, "grad_norm": 0.683407900900997, "learning_rate": 5e-06, "loss": 0.6292, "step": 2080 }, { "epoch": 3.8051888939462906, "grad_norm": 0.5823312600994301, "learning_rate": 5e-06, "loss": 0.6302, "step": 2090 }, { "epoch": 3.823395539371871, "grad_norm": 0.7722774266919119, "learning_rate": 5e-06, "loss": 0.6329, "step": 2100 }, { "epoch": 3.841602184797451, "grad_norm": 0.5962312514475476, "learning_rate": 5e-06, "loss": 0.6336, "step": 2110 }, { "epoch": 3.8598088302230313, "grad_norm": 0.5704378657470786, "learning_rate": 5e-06, "loss": 0.6343, "step": 2120 }, { "epoch": 3.8780154756486116, "grad_norm": 0.6221717567681257, "learning_rate": 5e-06, "loss": 0.6323, "step": 2130 }, { "epoch": 3.8962221210741923, "grad_norm": 0.8715901410739767, "learning_rate": 5e-06, "loss": 0.6289, "step": 2140 }, { "epoch": 3.9144287664997726, "grad_norm": 0.7264743862880063, "learning_rate": 5e-06, "loss": 0.6319, "step": 2150 }, { "epoch": 3.932635411925353, "grad_norm": 0.5646894741460851, "learning_rate": 5e-06, "loss": 0.6326, "step": 2160 }, { "epoch": 3.950842057350933, "grad_norm": 0.6303177348076553, "learning_rate": 5e-06, "loss": 0.6347, "step": 2170 }, { "epoch": 3.9690487027765133, "grad_norm": 0.5882842156900018, "learning_rate": 5e-06, "loss": 0.6255, "step": 2180 }, { "epoch": 3.987255348202094, "grad_norm": 0.7637770706292029, "learning_rate": 5e-06, "loss": 0.6324, "step": 2190 }, { "epoch": 4.0, "eval_loss": 0.7583181262016296, "eval_runtime": 379.7995, "eval_samples_per_second": 38.971, "eval_steps_per_second": 0.611, "step": 2197 }, { "epoch": 4.005461993627674, "grad_norm": 1.3302589404721668, "learning_rate": 5e-06, "loss": 0.6642, "step": 2200 }, { "epoch": 4.023668639053255, "grad_norm": 1.0479601737495694, "learning_rate": 5e-06, "loss": 0.5747, "step": 2210 }, { "epoch": 4.041875284478834, "grad_norm": 0.8236224363789262, "learning_rate": 5e-06, "loss": 0.5739, "step": 2220 }, { "epoch": 4.060081929904415, "grad_norm": 0.6514312376533014, "learning_rate": 5e-06, "loss": 0.5717, "step": 2230 }, { "epoch": 4.078288575329996, "grad_norm": 0.6470033460056318, "learning_rate": 5e-06, "loss": 0.577, "step": 2240 }, { "epoch": 4.096495220755576, "grad_norm": 0.6842183198102436, "learning_rate": 5e-06, "loss": 0.5772, "step": 2250 }, { "epoch": 4.114701866181156, "grad_norm": 0.7773595484089195, "learning_rate": 5e-06, "loss": 0.5787, "step": 2260 }, { "epoch": 4.132908511606736, "grad_norm": 0.8512328139098053, "learning_rate": 5e-06, "loss": 0.5782, "step": 2270 }, { "epoch": 4.151115157032317, "grad_norm": 0.7556248928216761, "learning_rate": 5e-06, "loss": 0.5794, "step": 2280 }, { "epoch": 4.1693218024578975, "grad_norm": 0.649345136898707, "learning_rate": 5e-06, "loss": 0.5806, "step": 2290 }, { "epoch": 4.187528447883477, "grad_norm": 0.7218760078584716, "learning_rate": 5e-06, "loss": 0.5797, "step": 2300 }, { "epoch": 4.205735093309058, "grad_norm": 0.6931763868077586, "learning_rate": 5e-06, "loss": 0.5767, "step": 2310 }, { "epoch": 4.223941738734638, "grad_norm": 0.764188364334872, "learning_rate": 5e-06, "loss": 0.5778, "step": 2320 }, { "epoch": 4.2421483841602186, "grad_norm": 0.7472509039059302, "learning_rate": 5e-06, "loss": 0.5786, "step": 2330 }, { "epoch": 4.260355029585799, "grad_norm": 0.7681897599903893, "learning_rate": 5e-06, "loss": 0.5824, "step": 2340 }, { "epoch": 4.278561675011379, "grad_norm": 0.6853235525321896, "learning_rate": 5e-06, "loss": 0.578, "step": 2350 }, { "epoch": 4.29676832043696, "grad_norm": 0.6371627957254833, "learning_rate": 5e-06, "loss": 0.5767, "step": 2360 }, { "epoch": 4.31497496586254, "grad_norm": 0.62518793171934, "learning_rate": 5e-06, "loss": 0.5828, "step": 2370 }, { "epoch": 4.33318161128812, "grad_norm": 0.6835225300328861, "learning_rate": 5e-06, "loss": 0.5845, "step": 2380 }, { "epoch": 4.3513882567137, "grad_norm": 0.6635739669645049, "learning_rate": 5e-06, "loss": 0.5831, "step": 2390 }, { "epoch": 4.369594902139281, "grad_norm": 0.6905352305966937, "learning_rate": 5e-06, "loss": 0.5838, "step": 2400 }, { "epoch": 4.3878015475648615, "grad_norm": 0.675578669245968, "learning_rate": 5e-06, "loss": 0.5838, "step": 2410 }, { "epoch": 4.406008192990441, "grad_norm": 0.6472341173499976, "learning_rate": 5e-06, "loss": 0.5873, "step": 2420 }, { "epoch": 4.424214838416022, "grad_norm": 0.6312793186052579, "learning_rate": 5e-06, "loss": 0.5862, "step": 2430 }, { "epoch": 4.442421483841602, "grad_norm": 0.6451118193568525, "learning_rate": 5e-06, "loss": 0.5825, "step": 2440 }, { "epoch": 4.4606281292671826, "grad_norm": 0.6570443368465694, "learning_rate": 5e-06, "loss": 0.5854, "step": 2450 }, { "epoch": 4.478834774692763, "grad_norm": 0.658937290588797, "learning_rate": 5e-06, "loss": 0.5866, "step": 2460 }, { "epoch": 4.497041420118343, "grad_norm": 0.7077581907222236, "learning_rate": 5e-06, "loss": 0.5869, "step": 2470 }, { "epoch": 4.515248065543924, "grad_norm": 0.6792893518112693, "learning_rate": 5e-06, "loss": 0.5873, "step": 2480 }, { "epoch": 4.533454710969504, "grad_norm": 0.7399002369000202, "learning_rate": 5e-06, "loss": 0.5832, "step": 2490 }, { "epoch": 4.551661356395084, "grad_norm": 0.7316128957109573, "learning_rate": 5e-06, "loss": 0.5878, "step": 2500 }, { "epoch": 4.569868001820664, "grad_norm": 0.709723130607844, "learning_rate": 5e-06, "loss": 0.5844, "step": 2510 }, { "epoch": 4.588074647246245, "grad_norm": 0.6462822194584583, "learning_rate": 5e-06, "loss": 0.5883, "step": 2520 }, { "epoch": 4.6062812926718255, "grad_norm": 0.7572060951703973, "learning_rate": 5e-06, "loss": 0.5914, "step": 2530 }, { "epoch": 4.624487938097405, "grad_norm": 0.7698033683902142, "learning_rate": 5e-06, "loss": 0.5882, "step": 2540 }, { "epoch": 4.642694583522986, "grad_norm": 0.7913214081907007, "learning_rate": 5e-06, "loss": 0.5865, "step": 2550 }, { "epoch": 4.660901228948566, "grad_norm": 0.652908829617188, "learning_rate": 5e-06, "loss": 0.5885, "step": 2560 }, { "epoch": 4.6791078743741465, "grad_norm": 0.7196290964249576, "learning_rate": 5e-06, "loss": 0.585, "step": 2570 }, { "epoch": 4.697314519799727, "grad_norm": 0.6963685189277059, "learning_rate": 5e-06, "loss": 0.5874, "step": 2580 }, { "epoch": 4.715521165225307, "grad_norm": 0.6213064117674314, "learning_rate": 5e-06, "loss": 0.5886, "step": 2590 }, { "epoch": 4.733727810650888, "grad_norm": 0.6931094000594287, "learning_rate": 5e-06, "loss": 0.5931, "step": 2600 }, { "epoch": 4.751934456076468, "grad_norm": 0.6421262946164188, "learning_rate": 5e-06, "loss": 0.5877, "step": 2610 }, { "epoch": 4.770141101502048, "grad_norm": 0.7592504356723478, "learning_rate": 5e-06, "loss": 0.5901, "step": 2620 }, { "epoch": 4.788347746927629, "grad_norm": 0.5755276719477025, "learning_rate": 5e-06, "loss": 0.5877, "step": 2630 }, { "epoch": 4.806554392353209, "grad_norm": 0.7579206553363019, "learning_rate": 5e-06, "loss": 0.593, "step": 2640 }, { "epoch": 4.8247610377787895, "grad_norm": 0.7227783750503406, "learning_rate": 5e-06, "loss": 0.592, "step": 2650 }, { "epoch": 4.842967683204369, "grad_norm": 0.6143720314418375, "learning_rate": 5e-06, "loss": 0.5923, "step": 2660 }, { "epoch": 4.86117432862995, "grad_norm": 0.684711113595212, "learning_rate": 5e-06, "loss": 0.5906, "step": 2670 }, { "epoch": 4.879380974055531, "grad_norm": 0.6041365723044803, "learning_rate": 5e-06, "loss": 0.5943, "step": 2680 }, { "epoch": 4.8975876194811105, "grad_norm": 0.6690754978083606, "learning_rate": 5e-06, "loss": 0.5909, "step": 2690 }, { "epoch": 4.915794264906691, "grad_norm": 0.6408949623705275, "learning_rate": 5e-06, "loss": 0.5938, "step": 2700 }, { "epoch": 4.934000910332271, "grad_norm": 0.6310380336524642, "learning_rate": 5e-06, "loss": 0.5896, "step": 2710 }, { "epoch": 4.952207555757852, "grad_norm": 0.7658719466797836, "learning_rate": 5e-06, "loss": 0.5925, "step": 2720 }, { "epoch": 4.970414201183432, "grad_norm": 0.6764346949045434, "learning_rate": 5e-06, "loss": 0.587, "step": 2730 }, { "epoch": 4.988620846609012, "grad_norm": 0.6578929361436391, "learning_rate": 5e-06, "loss": 0.5947, "step": 2740 }, { "epoch": 4.999544833864361, "eval_loss": 0.7756755352020264, "eval_runtime": 379.7908, "eval_samples_per_second": 38.971, "eval_steps_per_second": 0.611, "step": 2746 }, { "epoch": 5.006827492034593, "grad_norm": 1.6971405469705954, "learning_rate": 5e-06, "loss": 0.6175, "step": 2750 }, { "epoch": 5.025034137460173, "grad_norm": 1.1249548156787617, "learning_rate": 5e-06, "loss": 0.5284, "step": 2760 }, { "epoch": 5.0432407828857535, "grad_norm": 0.8687787258109687, "learning_rate": 5e-06, "loss": 0.5308, "step": 2770 }, { "epoch": 5.061447428311333, "grad_norm": 0.8717275105088685, "learning_rate": 5e-06, "loss": 0.5312, "step": 2780 }, { "epoch": 5.079654073736914, "grad_norm": 0.8465418731492865, "learning_rate": 5e-06, "loss": 0.5304, "step": 2790 }, { "epoch": 5.097860719162495, "grad_norm": 0.7886283884740087, "learning_rate": 5e-06, "loss": 0.5279, "step": 2800 }, { "epoch": 5.1160673645880745, "grad_norm": 0.8398942783728177, "learning_rate": 5e-06, "loss": 0.5363, "step": 2810 }, { "epoch": 5.134274010013655, "grad_norm": 0.769923514886696, "learning_rate": 5e-06, "loss": 0.5338, "step": 2820 }, { "epoch": 5.152480655439235, "grad_norm": 0.7868390424124061, "learning_rate": 5e-06, "loss": 0.5321, "step": 2830 }, { "epoch": 5.170687300864816, "grad_norm": 0.7950518134725912, "learning_rate": 5e-06, "loss": 0.534, "step": 2840 }, { "epoch": 5.188893946290396, "grad_norm": 0.9279901061572416, "learning_rate": 5e-06, "loss": 0.5363, "step": 2850 }, { "epoch": 5.207100591715976, "grad_norm": 0.7594304483347795, "learning_rate": 5e-06, "loss": 0.534, "step": 2860 }, { "epoch": 5.225307237141557, "grad_norm": 0.8252531851052891, "learning_rate": 5e-06, "loss": 0.5349, "step": 2870 }, { "epoch": 5.243513882567137, "grad_norm": 0.7626524916862092, "learning_rate": 5e-06, "loss": 0.534, "step": 2880 }, { "epoch": 5.2617205279927175, "grad_norm": 0.78634458156834, "learning_rate": 5e-06, "loss": 0.5368, "step": 2890 }, { "epoch": 5.279927173418297, "grad_norm": 0.7800860821607554, "learning_rate": 5e-06, "loss": 0.5348, "step": 2900 }, { "epoch": 5.298133818843878, "grad_norm": 0.7198214581579875, "learning_rate": 5e-06, "loss": 0.539, "step": 2910 }, { "epoch": 5.316340464269459, "grad_norm": 0.8018455337446776, "learning_rate": 5e-06, "loss": 0.5354, "step": 2920 }, { "epoch": 5.3345471096950385, "grad_norm": 0.6375711217756587, "learning_rate": 5e-06, "loss": 0.538, "step": 2930 }, { "epoch": 5.352753755120619, "grad_norm": 0.7615279567325839, "learning_rate": 5e-06, "loss": 0.5388, "step": 2940 }, { "epoch": 5.370960400546199, "grad_norm": 0.8634503306148038, "learning_rate": 5e-06, "loss": 0.5387, "step": 2950 }, { "epoch": 5.38916704597178, "grad_norm": 0.728328161040519, "learning_rate": 5e-06, "loss": 0.5392, "step": 2960 }, { "epoch": 5.4073736913973605, "grad_norm": 0.6714611489114068, "learning_rate": 5e-06, "loss": 0.5393, "step": 2970 }, { "epoch": 5.42558033682294, "grad_norm": 0.864160433406934, "learning_rate": 5e-06, "loss": 0.5406, "step": 2980 }, { "epoch": 5.443786982248521, "grad_norm": 0.7597928888037461, "learning_rate": 5e-06, "loss": 0.5368, "step": 2990 }, { "epoch": 5.461993627674101, "grad_norm": 0.7636646866418205, "learning_rate": 5e-06, "loss": 0.5403, "step": 3000 }, { "epoch": 5.4802002730996815, "grad_norm": 0.7478936763176718, "learning_rate": 5e-06, "loss": 0.5411, "step": 3010 }, { "epoch": 5.498406918525261, "grad_norm": 0.7764354445181351, "learning_rate": 5e-06, "loss": 0.5411, "step": 3020 }, { "epoch": 5.516613563950842, "grad_norm": 0.8232735346509414, "learning_rate": 5e-06, "loss": 0.5433, "step": 3030 }, { "epoch": 5.534820209376423, "grad_norm": 0.7742332026679888, "learning_rate": 5e-06, "loss": 0.5459, "step": 3040 }, { "epoch": 5.5530268548020025, "grad_norm": 0.8806327502433587, "learning_rate": 5e-06, "loss": 0.5456, "step": 3050 }, { "epoch": 5.571233500227583, "grad_norm": 0.8857449291804894, "learning_rate": 5e-06, "loss": 0.5434, "step": 3060 }, { "epoch": 5.589440145653163, "grad_norm": 0.9161045429573774, "learning_rate": 5e-06, "loss": 0.5381, "step": 3070 }, { "epoch": 5.607646791078744, "grad_norm": 0.7948317610189037, "learning_rate": 5e-06, "loss": 0.5424, "step": 3080 }, { "epoch": 5.6258534365043245, "grad_norm": 0.7867485103330927, "learning_rate": 5e-06, "loss": 0.5434, "step": 3090 }, { "epoch": 5.644060081929904, "grad_norm": 0.7569175444017539, "learning_rate": 5e-06, "loss": 0.5456, "step": 3100 }, { "epoch": 5.662266727355485, "grad_norm": 0.7425402095430522, "learning_rate": 5e-06, "loss": 0.5437, "step": 3110 }, { "epoch": 5.680473372781065, "grad_norm": 0.8566468508292944, "learning_rate": 5e-06, "loss": 0.543, "step": 3120 }, { "epoch": 5.6986800182066455, "grad_norm": 0.7291267040452126, "learning_rate": 5e-06, "loss": 0.5456, "step": 3130 }, { "epoch": 5.716886663632225, "grad_norm": 0.9339173337499906, "learning_rate": 5e-06, "loss": 0.5488, "step": 3140 }, { "epoch": 5.735093309057806, "grad_norm": 0.668254133197116, "learning_rate": 5e-06, "loss": 0.5461, "step": 3150 }, { "epoch": 5.753299954483387, "grad_norm": 0.733779005468029, "learning_rate": 5e-06, "loss": 0.548, "step": 3160 }, { "epoch": 5.7715065999089665, "grad_norm": 0.6817898908805843, "learning_rate": 5e-06, "loss": 0.5471, "step": 3170 }, { "epoch": 5.789713245334547, "grad_norm": 0.7061838303703719, "learning_rate": 5e-06, "loss": 0.5435, "step": 3180 }, { "epoch": 5.807919890760127, "grad_norm": 0.6788561279223211, "learning_rate": 5e-06, "loss": 0.5485, "step": 3190 }, { "epoch": 5.826126536185708, "grad_norm": 0.743542468839561, "learning_rate": 5e-06, "loss": 0.5442, "step": 3200 }, { "epoch": 5.8443331816112885, "grad_norm": 0.7126719114866883, "learning_rate": 5e-06, "loss": 0.545, "step": 3210 }, { "epoch": 5.862539827036868, "grad_norm": 0.7700757305885421, "learning_rate": 5e-06, "loss": 0.5476, "step": 3220 }, { "epoch": 5.880746472462449, "grad_norm": 0.8580615971421329, "learning_rate": 5e-06, "loss": 0.5496, "step": 3230 }, { "epoch": 5.898953117888029, "grad_norm": 0.6651944277970823, "learning_rate": 5e-06, "loss": 0.5467, "step": 3240 }, { "epoch": 5.9171597633136095, "grad_norm": 0.7351660383841082, "learning_rate": 5e-06, "loss": 0.5498, "step": 3250 }, { "epoch": 5.93536640873919, "grad_norm": 0.7432057163838226, "learning_rate": 5e-06, "loss": 0.5457, "step": 3260 }, { "epoch": 5.95357305416477, "grad_norm": 0.7880258422896124, "learning_rate": 5e-06, "loss": 0.5484, "step": 3270 }, { "epoch": 5.971779699590351, "grad_norm": 0.7829267364761494, "learning_rate": 5e-06, "loss": 0.5522, "step": 3280 }, { "epoch": 5.9899863450159305, "grad_norm": 0.755975814943294, "learning_rate": 5e-06, "loss": 0.5504, "step": 3290 }, { "epoch": 5.999089667728721, "eval_loss": 0.8121561408042908, "eval_runtime": 385.6914, "eval_samples_per_second": 38.375, "eval_steps_per_second": 0.602, "step": 3295 }, { "epoch": 6.008192990441511, "grad_norm": 1.229518253610835, "learning_rate": 5e-06, "loss": 0.5616, "step": 3300 }, { "epoch": 6.026399635867092, "grad_norm": 0.9690039493999913, "learning_rate": 5e-06, "loss": 0.4806, "step": 3310 }, { "epoch": 6.044606281292672, "grad_norm": 0.975061524739041, "learning_rate": 5e-06, "loss": 0.4807, "step": 3320 }, { "epoch": 6.0628129267182524, "grad_norm": 0.8693905009394387, "learning_rate": 5e-06, "loss": 0.48, "step": 3330 }, { "epoch": 6.081019572143832, "grad_norm": 0.8533904059239436, "learning_rate": 5e-06, "loss": 0.4814, "step": 3340 }, { "epoch": 6.099226217569413, "grad_norm": 0.9743084553071109, "learning_rate": 5e-06, "loss": 0.4832, "step": 3350 }, { "epoch": 6.117432862994993, "grad_norm": 0.8428587686314428, "learning_rate": 5e-06, "loss": 0.4836, "step": 3360 }, { "epoch": 6.1356395084205735, "grad_norm": 0.8485721574906323, "learning_rate": 5e-06, "loss": 0.4842, "step": 3370 }, { "epoch": 6.153846153846154, "grad_norm": 0.9049634307737825, "learning_rate": 5e-06, "loss": 0.4871, "step": 3380 }, { "epoch": 6.172052799271734, "grad_norm": 0.8485250729351949, "learning_rate": 5e-06, "loss": 0.4891, "step": 3390 }, { "epoch": 6.190259444697315, "grad_norm": 0.8677063446883995, "learning_rate": 5e-06, "loss": 0.4863, "step": 3400 }, { "epoch": 6.2084660901228945, "grad_norm": 0.8067384460711394, "learning_rate": 5e-06, "loss": 0.4847, "step": 3410 }, { "epoch": 6.226672735548475, "grad_norm": 0.8491856381367863, "learning_rate": 5e-06, "loss": 0.4872, "step": 3420 }, { "epoch": 6.244879380974056, "grad_norm": 0.8434795569738153, "learning_rate": 5e-06, "loss": 0.4893, "step": 3430 }, { "epoch": 6.263086026399636, "grad_norm": 0.877180400082389, "learning_rate": 5e-06, "loss": 0.488, "step": 3440 }, { "epoch": 6.2812926718252164, "grad_norm": 0.7544909031927692, "learning_rate": 5e-06, "loss": 0.4926, "step": 3450 }, { "epoch": 6.299499317250796, "grad_norm": 0.7786685673980743, "learning_rate": 5e-06, "loss": 0.4903, "step": 3460 }, { "epoch": 6.317705962676377, "grad_norm": 0.8813259287698524, "learning_rate": 5e-06, "loss": 0.4928, "step": 3470 }, { "epoch": 6.335912608101957, "grad_norm": 0.8143534255862093, "learning_rate": 5e-06, "loss": 0.489, "step": 3480 }, { "epoch": 6.3541192535275375, "grad_norm": 0.813093240140628, "learning_rate": 5e-06, "loss": 0.4895, "step": 3490 }, { "epoch": 6.372325898953118, "grad_norm": 0.8234651442993571, "learning_rate": 5e-06, "loss": 0.4874, "step": 3500 }, { "epoch": 6.390532544378698, "grad_norm": 0.7583974230123129, "learning_rate": 5e-06, "loss": 0.4894, "step": 3510 }, { "epoch": 6.408739189804279, "grad_norm": 0.8045709791368885, "learning_rate": 5e-06, "loss": 0.4924, "step": 3520 }, { "epoch": 6.4269458352298585, "grad_norm": 0.8021037971072071, "learning_rate": 5e-06, "loss": 0.4958, "step": 3530 }, { "epoch": 6.445152480655439, "grad_norm": 0.8860947638978545, "learning_rate": 5e-06, "loss": 0.4936, "step": 3540 }, { "epoch": 6.46335912608102, "grad_norm": 0.778005059457562, "learning_rate": 5e-06, "loss": 0.4904, "step": 3550 }, { "epoch": 6.4815657715066, "grad_norm": 0.8879225211432363, "learning_rate": 5e-06, "loss": 0.4996, "step": 3560 }, { "epoch": 6.49977241693218, "grad_norm": 0.86115853799121, "learning_rate": 5e-06, "loss": 0.4958, "step": 3570 }, { "epoch": 6.51797906235776, "grad_norm": 0.821923783763917, "learning_rate": 5e-06, "loss": 0.4998, "step": 3580 }, { "epoch": 6.536185707783341, "grad_norm": 0.7488573715925901, "learning_rate": 5e-06, "loss": 0.4942, "step": 3590 }, { "epoch": 6.554392353208922, "grad_norm": 0.766478109512254, "learning_rate": 5e-06, "loss": 0.4927, "step": 3600 }, { "epoch": 6.5725989986345015, "grad_norm": 0.9993393195281365, "learning_rate": 5e-06, "loss": 0.5009, "step": 3610 }, { "epoch": 6.590805644060082, "grad_norm": 1.0105814992493984, "learning_rate": 5e-06, "loss": 0.4902, "step": 3620 }, { "epoch": 6.609012289485662, "grad_norm": 0.918270960348232, "learning_rate": 5e-06, "loss": 0.5004, "step": 3630 }, { "epoch": 6.627218934911243, "grad_norm": 0.7543985269097522, "learning_rate": 5e-06, "loss": 0.4972, "step": 3640 }, { "epoch": 6.645425580336823, "grad_norm": 0.9591362111868698, "learning_rate": 5e-06, "loss": 0.4978, "step": 3650 }, { "epoch": 6.663632225762403, "grad_norm": 0.8732535087129343, "learning_rate": 5e-06, "loss": 0.4962, "step": 3660 }, { "epoch": 6.681838871187984, "grad_norm": 0.8907840024813226, "learning_rate": 5e-06, "loss": 0.4984, "step": 3670 }, { "epoch": 6.700045516613564, "grad_norm": 0.8081950411988741, "learning_rate": 5e-06, "loss": 0.4995, "step": 3680 }, { "epoch": 6.718252162039144, "grad_norm": 0.7957053619070132, "learning_rate": 5e-06, "loss": 0.5007, "step": 3690 }, { "epoch": 6.736458807464724, "grad_norm": 1.0396693696127743, "learning_rate": 5e-06, "loss": 0.5, "step": 3700 }, { "epoch": 6.754665452890305, "grad_norm": 0.8276731947592968, "learning_rate": 5e-06, "loss": 0.5003, "step": 3710 }, { "epoch": 6.772872098315886, "grad_norm": 0.740304498205183, "learning_rate": 5e-06, "loss": 0.498, "step": 3720 }, { "epoch": 6.7910787437414655, "grad_norm": 0.7463156387752775, "learning_rate": 5e-06, "loss": 0.4994, "step": 3730 }, { "epoch": 6.809285389167046, "grad_norm": 0.8016121539442179, "learning_rate": 5e-06, "loss": 0.4997, "step": 3740 }, { "epoch": 6.827492034592626, "grad_norm": 0.8418329142580222, "learning_rate": 5e-06, "loss": 0.4982, "step": 3750 }, { "epoch": 6.845698680018207, "grad_norm": 0.8007293534037732, "learning_rate": 5e-06, "loss": 0.5008, "step": 3760 }, { "epoch": 6.8639053254437865, "grad_norm": 0.7962992985732563, "learning_rate": 5e-06, "loss": 0.4983, "step": 3770 }, { "epoch": 6.882111970869367, "grad_norm": 0.8192800444422219, "learning_rate": 5e-06, "loss": 0.5012, "step": 3780 }, { "epoch": 6.900318616294948, "grad_norm": 0.7933819169487161, "learning_rate": 5e-06, "loss": 0.4993, "step": 3790 }, { "epoch": 6.918525261720528, "grad_norm": 0.7719557878224939, "learning_rate": 5e-06, "loss": 0.4969, "step": 3800 }, { "epoch": 6.936731907146108, "grad_norm": 0.7794726223761498, "learning_rate": 5e-06, "loss": 0.4997, "step": 3810 }, { "epoch": 6.954938552571688, "grad_norm": 0.7490795619365134, "learning_rate": 5e-06, "loss": 0.4995, "step": 3820 }, { "epoch": 6.973145197997269, "grad_norm": 0.8534935347128176, "learning_rate": 5e-06, "loss": 0.5045, "step": 3830 }, { "epoch": 6.99135184342285, "grad_norm": 0.8266159513619395, "learning_rate": 5e-06, "loss": 0.5002, "step": 3840 }, { "epoch": 6.998634501593082, "eval_loss": 0.8599892854690552, "eval_runtime": 382.211, "eval_samples_per_second": 38.725, "eval_steps_per_second": 0.607, "step": 3844 }, { "epoch": 7.0095584888484295, "grad_norm": 1.4929469131394506, "learning_rate": 5e-06, "loss": 0.507, "step": 3850 }, { "epoch": 7.02776513427401, "grad_norm": 0.9806727302100092, "learning_rate": 5e-06, "loss": 0.4258, "step": 3860 }, { "epoch": 7.04597177969959, "grad_norm": 1.0186437284049983, "learning_rate": 5e-06, "loss": 0.4304, "step": 3870 }, { "epoch": 7.064178425125171, "grad_norm": 0.8346902224337589, "learning_rate": 5e-06, "loss": 0.4315, "step": 3880 }, { "epoch": 7.082385070550751, "grad_norm": 0.8895988414802425, "learning_rate": 5e-06, "loss": 0.4321, "step": 3890 }, { "epoch": 7.100591715976331, "grad_norm": 0.8968273584319876, "learning_rate": 5e-06, "loss": 0.4316, "step": 3900 }, { "epoch": 7.118798361401912, "grad_norm": 1.0430199362279293, "learning_rate": 5e-06, "loss": 0.4352, "step": 3910 }, { "epoch": 7.137005006827492, "grad_norm": 0.9925445246056604, "learning_rate": 5e-06, "loss": 0.4335, "step": 3920 }, { "epoch": 7.155211652253072, "grad_norm": 0.9236834017922617, "learning_rate": 5e-06, "loss": 0.4344, "step": 3930 }, { "epoch": 7.173418297678653, "grad_norm": 0.9006887934639467, "learning_rate": 5e-06, "loss": 0.4354, "step": 3940 }, { "epoch": 7.191624943104233, "grad_norm": 1.021746418559959, "learning_rate": 5e-06, "loss": 0.4337, "step": 3950 }, { "epoch": 7.209831588529814, "grad_norm": 0.8820115554192361, "learning_rate": 5e-06, "loss": 0.4356, "step": 3960 }, { "epoch": 7.2280382339553935, "grad_norm": 0.9450373403984087, "learning_rate": 5e-06, "loss": 0.4346, "step": 3970 }, { "epoch": 7.246244879380974, "grad_norm": 0.9599720579833242, "learning_rate": 5e-06, "loss": 0.437, "step": 3980 }, { "epoch": 7.264451524806554, "grad_norm": 0.944022553759221, "learning_rate": 5e-06, "loss": 0.4373, "step": 3990 }, { "epoch": 7.282658170232135, "grad_norm": 1.1470515929421334, "learning_rate": 5e-06, "loss": 0.437, "step": 4000 }, { "epoch": 7.300864815657715, "grad_norm": 0.9229924188228826, "learning_rate": 5e-06, "loss": 0.4412, "step": 4010 }, { "epoch": 7.319071461083295, "grad_norm": 1.1004139942729736, "learning_rate": 5e-06, "loss": 0.4404, "step": 4020 }, { "epoch": 7.337278106508876, "grad_norm": 0.9234895336641952, "learning_rate": 5e-06, "loss": 0.4413, "step": 4030 }, { "epoch": 7.355484751934456, "grad_norm": 0.9191069378971881, "learning_rate": 5e-06, "loss": 0.4374, "step": 4040 }, { "epoch": 7.373691397360036, "grad_norm": 0.9565597630772228, "learning_rate": 5e-06, "loss": 0.4427, "step": 4050 }, { "epoch": 7.391898042785617, "grad_norm": 0.8991823052622302, "learning_rate": 5e-06, "loss": 0.4408, "step": 4060 }, { "epoch": 7.410104688211197, "grad_norm": 0.9463545273318638, "learning_rate": 5e-06, "loss": 0.4428, "step": 4070 }, { "epoch": 7.428311333636778, "grad_norm": 0.8995180480466028, "learning_rate": 5e-06, "loss": 0.439, "step": 4080 }, { "epoch": 7.4465179790623575, "grad_norm": 0.8848211155155944, "learning_rate": 5e-06, "loss": 0.4408, "step": 4090 }, { "epoch": 7.464724624487938, "grad_norm": 0.9344565804246009, "learning_rate": 5e-06, "loss": 0.4426, "step": 4100 }, { "epoch": 7.482931269913518, "grad_norm": 0.9397047554338785, "learning_rate": 5e-06, "loss": 0.4422, "step": 4110 }, { "epoch": 7.501137915339099, "grad_norm": 1.0544561679078621, "learning_rate": 5e-06, "loss": 0.4433, "step": 4120 }, { "epoch": 7.519344560764679, "grad_norm": 1.018669421540764, "learning_rate": 5e-06, "loss": 0.4425, "step": 4130 }, { "epoch": 7.537551206190259, "grad_norm": 0.884634669314439, "learning_rate": 5e-06, "loss": 0.4447, "step": 4140 }, { "epoch": 7.55575785161584, "grad_norm": 0.8973368938963501, "learning_rate": 5e-06, "loss": 0.4414, "step": 4150 }, { "epoch": 7.57396449704142, "grad_norm": 1.0298222416647689, "learning_rate": 5e-06, "loss": 0.4475, "step": 4160 }, { "epoch": 7.592171142467, "grad_norm": 0.9090837637474475, "learning_rate": 5e-06, "loss": 0.4427, "step": 4170 }, { "epoch": 7.610377787892581, "grad_norm": 0.9114871413052325, "learning_rate": 5e-06, "loss": 0.4464, "step": 4180 }, { "epoch": 7.628584433318161, "grad_norm": 0.8765155235059887, "learning_rate": 5e-06, "loss": 0.4459, "step": 4190 }, { "epoch": 7.646791078743742, "grad_norm": 0.8572465195574555, "learning_rate": 5e-06, "loss": 0.4466, "step": 4200 }, { "epoch": 7.6649977241693215, "grad_norm": 0.8619953574827645, "learning_rate": 5e-06, "loss": 0.4449, "step": 4210 }, { "epoch": 7.683204369594902, "grad_norm": 0.9047350844927557, "learning_rate": 5e-06, "loss": 0.4491, "step": 4220 }, { "epoch": 7.701411015020483, "grad_norm": 0.8935472556665444, "learning_rate": 5e-06, "loss": 0.4483, "step": 4230 }, { "epoch": 7.719617660446063, "grad_norm": 0.8653782388568667, "learning_rate": 5e-06, "loss": 0.4475, "step": 4240 }, { "epoch": 7.737824305871643, "grad_norm": 0.881016800265456, "learning_rate": 5e-06, "loss": 0.451, "step": 4250 }, { "epoch": 7.756030951297223, "grad_norm": 0.9020813961697224, "learning_rate": 5e-06, "loss": 0.448, "step": 4260 }, { "epoch": 7.774237596722804, "grad_norm": 0.9716168118712624, "learning_rate": 5e-06, "loss": 0.4523, "step": 4270 }, { "epoch": 7.792444242148385, "grad_norm": 0.9179534064504127, "learning_rate": 5e-06, "loss": 0.4515, "step": 4280 }, { "epoch": 7.810650887573964, "grad_norm": 0.9614798912890371, "learning_rate": 5e-06, "loss": 0.448, "step": 4290 }, { "epoch": 7.828857532999545, "grad_norm": 0.9044673066961919, "learning_rate": 5e-06, "loss": 0.452, "step": 4300 }, { "epoch": 7.847064178425125, "grad_norm": 0.9051177289315938, "learning_rate": 5e-06, "loss": 0.4502, "step": 4310 }, { "epoch": 7.865270823850706, "grad_norm": 0.9751136269675025, "learning_rate": 5e-06, "loss": 0.4508, "step": 4320 }, { "epoch": 7.883477469276286, "grad_norm": 0.9479807771925007, "learning_rate": 5e-06, "loss": 0.4481, "step": 4330 }, { "epoch": 7.901684114701866, "grad_norm": 1.0019388262723266, "learning_rate": 5e-06, "loss": 0.4508, "step": 4340 }, { "epoch": 7.919890760127447, "grad_norm": 0.8485770165242006, "learning_rate": 5e-06, "loss": 0.4488, "step": 4350 }, { "epoch": 7.938097405553027, "grad_norm": 0.9440844580758145, "learning_rate": 5e-06, "loss": 0.453, "step": 4360 }, { "epoch": 7.956304050978607, "grad_norm": 0.9450193155443942, "learning_rate": 5e-06, "loss": 0.4519, "step": 4370 }, { "epoch": 7.974510696404187, "grad_norm": 0.9124151727103043, "learning_rate": 5e-06, "loss": 0.4542, "step": 4380 }, { "epoch": 7.992717341829768, "grad_norm": 0.9474262270163373, "learning_rate": 5e-06, "loss": 0.4505, "step": 4390 }, { "epoch": 7.9963586709148835, "eval_loss": 0.9232881665229797, "eval_runtime": 369.711, "eval_samples_per_second": 40.034, "eval_steps_per_second": 0.628, "step": 4392 }, { "epoch": 7.9963586709148835, "step": 4392, "total_flos": 7356554202316800.0, "train_loss": 0.6099752168050664, "train_runtime": 144891.7905, "train_samples_per_second": 15.527, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 4392, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7356554202316800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }