{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9996235648409562, "eval_steps": 500, "global_step": 7968, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012547838634795157, "grad_norm": 25.442365646362305, "learning_rate": 4.1666666666666667e-07, "loss": 0.1369, "step": 5 }, { "epoch": 0.0025095677269590315, "grad_norm": 2707.152587890625, "learning_rate": 8.333333333333333e-07, "loss": 0.1494, "step": 10 }, { "epoch": 0.003764351590438547, "grad_norm": 1.4790934324264526, "learning_rate": 1.25e-06, "loss": 0.1436, "step": 15 }, { "epoch": 0.005019135453918063, "grad_norm": 6611.14990234375, "learning_rate": 1.6666666666666667e-06, "loss": 0.1239, "step": 20 }, { "epoch": 0.006273919317397578, "grad_norm": 3.4831368923187256, "learning_rate": 2.0833333333333334e-06, "loss": 0.1313, "step": 25 }, { "epoch": 0.007528703180877094, "grad_norm": 0.8000121712684631, "learning_rate": 2.5e-06, "loss": 0.1203, "step": 30 }, { "epoch": 0.00878348704435661, "grad_norm": 0.5967940092086792, "learning_rate": 2.916666666666667e-06, "loss": 0.1185, "step": 35 }, { "epoch": 0.010038270907836126, "grad_norm": 3509.61083984375, "learning_rate": 3.3333333333333333e-06, "loss": 0.1243, "step": 40 }, { "epoch": 0.01129305477131564, "grad_norm": 4.068423271179199, "learning_rate": 3.7500000000000005e-06, "loss": 0.123, "step": 45 }, { "epoch": 0.012547838634795156, "grad_norm": 981.9570922851562, "learning_rate": 4.166666666666667e-06, "loss": 0.1358, "step": 50 }, { "epoch": 0.013802622498274672, "grad_norm": 565.902099609375, "learning_rate": 4.583333333333333e-06, "loss": 0.1181, "step": 55 }, { "epoch": 0.015057406361754188, "grad_norm": 1.1762653589248657, "learning_rate": 5e-06, "loss": 0.1138, "step": 60 }, { "epoch": 0.016312190225233704, "grad_norm": 13.798249244689941, "learning_rate": 5.416666666666667e-06, "loss": 0.1303, "step": 65 }, { "epoch": 0.01756697408871322, "grad_norm": 1.9088438749313354, "learning_rate": 5.833333333333334e-06, "loss": 0.128, "step": 70 }, { "epoch": 0.018821757952192736, "grad_norm": 379.896484375, "learning_rate": 6.25e-06, "loss": 0.1115, "step": 75 }, { "epoch": 0.020076541815672252, "grad_norm": 53.034820556640625, "learning_rate": 6.666666666666667e-06, "loss": 0.1106, "step": 80 }, { "epoch": 0.021331325679151768, "grad_norm": 0.5621482729911804, "learning_rate": 7.083333333333335e-06, "loss": 0.1339, "step": 85 }, { "epoch": 0.02258610954263128, "grad_norm": 1503.7694091796875, "learning_rate": 7.500000000000001e-06, "loss": 0.1091, "step": 90 }, { "epoch": 0.023840893406110796, "grad_norm": 21.09101676940918, "learning_rate": 7.916666666666667e-06, "loss": 0.1027, "step": 95 }, { "epoch": 0.025095677269590312, "grad_norm": 153.1422882080078, "learning_rate": 8.333333333333334e-06, "loss": 0.1158, "step": 100 }, { "epoch": 0.026350461133069828, "grad_norm": 686.55615234375, "learning_rate": 8.750000000000001e-06, "loss": 0.1239, "step": 105 }, { "epoch": 0.027605244996549344, "grad_norm": 456.04541015625, "learning_rate": 9.166666666666666e-06, "loss": 0.1176, "step": 110 }, { "epoch": 0.02886002886002886, "grad_norm": 244.83534240722656, "learning_rate": 9.583333333333335e-06, "loss": 0.0952, "step": 115 }, { "epoch": 0.030114812723508376, "grad_norm": 720.8134155273438, "learning_rate": 1e-05, "loss": 0.1181, "step": 120 }, { "epoch": 0.03136959658698789, "grad_norm": 1.439946174621582, "learning_rate": 1.0416666666666668e-05, "loss": 0.1147, "step": 125 }, { "epoch": 0.03262438045046741, "grad_norm": 24.099985122680664, "learning_rate": 1.0833333333333334e-05, "loss": 0.1133, "step": 130 }, { "epoch": 0.03387916431394692, "grad_norm": 10.10527515411377, "learning_rate": 1.125e-05, "loss": 0.1237, "step": 135 }, { "epoch": 0.03513394817742644, "grad_norm": 171.32110595703125, "learning_rate": 1.1666666666666668e-05, "loss": 0.3536, "step": 140 }, { "epoch": 0.03638873204090595, "grad_norm": 129.24395751953125, "learning_rate": 1.2083333333333333e-05, "loss": 0.1079, "step": 145 }, { "epoch": 0.03764351590438547, "grad_norm": 3.056920289993286, "learning_rate": 1.25e-05, "loss": 0.1223, "step": 150 }, { "epoch": 0.038898299767864984, "grad_norm": 58.41600036621094, "learning_rate": 1.2916666666666668e-05, "loss": 0.094, "step": 155 }, { "epoch": 0.040153083631344504, "grad_norm": 67.04778289794922, "learning_rate": 1.3333333333333333e-05, "loss": 0.1181, "step": 160 }, { "epoch": 0.041407867494824016, "grad_norm": 152.23841857910156, "learning_rate": 1.375e-05, "loss": 0.1122, "step": 165 }, { "epoch": 0.042662651358303535, "grad_norm": 35.90134048461914, "learning_rate": 1.416666666666667e-05, "loss": 0.1152, "step": 170 }, { "epoch": 0.04391743522178305, "grad_norm": 1.0190317630767822, "learning_rate": 1.4583333333333333e-05, "loss": 0.1138, "step": 175 }, { "epoch": 0.04517221908526256, "grad_norm": 0.9965628385543823, "learning_rate": 1.5000000000000002e-05, "loss": 0.1127, "step": 180 }, { "epoch": 0.04642700294874208, "grad_norm": 1.6608027219772339, "learning_rate": 1.5416666666666668e-05, "loss": 0.1114, "step": 185 }, { "epoch": 0.04768178681222159, "grad_norm": 1.2562459707260132, "learning_rate": 1.5833333333333333e-05, "loss": 0.1051, "step": 190 }, { "epoch": 0.04893657067570111, "grad_norm": 0.13770118355751038, "learning_rate": 1.6250000000000002e-05, "loss": 0.1049, "step": 195 }, { "epoch": 0.050191354539180624, "grad_norm": 0.3370998799800873, "learning_rate": 1.6666666666666667e-05, "loss": 0.1044, "step": 200 }, { "epoch": 0.051446138402660144, "grad_norm": 0.5696057081222534, "learning_rate": 1.7083333333333333e-05, "loss": 0.1049, "step": 205 }, { "epoch": 0.052700922266139656, "grad_norm": 0.24888598918914795, "learning_rate": 1.7500000000000002e-05, "loss": 0.1065, "step": 210 }, { "epoch": 0.053955706129619176, "grad_norm": 0.5320826172828674, "learning_rate": 1.7916666666666667e-05, "loss": 0.1044, "step": 215 }, { "epoch": 0.05521048999309869, "grad_norm": 0.28355732560157776, "learning_rate": 1.8333333333333333e-05, "loss": 0.1189, "step": 220 }, { "epoch": 0.05646527385657821, "grad_norm": 0.20372584462165833, "learning_rate": 1.8750000000000002e-05, "loss": 0.1038, "step": 225 }, { "epoch": 0.05772005772005772, "grad_norm": 0.5983103513717651, "learning_rate": 1.916666666666667e-05, "loss": 0.0955, "step": 230 }, { "epoch": 0.05897484158353723, "grad_norm": 0.36013373732566833, "learning_rate": 1.9583333333333333e-05, "loss": 0.1012, "step": 235 }, { "epoch": 0.06022962544701675, "grad_norm": 0.20898281037807465, "learning_rate": 2e-05, "loss": 0.0993, "step": 240 }, { "epoch": 0.061484409310496264, "grad_norm": 0.29076099395751953, "learning_rate": 1.999997934261318e-05, "loss": 0.106, "step": 245 }, { "epoch": 0.06273919317397578, "grad_norm": 0.8422909379005432, "learning_rate": 1.999991737053805e-05, "loss": 0.0944, "step": 250 }, { "epoch": 0.0639939770374553, "grad_norm": 0.165451779961586, "learning_rate": 1.999981408403066e-05, "loss": 0.1004, "step": 255 }, { "epoch": 0.06524876090093482, "grad_norm": 0.14131979644298553, "learning_rate": 1.9999669483517726e-05, "loss": 0.0877, "step": 260 }, { "epoch": 0.06650354476441434, "grad_norm": 0.2859463095664978, "learning_rate": 1.9999483569596664e-05, "loss": 0.0939, "step": 265 }, { "epoch": 0.06775832862789384, "grad_norm": 0.26985302567481995, "learning_rate": 1.9999256343035577e-05, "loss": 0.1005, "step": 270 }, { "epoch": 0.06901311249137336, "grad_norm": 0.16001251339912415, "learning_rate": 1.9998987804773244e-05, "loss": 0.0991, "step": 275 }, { "epoch": 0.07026789635485288, "grad_norm": 0.28724098205566406, "learning_rate": 1.9998677955919127e-05, "loss": 0.1039, "step": 280 }, { "epoch": 0.0715226802183324, "grad_norm": 0.10330358892679214, "learning_rate": 1.9998326797753352e-05, "loss": 0.0992, "step": 285 }, { "epoch": 0.0727774640818119, "grad_norm": 0.630402147769928, "learning_rate": 1.999793433172673e-05, "loss": 0.1055, "step": 290 }, { "epoch": 0.07403224794529142, "grad_norm": 0.3052836060523987, "learning_rate": 1.9997500559460718e-05, "loss": 0.0966, "step": 295 }, { "epoch": 0.07528703180877094, "grad_norm": 0.3691750466823578, "learning_rate": 1.999702548274744e-05, "loss": 0.1047, "step": 300 }, { "epoch": 0.07654181567225045, "grad_norm": 0.47557133436203003, "learning_rate": 1.999650910354967e-05, "loss": 0.1022, "step": 305 }, { "epoch": 0.07779659953572997, "grad_norm": 0.21415193378925323, "learning_rate": 1.999595142400081e-05, "loss": 0.1075, "step": 310 }, { "epoch": 0.07905138339920949, "grad_norm": 0.3545961081981659, "learning_rate": 1.99953524464049e-05, "loss": 0.096, "step": 315 }, { "epoch": 0.08030616726268901, "grad_norm": 0.3949846029281616, "learning_rate": 1.9994712173236604e-05, "loss": 0.0974, "step": 320 }, { "epoch": 0.08156095112616851, "grad_norm": 1.0949060916900635, "learning_rate": 1.9994030607141196e-05, "loss": 0.0973, "step": 325 }, { "epoch": 0.08281573498964803, "grad_norm": 0.35101133584976196, "learning_rate": 1.9993307750934555e-05, "loss": 0.0965, "step": 330 }, { "epoch": 0.08407051885312755, "grad_norm": 0.29526200890541077, "learning_rate": 1.999254360760314e-05, "loss": 0.0898, "step": 335 }, { "epoch": 0.08532530271660707, "grad_norm": 0.09260907024145126, "learning_rate": 1.999173818030399e-05, "loss": 0.0948, "step": 340 }, { "epoch": 0.08658008658008658, "grad_norm": 0.2705346345901489, "learning_rate": 1.999089147236472e-05, "loss": 0.0946, "step": 345 }, { "epoch": 0.0878348704435661, "grad_norm": 0.3629266917705536, "learning_rate": 1.999000348728347e-05, "loss": 0.0999, "step": 350 }, { "epoch": 0.08908965430704562, "grad_norm": 0.11858992278575897, "learning_rate": 1.9989074228728942e-05, "loss": 0.0931, "step": 355 }, { "epoch": 0.09034443817052512, "grad_norm": 0.41660797595977783, "learning_rate": 1.9988103700540345e-05, "loss": 0.1047, "step": 360 }, { "epoch": 0.09159922203400464, "grad_norm": 0.7686080932617188, "learning_rate": 1.9987091906727387e-05, "loss": 0.0968, "step": 365 }, { "epoch": 0.09285400589748416, "grad_norm": 0.3915267884731293, "learning_rate": 1.998603885147028e-05, "loss": 0.0947, "step": 370 }, { "epoch": 0.09410878976096368, "grad_norm": 0.23744699358940125, "learning_rate": 1.998494453911969e-05, "loss": 0.0915, "step": 375 }, { "epoch": 0.09536357362444318, "grad_norm": 0.11716315150260925, "learning_rate": 1.9983808974196752e-05, "loss": 0.1005, "step": 380 }, { "epoch": 0.0966183574879227, "grad_norm": 0.355939656496048, "learning_rate": 1.9982632161393022e-05, "loss": 0.0992, "step": 385 }, { "epoch": 0.09787314135140222, "grad_norm": 0.23542310297489166, "learning_rate": 1.9981414105570473e-05, "loss": 0.0864, "step": 390 }, { "epoch": 0.09912792521488174, "grad_norm": 0.5069738626480103, "learning_rate": 1.9980154811761482e-05, "loss": 0.0929, "step": 395 }, { "epoch": 0.10038270907836125, "grad_norm": 0.5965291857719421, "learning_rate": 1.9978854285168784e-05, "loss": 0.0979, "step": 400 }, { "epoch": 0.10163749294184077, "grad_norm": 0.557249128818512, "learning_rate": 1.9977512531165484e-05, "loss": 0.0994, "step": 405 }, { "epoch": 0.10289227680532029, "grad_norm": 0.3753920793533325, "learning_rate": 1.9976129555295003e-05, "loss": 0.1009, "step": 410 }, { "epoch": 0.10414706066879979, "grad_norm": 0.18703804910182953, "learning_rate": 1.9974705363271076e-05, "loss": 0.0938, "step": 415 }, { "epoch": 0.10540184453227931, "grad_norm": 0.49501076340675354, "learning_rate": 1.997323996097772e-05, "loss": 0.083, "step": 420 }, { "epoch": 0.10665662839575883, "grad_norm": 0.13180793821811676, "learning_rate": 1.9971733354469215e-05, "loss": 0.1087, "step": 425 }, { "epoch": 0.10791141225923835, "grad_norm": 0.4710563123226166, "learning_rate": 1.9970185549970066e-05, "loss": 0.0977, "step": 430 }, { "epoch": 0.10916619612271786, "grad_norm": 0.5056783556938171, "learning_rate": 1.9968596553874993e-05, "loss": 0.0947, "step": 435 }, { "epoch": 0.11042097998619738, "grad_norm": 0.2544513940811157, "learning_rate": 1.99669663727489e-05, "loss": 0.0925, "step": 440 }, { "epoch": 0.1116757638496769, "grad_norm": 0.512052059173584, "learning_rate": 1.9965295013326843e-05, "loss": 0.0984, "step": 445 }, { "epoch": 0.11293054771315642, "grad_norm": 0.31233909726142883, "learning_rate": 1.9963582482514003e-05, "loss": 0.0903, "step": 450 }, { "epoch": 0.11418533157663592, "grad_norm": 0.3753993511199951, "learning_rate": 1.9961828787385662e-05, "loss": 0.0985, "step": 455 }, { "epoch": 0.11544011544011544, "grad_norm": 0.14997100830078125, "learning_rate": 1.996003393518718e-05, "loss": 0.0991, "step": 460 }, { "epoch": 0.11669489930359496, "grad_norm": 0.1637381911277771, "learning_rate": 1.995819793333394e-05, "loss": 0.0833, "step": 465 }, { "epoch": 0.11794968316707446, "grad_norm": 0.35544639825820923, "learning_rate": 1.9956320789411338e-05, "loss": 0.0901, "step": 470 }, { "epoch": 0.11920446703055398, "grad_norm": 0.29124122858047485, "learning_rate": 1.9954402511174763e-05, "loss": 0.1043, "step": 475 }, { "epoch": 0.1204592508940335, "grad_norm": 0.1517423689365387, "learning_rate": 1.9952443106549535e-05, "loss": 0.0937, "step": 480 }, { "epoch": 0.12171403475751302, "grad_norm": 0.22879864275455475, "learning_rate": 1.9950442583630884e-05, "loss": 0.0983, "step": 485 }, { "epoch": 0.12296881862099253, "grad_norm": 0.12301073968410492, "learning_rate": 1.9948400950683932e-05, "loss": 0.0937, "step": 490 }, { "epoch": 0.12422360248447205, "grad_norm": 0.46629127860069275, "learning_rate": 1.9946318216143633e-05, "loss": 0.109, "step": 495 }, { "epoch": 0.12547838634795155, "grad_norm": 0.34613123536109924, "learning_rate": 1.9944194388614764e-05, "loss": 0.0979, "step": 500 }, { "epoch": 0.12673317021143107, "grad_norm": 0.3203139901161194, "learning_rate": 1.9942029476871868e-05, "loss": 0.105, "step": 505 }, { "epoch": 0.1279879540749106, "grad_norm": 0.23387163877487183, "learning_rate": 1.9939823489859226e-05, "loss": 0.0918, "step": 510 }, { "epoch": 0.1292427379383901, "grad_norm": 0.5645913481712341, "learning_rate": 1.9937576436690822e-05, "loss": 0.1008, "step": 515 }, { "epoch": 0.13049752180186963, "grad_norm": 0.758552074432373, "learning_rate": 1.9935288326650314e-05, "loss": 0.1041, "step": 520 }, { "epoch": 0.13175230566534915, "grad_norm": 0.40192508697509766, "learning_rate": 1.993295916919097e-05, "loss": 0.0961, "step": 525 }, { "epoch": 0.13300708952882867, "grad_norm": 0.3283374309539795, "learning_rate": 1.9930588973935653e-05, "loss": 0.1099, "step": 530 }, { "epoch": 0.13426187339230816, "grad_norm": 0.13031624257564545, "learning_rate": 1.992817775067677e-05, "loss": 0.0964, "step": 535 }, { "epoch": 0.13551665725578768, "grad_norm": 0.40052691102027893, "learning_rate": 1.9925725509376236e-05, "loss": 0.1003, "step": 540 }, { "epoch": 0.1367714411192672, "grad_norm": 0.20770736038684845, "learning_rate": 1.992323226016543e-05, "loss": 0.0977, "step": 545 }, { "epoch": 0.13802622498274672, "grad_norm": 0.5854383111000061, "learning_rate": 1.9920698013345162e-05, "loss": 0.096, "step": 550 }, { "epoch": 0.13928100884622624, "grad_norm": 0.33884522318840027, "learning_rate": 1.99181227793856e-05, "loss": 0.0981, "step": 555 }, { "epoch": 0.14053579270970576, "grad_norm": 0.9304419159889221, "learning_rate": 1.9915506568926283e-05, "loss": 0.1069, "step": 560 }, { "epoch": 0.14179057657318528, "grad_norm": 0.5028262138366699, "learning_rate": 1.991284939277601e-05, "loss": 0.1064, "step": 565 }, { "epoch": 0.1430453604366648, "grad_norm": 0.30051878094673157, "learning_rate": 1.991015126191285e-05, "loss": 0.0977, "step": 570 }, { "epoch": 0.1443001443001443, "grad_norm": 0.5658770799636841, "learning_rate": 1.990741218748407e-05, "loss": 0.1132, "step": 575 }, { "epoch": 0.1455549281636238, "grad_norm": 0.1313030868768692, "learning_rate": 1.9904632180806094e-05, "loss": 0.0976, "step": 580 }, { "epoch": 0.14680971202710333, "grad_norm": 0.3436416685581207, "learning_rate": 1.9901811253364458e-05, "loss": 0.103, "step": 585 }, { "epoch": 0.14806449589058285, "grad_norm": 0.7141314744949341, "learning_rate": 1.9898949416813757e-05, "loss": 0.0994, "step": 590 }, { "epoch": 0.14931927975406237, "grad_norm": 0.35225412249565125, "learning_rate": 1.9896046682977603e-05, "loss": 0.0989, "step": 595 }, { "epoch": 0.1505740636175419, "grad_norm": 0.19844387471675873, "learning_rate": 1.989310306384858e-05, "loss": 0.1032, "step": 600 }, { "epoch": 0.1518288474810214, "grad_norm": 0.21333719789981842, "learning_rate": 1.989011857158818e-05, "loss": 0.1, "step": 605 }, { "epoch": 0.1530836313445009, "grad_norm": 0.3265141546726227, "learning_rate": 1.9887093218526768e-05, "loss": 0.0935, "step": 610 }, { "epoch": 0.15433841520798042, "grad_norm": 0.24291647970676422, "learning_rate": 1.9884027017163515e-05, "loss": 0.0923, "step": 615 }, { "epoch": 0.15559319907145994, "grad_norm": 0.4325827658176422, "learning_rate": 1.9880919980166374e-05, "loss": 0.089, "step": 620 }, { "epoch": 0.15684798293493946, "grad_norm": 0.1882610321044922, "learning_rate": 1.9877772120371986e-05, "loss": 0.0893, "step": 625 }, { "epoch": 0.15810276679841898, "grad_norm": 0.4826973080635071, "learning_rate": 1.987458345078567e-05, "loss": 0.0983, "step": 630 }, { "epoch": 0.1593575506618985, "grad_norm": 0.5353069305419922, "learning_rate": 1.9871353984581342e-05, "loss": 0.0937, "step": 635 }, { "epoch": 0.16061233452537801, "grad_norm": 0.12057194858789444, "learning_rate": 1.9868083735101464e-05, "loss": 0.0999, "step": 640 }, { "epoch": 0.1618671183888575, "grad_norm": 0.4315410256385803, "learning_rate": 1.9864772715857e-05, "loss": 0.0982, "step": 645 }, { "epoch": 0.16312190225233703, "grad_norm": 0.1834162324666977, "learning_rate": 1.9861420940527357e-05, "loss": 0.0876, "step": 650 }, { "epoch": 0.16437668611581654, "grad_norm": 0.1956656128168106, "learning_rate": 1.985802842296031e-05, "loss": 0.0833, "step": 655 }, { "epoch": 0.16563146997929606, "grad_norm": 0.08431841433048248, "learning_rate": 1.9854595177171968e-05, "loss": 0.0933, "step": 660 }, { "epoch": 0.16688625384277558, "grad_norm": 0.40520185232162476, "learning_rate": 1.9851121217346717e-05, "loss": 0.0887, "step": 665 }, { "epoch": 0.1681410377062551, "grad_norm": 0.661037266254425, "learning_rate": 1.9847606557837138e-05, "loss": 0.0971, "step": 670 }, { "epoch": 0.16939582156973462, "grad_norm": 0.35876908898353577, "learning_rate": 1.9844051213163967e-05, "loss": 0.0891, "step": 675 }, { "epoch": 0.17065060543321414, "grad_norm": 0.73444002866745, "learning_rate": 1.9840455198016033e-05, "loss": 0.1015, "step": 680 }, { "epoch": 0.17190538929669363, "grad_norm": 0.5403853058815002, "learning_rate": 1.9836818527250185e-05, "loss": 0.094, "step": 685 }, { "epoch": 0.17316017316017315, "grad_norm": 0.26550817489624023, "learning_rate": 1.9833141215891253e-05, "loss": 0.0916, "step": 690 }, { "epoch": 0.17441495702365267, "grad_norm": 0.20519235730171204, "learning_rate": 1.9829423279131962e-05, "loss": 0.0938, "step": 695 }, { "epoch": 0.1756697408871322, "grad_norm": 0.2511141896247864, "learning_rate": 1.9825664732332886e-05, "loss": 0.095, "step": 700 }, { "epoch": 0.1769245247506117, "grad_norm": 0.42727699875831604, "learning_rate": 1.982186559102237e-05, "loss": 0.1039, "step": 705 }, { "epoch": 0.17817930861409123, "grad_norm": 0.361933171749115, "learning_rate": 1.9818025870896485e-05, "loss": 0.0933, "step": 710 }, { "epoch": 0.17943409247757075, "grad_norm": 0.081818588078022, "learning_rate": 1.981414558781895e-05, "loss": 0.0972, "step": 715 }, { "epoch": 0.18068887634105024, "grad_norm": 0.24198131263256073, "learning_rate": 1.9810224757821063e-05, "loss": 0.08, "step": 720 }, { "epoch": 0.18194366020452976, "grad_norm": 0.22726424038410187, "learning_rate": 1.9806263397101645e-05, "loss": 0.0889, "step": 725 }, { "epoch": 0.18319844406800928, "grad_norm": 0.26447054743766785, "learning_rate": 1.980226152202697e-05, "loss": 0.0988, "step": 730 }, { "epoch": 0.1844532279314888, "grad_norm": 0.20784974098205566, "learning_rate": 1.9798219149130692e-05, "loss": 0.1134, "step": 735 }, { "epoch": 0.18570801179496832, "grad_norm": 0.6903693079948425, "learning_rate": 1.9794136295113783e-05, "loss": 0.1002, "step": 740 }, { "epoch": 0.18696279565844784, "grad_norm": 0.9974376559257507, "learning_rate": 1.9790012976844465e-05, "loss": 0.0922, "step": 745 }, { "epoch": 0.18821757952192736, "grad_norm": 0.3469702899456024, "learning_rate": 1.9785849211358133e-05, "loss": 0.0843, "step": 750 }, { "epoch": 0.18947236338540685, "grad_norm": 0.25853797793388367, "learning_rate": 1.9781645015857287e-05, "loss": 0.0832, "step": 755 }, { "epoch": 0.19072714724888637, "grad_norm": 0.4150042235851288, "learning_rate": 1.9777400407711467e-05, "loss": 0.0951, "step": 760 }, { "epoch": 0.1919819311123659, "grad_norm": 0.8367013335227966, "learning_rate": 1.9773115404457175e-05, "loss": 0.1043, "step": 765 }, { "epoch": 0.1932367149758454, "grad_norm": 0.09506336599588394, "learning_rate": 1.976879002379781e-05, "loss": 0.1006, "step": 770 }, { "epoch": 0.19449149883932493, "grad_norm": 0.47608646750450134, "learning_rate": 1.9764424283603577e-05, "loss": 0.0969, "step": 775 }, { "epoch": 0.19574628270280445, "grad_norm": 0.3089286684989929, "learning_rate": 1.976001820191143e-05, "loss": 0.081, "step": 780 }, { "epoch": 0.19700106656628397, "grad_norm": 0.7951834201812744, "learning_rate": 1.9755571796925014e-05, "loss": 0.0987, "step": 785 }, { "epoch": 0.19825585042976349, "grad_norm": 0.29604893922805786, "learning_rate": 1.9751085087014533e-05, "loss": 0.0905, "step": 790 }, { "epoch": 0.19951063429324298, "grad_norm": 0.6843680143356323, "learning_rate": 1.974655809071673e-05, "loss": 0.1044, "step": 795 }, { "epoch": 0.2007654181567225, "grad_norm": 0.3378108739852905, "learning_rate": 1.9741990826734793e-05, "loss": 0.0791, "step": 800 }, { "epoch": 0.20202020202020202, "grad_norm": 0.2935211658477783, "learning_rate": 1.9737383313938266e-05, "loss": 0.0996, "step": 805 }, { "epoch": 0.20327498588368154, "grad_norm": 0.2529759705066681, "learning_rate": 1.9732735571362985e-05, "loss": 0.0992, "step": 810 }, { "epoch": 0.20452976974716106, "grad_norm": 0.2465377002954483, "learning_rate": 1.9728047618210995e-05, "loss": 0.0976, "step": 815 }, { "epoch": 0.20578455361064057, "grad_norm": 0.4152264893054962, "learning_rate": 1.9723319473850465e-05, "loss": 0.1063, "step": 820 }, { "epoch": 0.2070393374741201, "grad_norm": 0.3043062686920166, "learning_rate": 1.971855115781562e-05, "loss": 0.1005, "step": 825 }, { "epoch": 0.20829412133759959, "grad_norm": 0.15693755447864532, "learning_rate": 1.9713742689806646e-05, "loss": 0.0905, "step": 830 }, { "epoch": 0.2095489052010791, "grad_norm": 0.21518734097480774, "learning_rate": 1.9708894089689622e-05, "loss": 0.0952, "step": 835 }, { "epoch": 0.21080368906455862, "grad_norm": 0.5088281631469727, "learning_rate": 1.9704005377496428e-05, "loss": 0.0844, "step": 840 }, { "epoch": 0.21205847292803814, "grad_norm": 0.19282662868499756, "learning_rate": 1.969907657342467e-05, "loss": 0.0892, "step": 845 }, { "epoch": 0.21331325679151766, "grad_norm": 0.3823853135108948, "learning_rate": 1.969410769783759e-05, "loss": 0.083, "step": 850 }, { "epoch": 0.21456804065499718, "grad_norm": 0.31330108642578125, "learning_rate": 1.9689098771263982e-05, "loss": 0.0958, "step": 855 }, { "epoch": 0.2158228245184767, "grad_norm": 0.2921331822872162, "learning_rate": 1.968404981439812e-05, "loss": 0.0915, "step": 860 }, { "epoch": 0.21707760838195622, "grad_norm": 0.23992085456848145, "learning_rate": 1.9678960848099646e-05, "loss": 0.0945, "step": 865 }, { "epoch": 0.2183323922454357, "grad_norm": 0.22444355487823486, "learning_rate": 1.967383189339352e-05, "loss": 0.0816, "step": 870 }, { "epoch": 0.21958717610891523, "grad_norm": 0.27808722853660583, "learning_rate": 1.9668662971469886e-05, "loss": 0.0872, "step": 875 }, { "epoch": 0.22084195997239475, "grad_norm": 0.4084593653678894, "learning_rate": 1.9663454103684043e-05, "loss": 0.1007, "step": 880 }, { "epoch": 0.22209674383587427, "grad_norm": 0.2565450966358185, "learning_rate": 1.9658205311556304e-05, "loss": 0.0973, "step": 885 }, { "epoch": 0.2233515276993538, "grad_norm": 0.19569608569145203, "learning_rate": 1.9652916616771933e-05, "loss": 0.0884, "step": 890 }, { "epoch": 0.2246063115628333, "grad_norm": 0.17645685374736786, "learning_rate": 1.9647588041181057e-05, "loss": 0.0918, "step": 895 }, { "epoch": 0.22586109542631283, "grad_norm": 0.5224778056144714, "learning_rate": 1.9642219606798566e-05, "loss": 0.0899, "step": 900 }, { "epoch": 0.22711587928979232, "grad_norm": 0.29252490401268005, "learning_rate": 1.963681133580402e-05, "loss": 0.0978, "step": 905 }, { "epoch": 0.22837066315327184, "grad_norm": 0.2631376385688782, "learning_rate": 1.9631363250541577e-05, "loss": 0.0961, "step": 910 }, { "epoch": 0.22962544701675136, "grad_norm": 0.4589993953704834, "learning_rate": 1.9625875373519866e-05, "loss": 0.0942, "step": 915 }, { "epoch": 0.23088023088023088, "grad_norm": 0.4137325882911682, "learning_rate": 1.9620347727411933e-05, "loss": 0.0917, "step": 920 }, { "epoch": 0.2321350147437104, "grad_norm": 0.21914274990558624, "learning_rate": 1.9614780335055127e-05, "loss": 0.0851, "step": 925 }, { "epoch": 0.23338979860718992, "grad_norm": 0.3876037001609802, "learning_rate": 1.9609173219450998e-05, "loss": 0.0878, "step": 930 }, { "epoch": 0.23464458247066944, "grad_norm": 0.376610666513443, "learning_rate": 1.9603526403765218e-05, "loss": 0.0949, "step": 935 }, { "epoch": 0.23589936633414893, "grad_norm": 0.2858617603778839, "learning_rate": 1.9597839911327475e-05, "loss": 0.091, "step": 940 }, { "epoch": 0.23715415019762845, "grad_norm": 0.6040699481964111, "learning_rate": 1.959211376563139e-05, "loss": 0.0866, "step": 945 }, { "epoch": 0.23840893406110797, "grad_norm": 0.15516065061092377, "learning_rate": 1.9586347990334406e-05, "loss": 0.093, "step": 950 }, { "epoch": 0.2396637179245875, "grad_norm": 0.38089755177497864, "learning_rate": 1.958054260925768e-05, "loss": 0.091, "step": 955 }, { "epoch": 0.240918501788067, "grad_norm": 0.15390656888484955, "learning_rate": 1.9574697646386027e-05, "loss": 0.0883, "step": 960 }, { "epoch": 0.24217328565154653, "grad_norm": 0.5063360929489136, "learning_rate": 1.956881312586777e-05, "loss": 0.094, "step": 965 }, { "epoch": 0.24342806951502605, "grad_norm": 0.15385521948337555, "learning_rate": 1.9562889072014682e-05, "loss": 0.0942, "step": 970 }, { "epoch": 0.24468285337850557, "grad_norm": 0.4311351180076599, "learning_rate": 1.9556925509301844e-05, "loss": 0.0896, "step": 975 }, { "epoch": 0.24593763724198506, "grad_norm": 0.1626339703798294, "learning_rate": 1.955092246236759e-05, "loss": 0.0901, "step": 980 }, { "epoch": 0.24719242110546458, "grad_norm": 0.1978272795677185, "learning_rate": 1.954487995601337e-05, "loss": 0.086, "step": 985 }, { "epoch": 0.2484472049689441, "grad_norm": 0.5115323066711426, "learning_rate": 1.953879801520366e-05, "loss": 0.0966, "step": 990 }, { "epoch": 0.24970198883242362, "grad_norm": 0.556075930595398, "learning_rate": 1.9532676665065863e-05, "loss": 0.0947, "step": 995 }, { "epoch": 0.2509567726959031, "grad_norm": 0.4283283054828644, "learning_rate": 1.9526515930890203e-05, "loss": 0.0995, "step": 1000 }, { "epoch": 0.25221155655938265, "grad_norm": 0.20551514625549316, "learning_rate": 1.9520315838129602e-05, "loss": 0.0822, "step": 1005 }, { "epoch": 0.25346634042286215, "grad_norm": 0.2339772880077362, "learning_rate": 1.9514076412399615e-05, "loss": 0.0916, "step": 1010 }, { "epoch": 0.2547211242863417, "grad_norm": 0.33765801787376404, "learning_rate": 1.9507797679478282e-05, "loss": 0.0834, "step": 1015 }, { "epoch": 0.2559759081498212, "grad_norm": 0.15917867422103882, "learning_rate": 1.9501479665306046e-05, "loss": 0.1036, "step": 1020 }, { "epoch": 0.25723069201330073, "grad_norm": 0.15824545919895172, "learning_rate": 1.9495122395985642e-05, "loss": 0.0926, "step": 1025 }, { "epoch": 0.2584854758767802, "grad_norm": 0.22190357744693756, "learning_rate": 1.948872589778198e-05, "loss": 0.0915, "step": 1030 }, { "epoch": 0.2597402597402597, "grad_norm": 0.39479732513427734, "learning_rate": 1.9482290197122054e-05, "loss": 0.0954, "step": 1035 }, { "epoch": 0.26099504360373926, "grad_norm": 0.44342002272605896, "learning_rate": 1.947581532059481e-05, "loss": 0.0968, "step": 1040 }, { "epoch": 0.26224982746721875, "grad_norm": 0.18394285440444946, "learning_rate": 1.946930129495106e-05, "loss": 0.0968, "step": 1045 }, { "epoch": 0.2635046113306983, "grad_norm": 0.40263161063194275, "learning_rate": 1.9462748147103342e-05, "loss": 0.0954, "step": 1050 }, { "epoch": 0.2647593951941778, "grad_norm": 0.3291696608066559, "learning_rate": 1.9456155904125853e-05, "loss": 0.0974, "step": 1055 }, { "epoch": 0.26601417905765734, "grad_norm": 0.17444024980068207, "learning_rate": 1.9449524593254283e-05, "loss": 0.0892, "step": 1060 }, { "epoch": 0.26726896292113683, "grad_norm": 0.5202435851097107, "learning_rate": 1.944285424188575e-05, "loss": 0.1026, "step": 1065 }, { "epoch": 0.2685237467846163, "grad_norm": 0.6516605615615845, "learning_rate": 1.943614487757866e-05, "loss": 0.0899, "step": 1070 }, { "epoch": 0.26977853064809587, "grad_norm": 0.7146198153495789, "learning_rate": 1.9429396528052594e-05, "loss": 0.0834, "step": 1075 }, { "epoch": 0.27103331451157536, "grad_norm": 0.46206018328666687, "learning_rate": 1.9422609221188208e-05, "loss": 0.0961, "step": 1080 }, { "epoch": 0.2722880983750549, "grad_norm": 0.258039265871048, "learning_rate": 1.9415782985027105e-05, "loss": 0.0861, "step": 1085 }, { "epoch": 0.2735428822385344, "grad_norm": 0.2528323829174042, "learning_rate": 1.9408917847771732e-05, "loss": 0.0903, "step": 1090 }, { "epoch": 0.27479766610201395, "grad_norm": 0.35763663053512573, "learning_rate": 1.9402013837785242e-05, "loss": 0.0961, "step": 1095 }, { "epoch": 0.27605244996549344, "grad_norm": 0.33493563532829285, "learning_rate": 1.93950709835914e-05, "loss": 0.086, "step": 1100 }, { "epoch": 0.27730723382897293, "grad_norm": 0.6536179780960083, "learning_rate": 1.9388089313874447e-05, "loss": 0.083, "step": 1105 }, { "epoch": 0.2785620176924525, "grad_norm": 0.6487680077552795, "learning_rate": 1.9381068857478994e-05, "loss": 0.0968, "step": 1110 }, { "epoch": 0.27981680155593197, "grad_norm": 0.25090888142585754, "learning_rate": 1.9374009643409895e-05, "loss": 0.0785, "step": 1115 }, { "epoch": 0.2810715854194115, "grad_norm": 0.1529439091682434, "learning_rate": 1.9366911700832146e-05, "loss": 0.0946, "step": 1120 }, { "epoch": 0.282326369282891, "grad_norm": 0.2744344472885132, "learning_rate": 1.935977505907072e-05, "loss": 0.0827, "step": 1125 }, { "epoch": 0.28358115314637056, "grad_norm": 0.4758091866970062, "learning_rate": 1.93525997476105e-05, "loss": 0.0895, "step": 1130 }, { "epoch": 0.28483593700985005, "grad_norm": 0.5062987804412842, "learning_rate": 1.9345385796096118e-05, "loss": 0.1007, "step": 1135 }, { "epoch": 0.2860907208733296, "grad_norm": 0.44954121112823486, "learning_rate": 1.933813323433186e-05, "loss": 0.0915, "step": 1140 }, { "epoch": 0.2873455047368091, "grad_norm": 0.2048788219690323, "learning_rate": 1.9330842092281508e-05, "loss": 0.0866, "step": 1145 }, { "epoch": 0.2886002886002886, "grad_norm": 0.3660197854042053, "learning_rate": 1.9323512400068262e-05, "loss": 0.0916, "step": 1150 }, { "epoch": 0.2898550724637681, "grad_norm": 0.5215709209442139, "learning_rate": 1.931614418797457e-05, "loss": 0.096, "step": 1155 }, { "epoch": 0.2911098563272476, "grad_norm": 0.46644458174705505, "learning_rate": 1.9308737486442045e-05, "loss": 0.1107, "step": 1160 }, { "epoch": 0.29236464019072717, "grad_norm": 0.24164815247058868, "learning_rate": 1.9301292326071295e-05, "loss": 0.0923, "step": 1165 }, { "epoch": 0.29361942405420666, "grad_norm": 0.42168012261390686, "learning_rate": 1.9293808737621837e-05, "loss": 0.0998, "step": 1170 }, { "epoch": 0.2948742079176862, "grad_norm": 0.37700730562210083, "learning_rate": 1.9286286752011948e-05, "loss": 0.1036, "step": 1175 }, { "epoch": 0.2961289917811657, "grad_norm": 0.6957244277000427, "learning_rate": 1.927872640031854e-05, "loss": 0.0903, "step": 1180 }, { "epoch": 0.2973837756446452, "grad_norm": 0.30158841609954834, "learning_rate": 1.9271127713777033e-05, "loss": 0.0861, "step": 1185 }, { "epoch": 0.29863855950812473, "grad_norm": 0.41035765409469604, "learning_rate": 1.9263490723781233e-05, "loss": 0.0909, "step": 1190 }, { "epoch": 0.2998933433716042, "grad_norm": 0.1835126429796219, "learning_rate": 1.9255815461883184e-05, "loss": 0.0968, "step": 1195 }, { "epoch": 0.3011481272350838, "grad_norm": 0.18504665791988373, "learning_rate": 1.9248101959793066e-05, "loss": 0.0995, "step": 1200 }, { "epoch": 0.30240291109856327, "grad_norm": 0.32556024193763733, "learning_rate": 1.9240350249379035e-05, "loss": 0.0972, "step": 1205 }, { "epoch": 0.3036576949620428, "grad_norm": 0.2088983952999115, "learning_rate": 1.92325603626671e-05, "loss": 0.0935, "step": 1210 }, { "epoch": 0.3049124788255223, "grad_norm": 0.5765480995178223, "learning_rate": 1.922473233184101e-05, "loss": 0.0933, "step": 1215 }, { "epoch": 0.3061672626890018, "grad_norm": 0.7290382981300354, "learning_rate": 1.9216866189242095e-05, "loss": 0.0954, "step": 1220 }, { "epoch": 0.30742204655248134, "grad_norm": 0.2641292214393616, "learning_rate": 1.9208961967369148e-05, "loss": 0.0796, "step": 1225 }, { "epoch": 0.30867683041596083, "grad_norm": 0.6574556827545166, "learning_rate": 1.9201019698878272e-05, "loss": 0.1038, "step": 1230 }, { "epoch": 0.3099316142794404, "grad_norm": 0.19239722192287445, "learning_rate": 1.9193039416582785e-05, "loss": 0.094, "step": 1235 }, { "epoch": 0.3111863981429199, "grad_norm": 0.14215205609798431, "learning_rate": 1.918502115345303e-05, "loss": 0.0965, "step": 1240 }, { "epoch": 0.3124411820063994, "grad_norm": 0.3955984115600586, "learning_rate": 1.9176964942616286e-05, "loss": 0.0961, "step": 1245 }, { "epoch": 0.3136959658698789, "grad_norm": 0.5442742109298706, "learning_rate": 1.9168870817356602e-05, "loss": 0.0844, "step": 1250 }, { "epoch": 0.3149507497333584, "grad_norm": 0.2363368719816208, "learning_rate": 1.916073881111468e-05, "loss": 0.0882, "step": 1255 }, { "epoch": 0.31620553359683795, "grad_norm": 0.4277198016643524, "learning_rate": 1.915256895748771e-05, "loss": 0.0813, "step": 1260 }, { "epoch": 0.31746031746031744, "grad_norm": 0.15375563502311707, "learning_rate": 1.9144361290229266e-05, "loss": 0.1007, "step": 1265 }, { "epoch": 0.318715101323797, "grad_norm": 0.2776433527469635, "learning_rate": 1.913611584324913e-05, "loss": 0.0898, "step": 1270 }, { "epoch": 0.3199698851872765, "grad_norm": 0.20093151926994324, "learning_rate": 1.912783265061319e-05, "loss": 0.0897, "step": 1275 }, { "epoch": 0.32122466905075603, "grad_norm": 0.13896812498569489, "learning_rate": 1.9119511746543265e-05, "loss": 0.0945, "step": 1280 }, { "epoch": 0.3224794529142355, "grad_norm": 0.1117207333445549, "learning_rate": 1.911115316541698e-05, "loss": 0.0887, "step": 1285 }, { "epoch": 0.323734236777715, "grad_norm": 0.19571149349212646, "learning_rate": 1.9102756941767625e-05, "loss": 0.086, "step": 1290 }, { "epoch": 0.32498902064119456, "grad_norm": 0.12047363817691803, "learning_rate": 1.9094323110284006e-05, "loss": 0.1008, "step": 1295 }, { "epoch": 0.32624380450467405, "grad_norm": 0.2128770500421524, "learning_rate": 1.9085851705810307e-05, "loss": 0.1054, "step": 1300 }, { "epoch": 0.3274985883681536, "grad_norm": 0.18241125345230103, "learning_rate": 1.907734276334595e-05, "loss": 0.0981, "step": 1305 }, { "epoch": 0.3287533722316331, "grad_norm": 0.32163918018341064, "learning_rate": 1.9068796318045434e-05, "loss": 0.0871, "step": 1310 }, { "epoch": 0.33000815609511264, "grad_norm": 0.6417638063430786, "learning_rate": 1.90602124052182e-05, "loss": 0.0959, "step": 1315 }, { "epoch": 0.33126293995859213, "grad_norm": 0.2659170627593994, "learning_rate": 1.9051591060328496e-05, "loss": 0.0938, "step": 1320 }, { "epoch": 0.3325177238220717, "grad_norm": 0.34006327390670776, "learning_rate": 1.904293231899521e-05, "loss": 0.0835, "step": 1325 }, { "epoch": 0.33377250768555117, "grad_norm": 0.44254758954048157, "learning_rate": 1.9034236216991738e-05, "loss": 0.0894, "step": 1330 }, { "epoch": 0.33502729154903066, "grad_norm": 0.2562454342842102, "learning_rate": 1.9025502790245824e-05, "loss": 0.0953, "step": 1335 }, { "epoch": 0.3362820754125102, "grad_norm": 0.33543840050697327, "learning_rate": 1.901673207483943e-05, "loss": 0.0911, "step": 1340 }, { "epoch": 0.3375368592759897, "grad_norm": 0.3265765905380249, "learning_rate": 1.9007924107008563e-05, "loss": 0.0885, "step": 1345 }, { "epoch": 0.33879164313946925, "grad_norm": 0.08985516428947449, "learning_rate": 1.8999078923143142e-05, "loss": 0.085, "step": 1350 }, { "epoch": 0.34004642700294874, "grad_norm": 0.5862855315208435, "learning_rate": 1.899019655978685e-05, "loss": 0.0956, "step": 1355 }, { "epoch": 0.3413012108664283, "grad_norm": 0.32543689012527466, "learning_rate": 1.8981277053636963e-05, "loss": 0.096, "step": 1360 }, { "epoch": 0.3425559947299078, "grad_norm": 0.6047563552856445, "learning_rate": 1.8972320441544224e-05, "loss": 0.0843, "step": 1365 }, { "epoch": 0.34381077859338727, "grad_norm": 0.28414762020111084, "learning_rate": 1.8963326760512668e-05, "loss": 0.0892, "step": 1370 }, { "epoch": 0.3450655624568668, "grad_norm": 0.337194561958313, "learning_rate": 1.895429604769949e-05, "loss": 0.099, "step": 1375 }, { "epoch": 0.3463203463203463, "grad_norm": 0.4529132843017578, "learning_rate": 1.894522834041487e-05, "loss": 0.0941, "step": 1380 }, { "epoch": 0.34757513018382585, "grad_norm": 0.12156742066144943, "learning_rate": 1.8936123676121844e-05, "loss": 0.0896, "step": 1385 }, { "epoch": 0.34882991404730535, "grad_norm": 0.25660645961761475, "learning_rate": 1.8926982092436117e-05, "loss": 0.1036, "step": 1390 }, { "epoch": 0.3500846979107849, "grad_norm": 0.31460127234458923, "learning_rate": 1.891780362712594e-05, "loss": 0.1055, "step": 1395 }, { "epoch": 0.3513394817742644, "grad_norm": 0.31471750140190125, "learning_rate": 1.8908588318111932e-05, "loss": 0.0958, "step": 1400 }, { "epoch": 0.3525942656377439, "grad_norm": 0.13055211305618286, "learning_rate": 1.889933620346694e-05, "loss": 0.1019, "step": 1405 }, { "epoch": 0.3538490495012234, "grad_norm": 0.13505928218364716, "learning_rate": 1.8890047321415856e-05, "loss": 0.0816, "step": 1410 }, { "epoch": 0.3551038333647029, "grad_norm": 0.2555672526359558, "learning_rate": 1.8880721710335495e-05, "loss": 0.0998, "step": 1415 }, { "epoch": 0.35635861722818246, "grad_norm": 0.3331303298473358, "learning_rate": 1.8871359408754405e-05, "loss": 0.094, "step": 1420 }, { "epoch": 0.35761340109166195, "grad_norm": 0.17009656131267548, "learning_rate": 1.8861960455352723e-05, "loss": 0.0946, "step": 1425 }, { "epoch": 0.3588681849551415, "grad_norm": 0.2334425002336502, "learning_rate": 1.885252488896201e-05, "loss": 0.0906, "step": 1430 }, { "epoch": 0.360122968818621, "grad_norm": 0.21691180765628815, "learning_rate": 1.8843052748565097e-05, "loss": 0.091, "step": 1435 }, { "epoch": 0.3613777526821005, "grad_norm": 0.40637996792793274, "learning_rate": 1.8833544073295918e-05, "loss": 0.0856, "step": 1440 }, { "epoch": 0.36263253654558003, "grad_norm": 0.4045301079750061, "learning_rate": 1.882399890243935e-05, "loss": 0.0821, "step": 1445 }, { "epoch": 0.3638873204090595, "grad_norm": 0.25760528445243835, "learning_rate": 1.8814417275431046e-05, "loss": 0.0895, "step": 1450 }, { "epoch": 0.36514210427253907, "grad_norm": 0.3337756097316742, "learning_rate": 1.8804799231857292e-05, "loss": 0.0939, "step": 1455 }, { "epoch": 0.36639688813601856, "grad_norm": 0.325998991727829, "learning_rate": 1.8795144811454805e-05, "loss": 0.0978, "step": 1460 }, { "epoch": 0.3676516719994981, "grad_norm": 0.12589257955551147, "learning_rate": 1.878545405411061e-05, "loss": 0.0826, "step": 1465 }, { "epoch": 0.3689064558629776, "grad_norm": 0.10235779732465744, "learning_rate": 1.877572699986185e-05, "loss": 0.0841, "step": 1470 }, { "epoch": 0.3701612397264571, "grad_norm": 0.20532777905464172, "learning_rate": 1.876596368889563e-05, "loss": 0.0897, "step": 1475 }, { "epoch": 0.37141602358993664, "grad_norm": 0.30277219414711, "learning_rate": 1.8756164161548848e-05, "loss": 0.0975, "step": 1480 }, { "epoch": 0.37267080745341613, "grad_norm": 0.15634752810001373, "learning_rate": 1.8746328458308034e-05, "loss": 0.0879, "step": 1485 }, { "epoch": 0.3739255913168957, "grad_norm": 0.2814009487628937, "learning_rate": 1.873645661980917e-05, "loss": 0.0885, "step": 1490 }, { "epoch": 0.37518037518037517, "grad_norm": 0.7077462673187256, "learning_rate": 1.872654868683753e-05, "loss": 0.1143, "step": 1495 }, { "epoch": 0.3764351590438547, "grad_norm": 0.14586646854877472, "learning_rate": 1.8716604700327516e-05, "loss": 0.1006, "step": 1500 }, { "epoch": 0.3776899429073342, "grad_norm": 0.23802891373634338, "learning_rate": 1.8706624701362485e-05, "loss": 0.0926, "step": 1505 }, { "epoch": 0.3789447267708137, "grad_norm": 0.41399145126342773, "learning_rate": 1.8696608731174576e-05, "loss": 0.0897, "step": 1510 }, { "epoch": 0.38019951063429325, "grad_norm": 0.18691197037696838, "learning_rate": 1.8686556831144545e-05, "loss": 0.0843, "step": 1515 }, { "epoch": 0.38145429449777274, "grad_norm": 0.3470015823841095, "learning_rate": 1.867646904280159e-05, "loss": 0.1038, "step": 1520 }, { "epoch": 0.3827090783612523, "grad_norm": 0.3475337624549866, "learning_rate": 1.8666345407823177e-05, "loss": 0.0992, "step": 1525 }, { "epoch": 0.3839638622247318, "grad_norm": 0.21168570220470428, "learning_rate": 1.865618596803487e-05, "loss": 0.0877, "step": 1530 }, { "epoch": 0.3852186460882113, "grad_norm": 0.2576999366283417, "learning_rate": 1.864599076541018e-05, "loss": 0.099, "step": 1535 }, { "epoch": 0.3864734299516908, "grad_norm": 0.42329198122024536, "learning_rate": 1.8635759842070344e-05, "loss": 0.0886, "step": 1540 }, { "epoch": 0.38772821381517036, "grad_norm": 0.2144610732793808, "learning_rate": 1.862549324028419e-05, "loss": 0.0976, "step": 1545 }, { "epoch": 0.38898299767864986, "grad_norm": 0.36767107248306274, "learning_rate": 1.8615191002467955e-05, "loss": 0.1067, "step": 1550 }, { "epoch": 0.39023778154212935, "grad_norm": 0.32758021354675293, "learning_rate": 1.8604853171185098e-05, "loss": 0.0925, "step": 1555 }, { "epoch": 0.3914925654056089, "grad_norm": 0.11192046850919724, "learning_rate": 1.859447978914614e-05, "loss": 0.0815, "step": 1560 }, { "epoch": 0.3927473492690884, "grad_norm": 0.17995843291282654, "learning_rate": 1.8584070899208468e-05, "loss": 0.0823, "step": 1565 }, { "epoch": 0.39400213313256793, "grad_norm": 0.3672018349170685, "learning_rate": 1.857362654437618e-05, "loss": 0.0955, "step": 1570 }, { "epoch": 0.3952569169960474, "grad_norm": 0.11518882215023041, "learning_rate": 1.8563146767799884e-05, "loss": 0.0955, "step": 1575 }, { "epoch": 0.39651170085952697, "grad_norm": 0.30487769842147827, "learning_rate": 1.8552631612776554e-05, "loss": 0.0842, "step": 1580 }, { "epoch": 0.39776648472300646, "grad_norm": 0.6982203722000122, "learning_rate": 1.85420811227493e-05, "loss": 0.0801, "step": 1585 }, { "epoch": 0.39902126858648596, "grad_norm": 0.22043141722679138, "learning_rate": 1.853149534130724e-05, "loss": 0.1039, "step": 1590 }, { "epoch": 0.4002760524499655, "grad_norm": 0.3814282715320587, "learning_rate": 1.8520874312185292e-05, "loss": 0.0861, "step": 1595 }, { "epoch": 0.401530836313445, "grad_norm": 0.14649051427841187, "learning_rate": 1.8510218079263995e-05, "loss": 0.09, "step": 1600 }, { "epoch": 0.40278562017692454, "grad_norm": 0.16551180183887482, "learning_rate": 1.849952668656933e-05, "loss": 0.079, "step": 1605 }, { "epoch": 0.40404040404040403, "grad_norm": 0.17042899131774902, "learning_rate": 1.8488800178272553e-05, "loss": 0.1073, "step": 1610 }, { "epoch": 0.4052951879038836, "grad_norm": 0.4069715440273285, "learning_rate": 1.847803859868998e-05, "loss": 0.0948, "step": 1615 }, { "epoch": 0.40654997176736307, "grad_norm": 0.18239188194274902, "learning_rate": 1.8467241992282842e-05, "loss": 0.0934, "step": 1620 }, { "epoch": 0.40780475563084256, "grad_norm": 0.4659058749675751, "learning_rate": 1.845641040365707e-05, "loss": 0.0953, "step": 1625 }, { "epoch": 0.4090595394943221, "grad_norm": 0.1839870810508728, "learning_rate": 1.844554387756313e-05, "loss": 0.0874, "step": 1630 }, { "epoch": 0.4103143233578016, "grad_norm": 0.8208329081535339, "learning_rate": 1.8434642458895823e-05, "loss": 0.0993, "step": 1635 }, { "epoch": 0.41156910722128115, "grad_norm": 0.3736049234867096, "learning_rate": 1.8423706192694118e-05, "loss": 0.0892, "step": 1640 }, { "epoch": 0.41282389108476064, "grad_norm": 0.1566988229751587, "learning_rate": 1.841273512414095e-05, "loss": 0.0893, "step": 1645 }, { "epoch": 0.4140786749482402, "grad_norm": 0.5364568829536438, "learning_rate": 1.840172929856304e-05, "loss": 0.1002, "step": 1650 }, { "epoch": 0.4153334588117197, "grad_norm": 0.41213005781173706, "learning_rate": 1.8390688761430707e-05, "loss": 0.0942, "step": 1655 }, { "epoch": 0.41658824267519917, "grad_norm": 0.417540580034256, "learning_rate": 1.8379613558357686e-05, "loss": 0.09, "step": 1660 }, { "epoch": 0.4178430265386787, "grad_norm": 0.30479973554611206, "learning_rate": 1.836850373510092e-05, "loss": 0.0819, "step": 1665 }, { "epoch": 0.4190978104021582, "grad_norm": 0.5476796627044678, "learning_rate": 1.8357359337560393e-05, "loss": 0.0835, "step": 1670 }, { "epoch": 0.42035259426563776, "grad_norm": 0.14834348857402802, "learning_rate": 1.8346180411778934e-05, "loss": 0.081, "step": 1675 }, { "epoch": 0.42160737812911725, "grad_norm": 0.49090176820755005, "learning_rate": 1.833496700394202e-05, "loss": 0.0864, "step": 1680 }, { "epoch": 0.4228621619925968, "grad_norm": 0.17847497761249542, "learning_rate": 1.83237191603776e-05, "loss": 0.0705, "step": 1685 }, { "epoch": 0.4241169458560763, "grad_norm": 0.22627241909503937, "learning_rate": 1.831243692755587e-05, "loss": 0.0757, "step": 1690 }, { "epoch": 0.4253717297195558, "grad_norm": 0.6790867447853088, "learning_rate": 1.830112035208913e-05, "loss": 0.086, "step": 1695 }, { "epoch": 0.4266265135830353, "grad_norm": 0.33606070280075073, "learning_rate": 1.828976948073155e-05, "loss": 0.0954, "step": 1700 }, { "epoch": 0.4278812974465148, "grad_norm": 0.15440745651721954, "learning_rate": 1.8278384360379008e-05, "loss": 0.0877, "step": 1705 }, { "epoch": 0.42913608130999437, "grad_norm": 0.3315647542476654, "learning_rate": 1.8266965038068856e-05, "loss": 0.0856, "step": 1710 }, { "epoch": 0.43039086517347386, "grad_norm": 0.2767457365989685, "learning_rate": 1.8255511560979782e-05, "loss": 0.0946, "step": 1715 }, { "epoch": 0.4316456490369534, "grad_norm": 0.24647273123264313, "learning_rate": 1.824402397643155e-05, "loss": 0.1019, "step": 1720 }, { "epoch": 0.4329004329004329, "grad_norm": 0.5085861086845398, "learning_rate": 1.823250233188487e-05, "loss": 0.0915, "step": 1725 }, { "epoch": 0.43415521676391244, "grad_norm": 0.2605689465999603, "learning_rate": 1.822094667494115e-05, "loss": 0.0787, "step": 1730 }, { "epoch": 0.43541000062739194, "grad_norm": 0.2040780782699585, "learning_rate": 1.8209357053342325e-05, "loss": 0.0897, "step": 1735 }, { "epoch": 0.4366647844908714, "grad_norm": 0.12423911690711975, "learning_rate": 1.8197733514970655e-05, "loss": 0.1005, "step": 1740 }, { "epoch": 0.437919568354351, "grad_norm": 0.08479491621255875, "learning_rate": 1.8186076107848524e-05, "loss": 0.0871, "step": 1745 }, { "epoch": 0.43917435221783047, "grad_norm": 0.2682114541530609, "learning_rate": 1.8174384880138247e-05, "loss": 0.0907, "step": 1750 }, { "epoch": 0.44042913608131, "grad_norm": 0.32582300901412964, "learning_rate": 1.8162659880141865e-05, "loss": 0.0893, "step": 1755 }, { "epoch": 0.4416839199447895, "grad_norm": 0.4764627516269684, "learning_rate": 1.8150901156300956e-05, "loss": 0.0892, "step": 1760 }, { "epoch": 0.44293870380826905, "grad_norm": 0.2317512184381485, "learning_rate": 1.8139108757196412e-05, "loss": 0.0972, "step": 1765 }, { "epoch": 0.44419348767174854, "grad_norm": 0.1146375834941864, "learning_rate": 1.812728273154827e-05, "loss": 0.1024, "step": 1770 }, { "epoch": 0.44544827153522804, "grad_norm": 0.07953114807605743, "learning_rate": 1.8115423128215485e-05, "loss": 0.0892, "step": 1775 }, { "epoch": 0.4467030553987076, "grad_norm": 0.5142281651496887, "learning_rate": 1.810352999619574e-05, "loss": 0.1007, "step": 1780 }, { "epoch": 0.4479578392621871, "grad_norm": 0.35662662982940674, "learning_rate": 1.8091603384625243e-05, "loss": 0.0937, "step": 1785 }, { "epoch": 0.4492126231256666, "grad_norm": 0.8468720316886902, "learning_rate": 1.8079643342778516e-05, "loss": 0.0987, "step": 1790 }, { "epoch": 0.4504674069891461, "grad_norm": 0.34367749094963074, "learning_rate": 1.80676499200682e-05, "loss": 0.087, "step": 1795 }, { "epoch": 0.45172219085262566, "grad_norm": 0.21747180819511414, "learning_rate": 1.8055623166044855e-05, "loss": 0.0883, "step": 1800 }, { "epoch": 0.45297697471610515, "grad_norm": 0.4869808554649353, "learning_rate": 1.8043563130396738e-05, "loss": 0.0989, "step": 1805 }, { "epoch": 0.45423175857958464, "grad_norm": 0.2503896653652191, "learning_rate": 1.8031469862949618e-05, "loss": 0.083, "step": 1810 }, { "epoch": 0.4554865424430642, "grad_norm": 0.34971126914024353, "learning_rate": 1.801934341366655e-05, "loss": 0.0845, "step": 1815 }, { "epoch": 0.4567413263065437, "grad_norm": 0.6193472743034363, "learning_rate": 1.800718383264769e-05, "loss": 0.0966, "step": 1820 }, { "epoch": 0.45799611017002323, "grad_norm": 0.5469531416893005, "learning_rate": 1.799499117013007e-05, "loss": 0.0844, "step": 1825 }, { "epoch": 0.4592508940335027, "grad_norm": 0.2877050042152405, "learning_rate": 1.7982765476487398e-05, "loss": 0.0929, "step": 1830 }, { "epoch": 0.46050567789698227, "grad_norm": 0.2750946283340454, "learning_rate": 1.797050680222985e-05, "loss": 0.0883, "step": 1835 }, { "epoch": 0.46176046176046176, "grad_norm": 0.4805077016353607, "learning_rate": 1.7958215198003866e-05, "loss": 0.0916, "step": 1840 }, { "epoch": 0.46301524562394125, "grad_norm": 0.4399943947792053, "learning_rate": 1.7945890714591926e-05, "loss": 0.099, "step": 1845 }, { "epoch": 0.4642700294874208, "grad_norm": 0.8317438364028931, "learning_rate": 1.7933533402912354e-05, "loss": 0.0836, "step": 1850 }, { "epoch": 0.4655248133509003, "grad_norm": 0.26974841952323914, "learning_rate": 1.7921143314019106e-05, "loss": 0.1013, "step": 1855 }, { "epoch": 0.46677959721437984, "grad_norm": 0.2595142722129822, "learning_rate": 1.7908720499101552e-05, "loss": 0.0881, "step": 1860 }, { "epoch": 0.46803438107785933, "grad_norm": 0.18567359447479248, "learning_rate": 1.789626500948427e-05, "loss": 0.0929, "step": 1865 }, { "epoch": 0.4692891649413389, "grad_norm": 0.11302470415830612, "learning_rate": 1.7883776896626836e-05, "loss": 0.0882, "step": 1870 }, { "epoch": 0.47054394880481837, "grad_norm": 0.06441876292228699, "learning_rate": 1.7871256212123605e-05, "loss": 0.091, "step": 1875 }, { "epoch": 0.47179873266829786, "grad_norm": 0.5252417922019958, "learning_rate": 1.78587030077035e-05, "loss": 0.0995, "step": 1880 }, { "epoch": 0.4730535165317774, "grad_norm": 0.2045115977525711, "learning_rate": 1.7846117335229808e-05, "loss": 0.0889, "step": 1885 }, { "epoch": 0.4743083003952569, "grad_norm": 0.22728148102760315, "learning_rate": 1.783349924669994e-05, "loss": 0.1018, "step": 1890 }, { "epoch": 0.47556308425873645, "grad_norm": 0.17804867029190063, "learning_rate": 1.7820848794245243e-05, "loss": 0.1001, "step": 1895 }, { "epoch": 0.47681786812221594, "grad_norm": 0.5709582567214966, "learning_rate": 1.7808166030130782e-05, "loss": 0.087, "step": 1900 }, { "epoch": 0.4780726519856955, "grad_norm": 0.19447216391563416, "learning_rate": 1.779545100675511e-05, "loss": 0.0881, "step": 1905 }, { "epoch": 0.479327435849175, "grad_norm": 0.5337616801261902, "learning_rate": 1.778270377665005e-05, "loss": 0.0954, "step": 1910 }, { "epoch": 0.48058221971265447, "grad_norm": 0.8371096253395081, "learning_rate": 1.77699243924805e-05, "loss": 0.0886, "step": 1915 }, { "epoch": 0.481837003576134, "grad_norm": 0.21723900735378265, "learning_rate": 1.77571129070442e-05, "loss": 0.0899, "step": 1920 }, { "epoch": 0.4830917874396135, "grad_norm": 0.18002496659755707, "learning_rate": 1.7744269373271507e-05, "loss": 0.0876, "step": 1925 }, { "epoch": 0.48434657130309305, "grad_norm": 0.15647904574871063, "learning_rate": 1.7731393844225187e-05, "loss": 0.0842, "step": 1930 }, { "epoch": 0.48560135516657255, "grad_norm": 0.36643457412719727, "learning_rate": 1.7718486373100207e-05, "loss": 0.0848, "step": 1935 }, { "epoch": 0.4868561390300521, "grad_norm": 0.37278589606285095, "learning_rate": 1.7705547013223486e-05, "loss": 0.1, "step": 1940 }, { "epoch": 0.4881109228935316, "grad_norm": 0.18367764353752136, "learning_rate": 1.7692575818053696e-05, "loss": 0.0969, "step": 1945 }, { "epoch": 0.48936570675701113, "grad_norm": 0.4618573486804962, "learning_rate": 1.7679572841181033e-05, "loss": 0.0854, "step": 1950 }, { "epoch": 0.4906204906204906, "grad_norm": 0.2800655961036682, "learning_rate": 1.7666538136327007e-05, "loss": 0.0942, "step": 1955 }, { "epoch": 0.4918752744839701, "grad_norm": 0.2937714457511902, "learning_rate": 1.7653471757344203e-05, "loss": 0.0977, "step": 1960 }, { "epoch": 0.49313005834744966, "grad_norm": 0.7581244707107544, "learning_rate": 1.7640373758216075e-05, "loss": 0.0939, "step": 1965 }, { "epoch": 0.49438484221092915, "grad_norm": 0.21476201713085175, "learning_rate": 1.7627244193056705e-05, "loss": 0.0936, "step": 1970 }, { "epoch": 0.4956396260744087, "grad_norm": 0.1473897099494934, "learning_rate": 1.7614083116110597e-05, "loss": 0.0859, "step": 1975 }, { "epoch": 0.4968944099378882, "grad_norm": 0.4635510742664337, "learning_rate": 1.7600890581752435e-05, "loss": 0.0984, "step": 1980 }, { "epoch": 0.49814919380136774, "grad_norm": 0.15860065817832947, "learning_rate": 1.758766664448689e-05, "loss": 0.0934, "step": 1985 }, { "epoch": 0.49940397766484723, "grad_norm": 0.10272369533777237, "learning_rate": 1.7574411358948347e-05, "loss": 0.0749, "step": 1990 }, { "epoch": 0.5006587615283268, "grad_norm": 0.3367800712585449, "learning_rate": 1.7561124779900723e-05, "loss": 0.0809, "step": 1995 }, { "epoch": 0.5019135453918062, "grad_norm": 0.23143918812274933, "learning_rate": 1.7547806962237222e-05, "loss": 0.1092, "step": 2000 }, { "epoch": 0.5031683292552858, "grad_norm": 0.5789488554000854, "learning_rate": 1.7534457960980097e-05, "loss": 0.1088, "step": 2005 }, { "epoch": 0.5044231131187653, "grad_norm": 0.2879166007041931, "learning_rate": 1.7521077831280453e-05, "loss": 0.0873, "step": 2010 }, { "epoch": 0.5056778969822449, "grad_norm": 0.6664144396781921, "learning_rate": 1.750766662841799e-05, "loss": 0.0984, "step": 2015 }, { "epoch": 0.5069326808457243, "grad_norm": 0.13004139065742493, "learning_rate": 1.7494224407800792e-05, "loss": 0.0879, "step": 2020 }, { "epoch": 0.5081874647092038, "grad_norm": 0.30764704942703247, "learning_rate": 1.7480751224965083e-05, "loss": 0.093, "step": 2025 }, { "epoch": 0.5094422485726834, "grad_norm": 0.15937355160713196, "learning_rate": 1.7467247135575016e-05, "loss": 0.088, "step": 2030 }, { "epoch": 0.5106970324361628, "grad_norm": 0.12112176418304443, "learning_rate": 1.7453712195422432e-05, "loss": 0.0967, "step": 2035 }, { "epoch": 0.5119518162996424, "grad_norm": 0.35342133045196533, "learning_rate": 1.744014646042663e-05, "loss": 0.0834, "step": 2040 }, { "epoch": 0.5132066001631219, "grad_norm": 0.39632728695869446, "learning_rate": 1.7426549986634135e-05, "loss": 0.0866, "step": 2045 }, { "epoch": 0.5144613840266015, "grad_norm": 0.4647534489631653, "learning_rate": 1.741292283021847e-05, "loss": 0.0834, "step": 2050 }, { "epoch": 0.5157161678900809, "grad_norm": 0.3376861810684204, "learning_rate": 1.7399265047479926e-05, "loss": 0.0771, "step": 2055 }, { "epoch": 0.5169709517535604, "grad_norm": 0.3715597689151764, "learning_rate": 1.7385576694845324e-05, "loss": 0.0886, "step": 2060 }, { "epoch": 0.51822573561704, "grad_norm": 0.4062286615371704, "learning_rate": 1.7371857828867778e-05, "loss": 0.0886, "step": 2065 }, { "epoch": 0.5194805194805194, "grad_norm": 0.2665550708770752, "learning_rate": 1.7358108506226477e-05, "loss": 0.0918, "step": 2070 }, { "epoch": 0.520735303343999, "grad_norm": 0.4846581220626831, "learning_rate": 1.7344328783726436e-05, "loss": 0.0834, "step": 2075 }, { "epoch": 0.5219900872074785, "grad_norm": 0.2385398894548416, "learning_rate": 1.7330518718298263e-05, "loss": 0.0953, "step": 2080 }, { "epoch": 0.5232448710709581, "grad_norm": 0.5059955716133118, "learning_rate": 1.7316678366997935e-05, "loss": 0.0796, "step": 2085 }, { "epoch": 0.5244996549344375, "grad_norm": 0.5229114294052124, "learning_rate": 1.7302807787006547e-05, "loss": 0.0953, "step": 2090 }, { "epoch": 0.5257544387979171, "grad_norm": 0.3612153232097626, "learning_rate": 1.728890703563009e-05, "loss": 0.0874, "step": 2095 }, { "epoch": 0.5270092226613966, "grad_norm": 0.2730209529399872, "learning_rate": 1.7274976170299197e-05, "loss": 0.0977, "step": 2100 }, { "epoch": 0.528264006524876, "grad_norm": 0.28223976492881775, "learning_rate": 1.726101524856893e-05, "loss": 0.0901, "step": 2105 }, { "epoch": 0.5295187903883556, "grad_norm": 0.24942927062511444, "learning_rate": 1.724702432811852e-05, "loss": 0.0978, "step": 2110 }, { "epoch": 0.5307735742518351, "grad_norm": 0.21909397840499878, "learning_rate": 1.7233003466751133e-05, "loss": 0.0978, "step": 2115 }, { "epoch": 0.5320283581153147, "grad_norm": 0.28112301230430603, "learning_rate": 1.7218952722393646e-05, "loss": 0.0899, "step": 2120 }, { "epoch": 0.5332831419787941, "grad_norm": 0.23570087552070618, "learning_rate": 1.7204872153096386e-05, "loss": 0.0839, "step": 2125 }, { "epoch": 0.5345379258422737, "grad_norm": 0.5193778872489929, "learning_rate": 1.719076181703291e-05, "loss": 0.1028, "step": 2130 }, { "epoch": 0.5357927097057532, "grad_norm": 0.2834993600845337, "learning_rate": 1.7176621772499752e-05, "loss": 0.0984, "step": 2135 }, { "epoch": 0.5370474935692326, "grad_norm": 0.41437774896621704, "learning_rate": 1.716245207791618e-05, "loss": 0.092, "step": 2140 }, { "epoch": 0.5383022774327122, "grad_norm": 0.4809644818305969, "learning_rate": 1.714825279182398e-05, "loss": 0.0814, "step": 2145 }, { "epoch": 0.5395570612961917, "grad_norm": 0.10345987975597382, "learning_rate": 1.7134023972887164e-05, "loss": 0.0736, "step": 2150 }, { "epoch": 0.5408118451596713, "grad_norm": 0.06666211783885956, "learning_rate": 1.7119765679891794e-05, "loss": 0.078, "step": 2155 }, { "epoch": 0.5420666290231507, "grad_norm": 0.4196692407131195, "learning_rate": 1.7105477971745668e-05, "loss": 0.0963, "step": 2160 }, { "epoch": 0.5433214128866303, "grad_norm": 0.1562952995300293, "learning_rate": 1.7091160907478137e-05, "loss": 0.0777, "step": 2165 }, { "epoch": 0.5445761967501098, "grad_norm": 0.2740727663040161, "learning_rate": 1.7076814546239825e-05, "loss": 0.0953, "step": 2170 }, { "epoch": 0.5458309806135893, "grad_norm": 0.2551318407058716, "learning_rate": 1.7062438947302405e-05, "loss": 0.0775, "step": 2175 }, { "epoch": 0.5470857644770688, "grad_norm": 0.9234172105789185, "learning_rate": 1.704803417005833e-05, "loss": 0.089, "step": 2180 }, { "epoch": 0.5483405483405484, "grad_norm": 0.29249778389930725, "learning_rate": 1.7033600274020616e-05, "loss": 0.0925, "step": 2185 }, { "epoch": 0.5495953322040279, "grad_norm": 0.1841478943824768, "learning_rate": 1.7019137318822577e-05, "loss": 0.0937, "step": 2190 }, { "epoch": 0.5508501160675073, "grad_norm": 0.15575248003005981, "learning_rate": 1.7004645364217584e-05, "loss": 0.0894, "step": 2195 }, { "epoch": 0.5521048999309869, "grad_norm": 0.37570688128471375, "learning_rate": 1.699012447007882e-05, "loss": 0.0853, "step": 2200 }, { "epoch": 0.5533596837944664, "grad_norm": 0.49081242084503174, "learning_rate": 1.6975574696399033e-05, "loss": 0.0938, "step": 2205 }, { "epoch": 0.5546144676579459, "grad_norm": 0.39014649391174316, "learning_rate": 1.6960996103290282e-05, "loss": 0.0876, "step": 2210 }, { "epoch": 0.5558692515214254, "grad_norm": 0.48171404004096985, "learning_rate": 1.694638875098369e-05, "loss": 0.0865, "step": 2215 }, { "epoch": 0.557124035384905, "grad_norm": 0.18437805771827698, "learning_rate": 1.693175269982921e-05, "loss": 0.0895, "step": 2220 }, { "epoch": 0.5583788192483845, "grad_norm": 0.35959693789482117, "learning_rate": 1.691708801029535e-05, "loss": 0.0898, "step": 2225 }, { "epoch": 0.5596336031118639, "grad_norm": 0.5558937191963196, "learning_rate": 1.6902394742968945e-05, "loss": 0.0974, "step": 2230 }, { "epoch": 0.5608883869753435, "grad_norm": 0.2495732605457306, "learning_rate": 1.68876729585549e-05, "loss": 0.0879, "step": 2235 }, { "epoch": 0.562143170838823, "grad_norm": 0.1926206350326538, "learning_rate": 1.6872922717875923e-05, "loss": 0.0889, "step": 2240 }, { "epoch": 0.5633979547023026, "grad_norm": 0.17689497768878937, "learning_rate": 1.6858144081872315e-05, "loss": 0.0924, "step": 2245 }, { "epoch": 0.564652738565782, "grad_norm": 0.10556932538747787, "learning_rate": 1.6843337111601663e-05, "loss": 0.086, "step": 2250 }, { "epoch": 0.5659075224292616, "grad_norm": 0.19952704012393951, "learning_rate": 1.6828501868238637e-05, "loss": 0.0801, "step": 2255 }, { "epoch": 0.5671623062927411, "grad_norm": 0.2911912202835083, "learning_rate": 1.6813638413074707e-05, "loss": 0.0925, "step": 2260 }, { "epoch": 0.5684170901562206, "grad_norm": 0.23794835805892944, "learning_rate": 1.67987468075179e-05, "loss": 0.0913, "step": 2265 }, { "epoch": 0.5696718740197001, "grad_norm": 0.1402294784784317, "learning_rate": 1.6783827113092547e-05, "loss": 0.0882, "step": 2270 }, { "epoch": 0.5709266578831796, "grad_norm": 0.15708528459072113, "learning_rate": 1.6768879391439035e-05, "loss": 0.0941, "step": 2275 }, { "epoch": 0.5721814417466592, "grad_norm": 0.37110403180122375, "learning_rate": 1.6753903704313527e-05, "loss": 0.0969, "step": 2280 }, { "epoch": 0.5734362256101386, "grad_norm": 0.2311350554227829, "learning_rate": 1.6738900113587745e-05, "loss": 0.0849, "step": 2285 }, { "epoch": 0.5746910094736182, "grad_norm": 0.22549764811992645, "learning_rate": 1.6723868681248677e-05, "loss": 0.0923, "step": 2290 }, { "epoch": 0.5759457933370977, "grad_norm": 0.12788964807987213, "learning_rate": 1.6708809469398347e-05, "loss": 0.0798, "step": 2295 }, { "epoch": 0.5772005772005772, "grad_norm": 0.4113115668296814, "learning_rate": 1.6693722540253554e-05, "loss": 0.0794, "step": 2300 }, { "epoch": 0.5784553610640567, "grad_norm": 0.28315308690071106, "learning_rate": 1.6678607956145596e-05, "loss": 0.0855, "step": 2305 }, { "epoch": 0.5797101449275363, "grad_norm": 0.18012386560440063, "learning_rate": 1.6663465779520042e-05, "loss": 0.0906, "step": 2310 }, { "epoch": 0.5809649287910158, "grad_norm": 0.0916949212551117, "learning_rate": 1.6648296072936445e-05, "loss": 0.0807, "step": 2315 }, { "epoch": 0.5822197126544952, "grad_norm": 0.3671141266822815, "learning_rate": 1.6633098899068112e-05, "loss": 0.093, "step": 2320 }, { "epoch": 0.5834744965179748, "grad_norm": 0.16488447785377502, "learning_rate": 1.6617874320701813e-05, "loss": 0.0883, "step": 2325 }, { "epoch": 0.5847292803814543, "grad_norm": 0.291689395904541, "learning_rate": 1.660262240073756e-05, "loss": 0.0908, "step": 2330 }, { "epoch": 0.5859840642449338, "grad_norm": 0.25021910667419434, "learning_rate": 1.658734320218831e-05, "loss": 0.0799, "step": 2335 }, { "epoch": 0.5872388481084133, "grad_norm": 0.11796696484088898, "learning_rate": 1.6572036788179728e-05, "loss": 0.0904, "step": 2340 }, { "epoch": 0.5884936319718929, "grad_norm": 0.14167477190494537, "learning_rate": 1.6556703221949912e-05, "loss": 0.0912, "step": 2345 }, { "epoch": 0.5897484158353724, "grad_norm": 0.12993638217449188, "learning_rate": 1.6541342566849145e-05, "loss": 0.0851, "step": 2350 }, { "epoch": 0.5910031996988518, "grad_norm": 0.23770704865455627, "learning_rate": 1.652595488633963e-05, "loss": 0.0898, "step": 2355 }, { "epoch": 0.5922579835623314, "grad_norm": 0.5402049422264099, "learning_rate": 1.6510540243995216e-05, "loss": 0.0882, "step": 2360 }, { "epoch": 0.5935127674258109, "grad_norm": 0.16188447177410126, "learning_rate": 1.6495098703501153e-05, "loss": 0.0781, "step": 2365 }, { "epoch": 0.5947675512892904, "grad_norm": 0.33231815695762634, "learning_rate": 1.6479630328653814e-05, "loss": 0.0879, "step": 2370 }, { "epoch": 0.5960223351527699, "grad_norm": 0.3051691949367523, "learning_rate": 1.6464135183360444e-05, "loss": 0.1065, "step": 2375 }, { "epoch": 0.5972771190162495, "grad_norm": 0.09856946021318436, "learning_rate": 1.6448613331638877e-05, "loss": 0.0931, "step": 2380 }, { "epoch": 0.598531902879729, "grad_norm": 0.3320614993572235, "learning_rate": 1.6433064837617294e-05, "loss": 0.0806, "step": 2385 }, { "epoch": 0.5997866867432085, "grad_norm": 0.6440458297729492, "learning_rate": 1.641748976553395e-05, "loss": 0.0981, "step": 2390 }, { "epoch": 0.601041470606688, "grad_norm": 0.24246153235435486, "learning_rate": 1.64018881797369e-05, "loss": 0.088, "step": 2395 }, { "epoch": 0.6022962544701675, "grad_norm": 0.21367953717708588, "learning_rate": 1.6386260144683744e-05, "loss": 0.087, "step": 2400 }, { "epoch": 0.603551038333647, "grad_norm": 0.2851586639881134, "learning_rate": 1.6370605724941356e-05, "loss": 0.0877, "step": 2405 }, { "epoch": 0.6048058221971265, "grad_norm": 0.19967404007911682, "learning_rate": 1.6354924985185614e-05, "loss": 0.0888, "step": 2410 }, { "epoch": 0.6060606060606061, "grad_norm": 0.22060827910900116, "learning_rate": 1.633921799020114e-05, "loss": 0.0845, "step": 2415 }, { "epoch": 0.6073153899240856, "grad_norm": 0.4157061278820038, "learning_rate": 1.632348480488103e-05, "loss": 0.0899, "step": 2420 }, { "epoch": 0.6085701737875651, "grad_norm": 0.27656257152557373, "learning_rate": 1.6307725494226586e-05, "loss": 0.09, "step": 2425 }, { "epoch": 0.6098249576510446, "grad_norm": 0.3946853578090668, "learning_rate": 1.6291940123347033e-05, "loss": 0.0863, "step": 2430 }, { "epoch": 0.6110797415145242, "grad_norm": 0.2414676994085312, "learning_rate": 1.6276128757459282e-05, "loss": 0.0905, "step": 2435 }, { "epoch": 0.6123345253780036, "grad_norm": 0.23438376188278198, "learning_rate": 1.6260291461887628e-05, "loss": 0.0726, "step": 2440 }, { "epoch": 0.6135893092414831, "grad_norm": 0.20305827260017395, "learning_rate": 1.6244428302063506e-05, "loss": 0.0891, "step": 2445 }, { "epoch": 0.6148440931049627, "grad_norm": 0.3930656909942627, "learning_rate": 1.62285393435252e-05, "loss": 0.0884, "step": 2450 }, { "epoch": 0.6160988769684422, "grad_norm": 0.23331406712532043, "learning_rate": 1.6212624651917573e-05, "loss": 0.0904, "step": 2455 }, { "epoch": 0.6173536608319217, "grad_norm": 0.34075310826301575, "learning_rate": 1.6196684292991827e-05, "loss": 0.0941, "step": 2460 }, { "epoch": 0.6186084446954012, "grad_norm": 0.33201703429222107, "learning_rate": 1.6180718332605185e-05, "loss": 0.0838, "step": 2465 }, { "epoch": 0.6198632285588808, "grad_norm": 0.10392648726701736, "learning_rate": 1.6164726836720656e-05, "loss": 0.0974, "step": 2470 }, { "epoch": 0.6211180124223602, "grad_norm": 0.21417252719402313, "learning_rate": 1.614870987140674e-05, "loss": 0.0891, "step": 2475 }, { "epoch": 0.6223727962858397, "grad_norm": 0.24638135731220245, "learning_rate": 1.6132667502837164e-05, "loss": 0.0827, "step": 2480 }, { "epoch": 0.6236275801493193, "grad_norm": 0.183788001537323, "learning_rate": 1.611659979729062e-05, "loss": 0.0917, "step": 2485 }, { "epoch": 0.6248823640127988, "grad_norm": 0.25125667452812195, "learning_rate": 1.6100506821150455e-05, "loss": 0.0888, "step": 2490 }, { "epoch": 0.6261371478762783, "grad_norm": 0.24602219462394714, "learning_rate": 1.6084388640904452e-05, "loss": 0.0773, "step": 2495 }, { "epoch": 0.6273919317397578, "grad_norm": 0.6547009944915771, "learning_rate": 1.60682453231445e-05, "loss": 0.1033, "step": 2500 }, { "epoch": 0.6286467156032374, "grad_norm": 0.11527785658836365, "learning_rate": 1.605207693456635e-05, "loss": 0.0947, "step": 2505 }, { "epoch": 0.6299014994667168, "grad_norm": 0.3719431459903717, "learning_rate": 1.6035883541969336e-05, "loss": 0.0873, "step": 2510 }, { "epoch": 0.6311562833301964, "grad_norm": 0.2360219806432724, "learning_rate": 1.601966521225609e-05, "loss": 0.0826, "step": 2515 }, { "epoch": 0.6324110671936759, "grad_norm": 0.27890655398368835, "learning_rate": 1.6003422012432275e-05, "loss": 0.0955, "step": 2520 }, { "epoch": 0.6336658510571554, "grad_norm": 0.14903779327869415, "learning_rate": 1.5987154009606308e-05, "loss": 0.0951, "step": 2525 }, { "epoch": 0.6349206349206349, "grad_norm": 0.20811066031455994, "learning_rate": 1.5970861270989065e-05, "loss": 0.0874, "step": 2530 }, { "epoch": 0.6361754187841144, "grad_norm": 0.39913272857666016, "learning_rate": 1.5954543863893638e-05, "loss": 0.0846, "step": 2535 }, { "epoch": 0.637430202647594, "grad_norm": 0.3802116811275482, "learning_rate": 1.5938201855735017e-05, "loss": 0.0754, "step": 2540 }, { "epoch": 0.6386849865110734, "grad_norm": 0.4720747172832489, "learning_rate": 1.592183531402984e-05, "loss": 0.0872, "step": 2545 }, { "epoch": 0.639939770374553, "grad_norm": 0.46679922938346863, "learning_rate": 1.590544430639611e-05, "loss": 0.0867, "step": 2550 }, { "epoch": 0.6411945542380325, "grad_norm": 0.29669615626335144, "learning_rate": 1.5889028900552897e-05, "loss": 0.0988, "step": 2555 }, { "epoch": 0.6424493381015121, "grad_norm": 0.20683659613132477, "learning_rate": 1.587258916432008e-05, "loss": 0.0935, "step": 2560 }, { "epoch": 0.6437041219649915, "grad_norm": 0.3284091055393219, "learning_rate": 1.5856125165618056e-05, "loss": 0.0779, "step": 2565 }, { "epoch": 0.644958905828471, "grad_norm": 0.24657922983169556, "learning_rate": 1.5839636972467466e-05, "loss": 0.0827, "step": 2570 }, { "epoch": 0.6462136896919506, "grad_norm": 0.32977283000946045, "learning_rate": 1.5823124652988907e-05, "loss": 0.0955, "step": 2575 }, { "epoch": 0.64746847355543, "grad_norm": 0.28097760677337646, "learning_rate": 1.580658827540265e-05, "loss": 0.0837, "step": 2580 }, { "epoch": 0.6487232574189096, "grad_norm": 0.12431260943412781, "learning_rate": 1.5790027908028366e-05, "loss": 0.0901, "step": 2585 }, { "epoch": 0.6499780412823891, "grad_norm": 0.22479486465454102, "learning_rate": 1.5773443619284844e-05, "loss": 0.084, "step": 2590 }, { "epoch": 0.6512328251458687, "grad_norm": 0.36471477150917053, "learning_rate": 1.5756835477689683e-05, "loss": 0.0811, "step": 2595 }, { "epoch": 0.6524876090093481, "grad_norm": 0.11465982347726822, "learning_rate": 1.574020355185906e-05, "loss": 0.0775, "step": 2600 }, { "epoch": 0.6537423928728276, "grad_norm": 0.22774779796600342, "learning_rate": 1.5723547910507392e-05, "loss": 0.0893, "step": 2605 }, { "epoch": 0.6549971767363072, "grad_norm": 0.17315097153186798, "learning_rate": 1.5706868622447084e-05, "loss": 0.0924, "step": 2610 }, { "epoch": 0.6562519605997866, "grad_norm": 0.21010887622833252, "learning_rate": 1.5690165756588235e-05, "loss": 0.085, "step": 2615 }, { "epoch": 0.6575067444632662, "grad_norm": 0.20467358827590942, "learning_rate": 1.5673439381938365e-05, "loss": 0.0857, "step": 2620 }, { "epoch": 0.6587615283267457, "grad_norm": 0.0706283450126648, "learning_rate": 1.565668956760211e-05, "loss": 0.0921, "step": 2625 }, { "epoch": 0.6600163121902253, "grad_norm": 0.10635245591402054, "learning_rate": 1.563991638278094e-05, "loss": 0.0901, "step": 2630 }, { "epoch": 0.6612710960537047, "grad_norm": 0.2414940446615219, "learning_rate": 1.56231198967729e-05, "loss": 0.0979, "step": 2635 }, { "epoch": 0.6625258799171843, "grad_norm": 0.3018660843372345, "learning_rate": 1.560630017897229e-05, "loss": 0.0774, "step": 2640 }, { "epoch": 0.6637806637806638, "grad_norm": 0.08618688583374023, "learning_rate": 1.558945729886938e-05, "loss": 0.0917, "step": 2645 }, { "epoch": 0.6650354476441434, "grad_norm": 0.3376272916793823, "learning_rate": 1.5572591326050167e-05, "loss": 0.0997, "step": 2650 }, { "epoch": 0.6662902315076228, "grad_norm": 0.47232964634895325, "learning_rate": 1.5555702330196024e-05, "loss": 0.0984, "step": 2655 }, { "epoch": 0.6675450153711023, "grad_norm": 0.3733605742454529, "learning_rate": 1.5538790381083457e-05, "loss": 0.095, "step": 2660 }, { "epoch": 0.6687997992345819, "grad_norm": 0.197517529129982, "learning_rate": 1.5521855548583807e-05, "loss": 0.0882, "step": 2665 }, { "epoch": 0.6700545830980613, "grad_norm": 0.5959796905517578, "learning_rate": 1.550489790266294e-05, "loss": 0.0978, "step": 2670 }, { "epoch": 0.6713093669615409, "grad_norm": 0.3124695122241974, "learning_rate": 1.5487917513381e-05, "loss": 0.0877, "step": 2675 }, { "epoch": 0.6725641508250204, "grad_norm": 0.5856664776802063, "learning_rate": 1.5470914450892066e-05, "loss": 0.0871, "step": 2680 }, { "epoch": 0.6738189346885, "grad_norm": 0.11247047036886215, "learning_rate": 1.5453888785443916e-05, "loss": 0.0787, "step": 2685 }, { "epoch": 0.6750737185519794, "grad_norm": 0.21094247698783875, "learning_rate": 1.54368405873777e-05, "loss": 0.0905, "step": 2690 }, { "epoch": 0.6763285024154589, "grad_norm": 0.32155776023864746, "learning_rate": 1.5419769927127664e-05, "loss": 0.0896, "step": 2695 }, { "epoch": 0.6775832862789385, "grad_norm": 0.1487855464220047, "learning_rate": 1.5402676875220847e-05, "loss": 0.0924, "step": 2700 }, { "epoch": 0.6788380701424179, "grad_norm": 0.3109644651412964, "learning_rate": 1.5385561502276813e-05, "loss": 0.085, "step": 2705 }, { "epoch": 0.6800928540058975, "grad_norm": 0.13988961279392242, "learning_rate": 1.536842387900733e-05, "loss": 0.0806, "step": 2710 }, { "epoch": 0.681347637869377, "grad_norm": 0.4081459045410156, "learning_rate": 1.5351264076216114e-05, "loss": 0.0912, "step": 2715 }, { "epoch": 0.6826024217328566, "grad_norm": 0.23864784836769104, "learning_rate": 1.533408216479849e-05, "loss": 0.088, "step": 2720 }, { "epoch": 0.683857205596336, "grad_norm": 0.24918711185455322, "learning_rate": 1.531687821574114e-05, "loss": 0.0904, "step": 2725 }, { "epoch": 0.6851119894598156, "grad_norm": 0.23600496351718903, "learning_rate": 1.5299652300121792e-05, "loss": 0.0944, "step": 2730 }, { "epoch": 0.6863667733232951, "grad_norm": 0.1220758855342865, "learning_rate": 1.5282404489108925e-05, "loss": 0.0811, "step": 2735 }, { "epoch": 0.6876215571867745, "grad_norm": 0.1529039442539215, "learning_rate": 1.5265134853961477e-05, "loss": 0.0985, "step": 2740 }, { "epoch": 0.6888763410502541, "grad_norm": 0.3052196502685547, "learning_rate": 1.524784346602856e-05, "loss": 0.0917, "step": 2745 }, { "epoch": 0.6901311249137336, "grad_norm": 0.3731965720653534, "learning_rate": 1.5230530396749148e-05, "loss": 0.0906, "step": 2750 }, { "epoch": 0.6913859087772132, "grad_norm": 0.09618979692459106, "learning_rate": 1.5213195717651793e-05, "loss": 0.0914, "step": 2755 }, { "epoch": 0.6926406926406926, "grad_norm": 0.3307301104068756, "learning_rate": 1.5195839500354337e-05, "loss": 0.0859, "step": 2760 }, { "epoch": 0.6938954765041722, "grad_norm": 0.12252155691385269, "learning_rate": 1.5178461816563594e-05, "loss": 0.0881, "step": 2765 }, { "epoch": 0.6951502603676517, "grad_norm": 0.2949330806732178, "learning_rate": 1.5161062738075068e-05, "loss": 0.0782, "step": 2770 }, { "epoch": 0.6964050442311311, "grad_norm": 0.3150964379310608, "learning_rate": 1.5143642336772663e-05, "loss": 0.0866, "step": 2775 }, { "epoch": 0.6976598280946107, "grad_norm": 0.44747477769851685, "learning_rate": 1.5126200684628372e-05, "loss": 0.0943, "step": 2780 }, { "epoch": 0.6989146119580902, "grad_norm": 0.1062508150935173, "learning_rate": 1.5108737853701981e-05, "loss": 0.1027, "step": 2785 }, { "epoch": 0.7001693958215698, "grad_norm": 0.16572189331054688, "learning_rate": 1.5091253916140789e-05, "loss": 0.0892, "step": 2790 }, { "epoch": 0.7014241796850492, "grad_norm": 0.1096203476190567, "learning_rate": 1.5073748944179282e-05, "loss": 0.0889, "step": 2795 }, { "epoch": 0.7026789635485288, "grad_norm": 0.3676256835460663, "learning_rate": 1.5056223010138857e-05, "loss": 0.0998, "step": 2800 }, { "epoch": 0.7039337474120083, "grad_norm": 0.2482236921787262, "learning_rate": 1.5038676186427515e-05, "loss": 0.089, "step": 2805 }, { "epoch": 0.7051885312754878, "grad_norm": 0.13800540566444397, "learning_rate": 1.5021108545539562e-05, "loss": 0.0923, "step": 2810 }, { "epoch": 0.7064433151389673, "grad_norm": 0.33005279302597046, "learning_rate": 1.5003520160055303e-05, "loss": 0.0894, "step": 2815 }, { "epoch": 0.7076980990024468, "grad_norm": 0.34894976019859314, "learning_rate": 1.4985911102640762e-05, "loss": 0.0975, "step": 2820 }, { "epoch": 0.7089528828659264, "grad_norm": 0.2921280860900879, "learning_rate": 1.4968281446047357e-05, "loss": 0.089, "step": 2825 }, { "epoch": 0.7102076667294058, "grad_norm": 0.3317979872226715, "learning_rate": 1.4950631263111615e-05, "loss": 0.0889, "step": 2830 }, { "epoch": 0.7114624505928854, "grad_norm": 0.16637681424617767, "learning_rate": 1.4932960626754867e-05, "loss": 0.0788, "step": 2835 }, { "epoch": 0.7127172344563649, "grad_norm": 0.15462718904018402, "learning_rate": 1.491526960998295e-05, "loss": 0.0887, "step": 2840 }, { "epoch": 0.7139720183198444, "grad_norm": 0.23913145065307617, "learning_rate": 1.4897558285885896e-05, "loss": 0.0849, "step": 2845 }, { "epoch": 0.7152268021833239, "grad_norm": 0.44012489914894104, "learning_rate": 1.487982672763764e-05, "loss": 0.0802, "step": 2850 }, { "epoch": 0.7164815860468035, "grad_norm": 0.16587427258491516, "learning_rate": 1.4862075008495718e-05, "loss": 0.0913, "step": 2855 }, { "epoch": 0.717736369910283, "grad_norm": 0.4225013256072998, "learning_rate": 1.4844303201800949e-05, "loss": 0.0828, "step": 2860 }, { "epoch": 0.7189911537737624, "grad_norm": 0.3589664399623871, "learning_rate": 1.4826511380977155e-05, "loss": 0.0861, "step": 2865 }, { "epoch": 0.720245937637242, "grad_norm": 0.13328975439071655, "learning_rate": 1.4808699619530841e-05, "loss": 0.0897, "step": 2870 }, { "epoch": 0.7215007215007215, "grad_norm": 0.5425406098365784, "learning_rate": 1.479086799105089e-05, "loss": 0.0789, "step": 2875 }, { "epoch": 0.722755505364201, "grad_norm": 0.22545233368873596, "learning_rate": 1.4773016569208283e-05, "loss": 0.0814, "step": 2880 }, { "epoch": 0.7240102892276805, "grad_norm": 0.2654625177383423, "learning_rate": 1.4755145427755755e-05, "loss": 0.0846, "step": 2885 }, { "epoch": 0.7252650730911601, "grad_norm": 0.4927530288696289, "learning_rate": 1.4737254640527525e-05, "loss": 0.0892, "step": 2890 }, { "epoch": 0.7265198569546396, "grad_norm": 0.317210853099823, "learning_rate": 1.4719344281438977e-05, "loss": 0.0858, "step": 2895 }, { "epoch": 0.727774640818119, "grad_norm": 0.3418554961681366, "learning_rate": 1.4701414424486353e-05, "loss": 0.0859, "step": 2900 }, { "epoch": 0.7290294246815986, "grad_norm": 0.21974579989910126, "learning_rate": 1.4683465143746452e-05, "loss": 0.0988, "step": 2905 }, { "epoch": 0.7302842085450781, "grad_norm": 0.35090965032577515, "learning_rate": 1.466549651337632e-05, "loss": 0.0909, "step": 2910 }, { "epoch": 0.7315389924085576, "grad_norm": 0.27880313992500305, "learning_rate": 1.4647508607612952e-05, "loss": 0.0907, "step": 2915 }, { "epoch": 0.7327937762720371, "grad_norm": 0.12414207309484482, "learning_rate": 1.4629501500772962e-05, "loss": 0.0912, "step": 2920 }, { "epoch": 0.7340485601355167, "grad_norm": 0.3848329782485962, "learning_rate": 1.4611475267252318e-05, "loss": 0.0813, "step": 2925 }, { "epoch": 0.7353033439989962, "grad_norm": 0.07426943629980087, "learning_rate": 1.4593429981525985e-05, "loss": 0.0862, "step": 2930 }, { "epoch": 0.7365581278624757, "grad_norm": 0.1772748976945877, "learning_rate": 1.4575365718147655e-05, "loss": 0.111, "step": 2935 }, { "epoch": 0.7378129117259552, "grad_norm": 0.23796626925468445, "learning_rate": 1.4557282551749428e-05, "loss": 0.0852, "step": 2940 }, { "epoch": 0.7390676955894347, "grad_norm": 0.22509251534938812, "learning_rate": 1.4539180557041494e-05, "loss": 0.0804, "step": 2945 }, { "epoch": 0.7403224794529142, "grad_norm": 0.29270699620246887, "learning_rate": 1.452105980881183e-05, "loss": 0.0925, "step": 2950 }, { "epoch": 0.7415772633163937, "grad_norm": 0.27449578046798706, "learning_rate": 1.4502920381925905e-05, "loss": 0.084, "step": 2955 }, { "epoch": 0.7428320471798733, "grad_norm": 0.23798063397407532, "learning_rate": 1.4484762351326344e-05, "loss": 0.0894, "step": 2960 }, { "epoch": 0.7440868310433528, "grad_norm": 0.1779652088880539, "learning_rate": 1.4466585792032644e-05, "loss": 0.0829, "step": 2965 }, { "epoch": 0.7453416149068323, "grad_norm": 0.10709716379642487, "learning_rate": 1.4448390779140844e-05, "loss": 0.0889, "step": 2970 }, { "epoch": 0.7465963987703118, "grad_norm": 0.3699777126312256, "learning_rate": 1.4430177387823232e-05, "loss": 0.0925, "step": 2975 }, { "epoch": 0.7478511826337914, "grad_norm": 0.23383845388889313, "learning_rate": 1.4411945693328017e-05, "loss": 0.0802, "step": 2980 }, { "epoch": 0.7491059664972708, "grad_norm": 0.17309200763702393, "learning_rate": 1.4393695770979038e-05, "loss": 0.092, "step": 2985 }, { "epoch": 0.7503607503607503, "grad_norm": 0.08418245613574982, "learning_rate": 1.4375427696175434e-05, "loss": 0.088, "step": 2990 }, { "epoch": 0.7516155342242299, "grad_norm": 0.4208645820617676, "learning_rate": 1.4357141544391342e-05, "loss": 0.0946, "step": 2995 }, { "epoch": 0.7528703180877094, "grad_norm": 0.10623873025178909, "learning_rate": 1.4338837391175582e-05, "loss": 0.0876, "step": 3000 }, { "epoch": 0.7541251019511889, "grad_norm": 0.13417641818523407, "learning_rate": 1.4320515312151352e-05, "loss": 0.0853, "step": 3005 }, { "epoch": 0.7553798858146684, "grad_norm": 0.11833745986223221, "learning_rate": 1.4302175383015907e-05, "loss": 0.0923, "step": 3010 }, { "epoch": 0.756634669678148, "grad_norm": 0.13997751474380493, "learning_rate": 1.4283817679540246e-05, "loss": 0.0842, "step": 3015 }, { "epoch": 0.7578894535416274, "grad_norm": 0.4091770648956299, "learning_rate": 1.4265442277568808e-05, "loss": 0.0869, "step": 3020 }, { "epoch": 0.759144237405107, "grad_norm": 0.38319870829582214, "learning_rate": 1.4247049253019148e-05, "loss": 0.0816, "step": 3025 }, { "epoch": 0.7603990212685865, "grad_norm": 0.44087597727775574, "learning_rate": 1.4228638681881633e-05, "loss": 0.0925, "step": 3030 }, { "epoch": 0.761653805132066, "grad_norm": 0.10818012803792953, "learning_rate": 1.4210210640219117e-05, "loss": 0.0873, "step": 3035 }, { "epoch": 0.7629085889955455, "grad_norm": 0.3094254434108734, "learning_rate": 1.4191765204166643e-05, "loss": 0.0881, "step": 3040 }, { "epoch": 0.764163372859025, "grad_norm": 0.14086200296878815, "learning_rate": 1.4173302449931107e-05, "loss": 0.0874, "step": 3045 }, { "epoch": 0.7654181567225046, "grad_norm": 0.10829006880521774, "learning_rate": 1.4154822453790963e-05, "loss": 0.0813, "step": 3050 }, { "epoch": 0.7666729405859841, "grad_norm": 0.21264980733394623, "learning_rate": 1.4136325292095899e-05, "loss": 0.0823, "step": 3055 }, { "epoch": 0.7679277244494636, "grad_norm": 0.07161971926689148, "learning_rate": 1.4117811041266518e-05, "loss": 0.0878, "step": 3060 }, { "epoch": 0.7691825083129431, "grad_norm": 0.24035820364952087, "learning_rate": 1.4099279777794026e-05, "loss": 0.0811, "step": 3065 }, { "epoch": 0.7704372921764227, "grad_norm": 0.17685869336128235, "learning_rate": 1.4080731578239917e-05, "loss": 0.0854, "step": 3070 }, { "epoch": 0.7716920760399021, "grad_norm": 0.33490845561027527, "learning_rate": 1.4062166519235665e-05, "loss": 0.0875, "step": 3075 }, { "epoch": 0.7729468599033816, "grad_norm": 0.24075230956077576, "learning_rate": 1.4043584677482383e-05, "loss": 0.0942, "step": 3080 }, { "epoch": 0.7742016437668612, "grad_norm": 0.16541673243045807, "learning_rate": 1.4024986129750535e-05, "loss": 0.0924, "step": 3085 }, { "epoch": 0.7754564276303407, "grad_norm": 0.36374107003211975, "learning_rate": 1.40063709528796e-05, "loss": 0.0959, "step": 3090 }, { "epoch": 0.7767112114938202, "grad_norm": 0.19677504897117615, "learning_rate": 1.3987739223777756e-05, "loss": 0.0841, "step": 3095 }, { "epoch": 0.7779659953572997, "grad_norm": 0.24688193202018738, "learning_rate": 1.3969091019421573e-05, "loss": 0.0795, "step": 3100 }, { "epoch": 0.7792207792207793, "grad_norm": 0.4386122524738312, "learning_rate": 1.3950426416855685e-05, "loss": 0.091, "step": 3105 }, { "epoch": 0.7804755630842587, "grad_norm": 0.281536340713501, "learning_rate": 1.3931745493192473e-05, "loss": 0.0809, "step": 3110 }, { "epoch": 0.7817303469477382, "grad_norm": 0.3444419801235199, "learning_rate": 1.391304832561175e-05, "loss": 0.0819, "step": 3115 }, { "epoch": 0.7829851308112178, "grad_norm": 0.3660227060317993, "learning_rate": 1.3894334991360448e-05, "loss": 0.0908, "step": 3120 }, { "epoch": 0.7842399146746973, "grad_norm": 0.15960213541984558, "learning_rate": 1.3875605567752275e-05, "loss": 0.0817, "step": 3125 }, { "epoch": 0.7854946985381768, "grad_norm": 0.10506850481033325, "learning_rate": 1.3856860132167423e-05, "loss": 0.0867, "step": 3130 }, { "epoch": 0.7867494824016563, "grad_norm": 0.28331154584884644, "learning_rate": 1.3838098762052237e-05, "loss": 0.0899, "step": 3135 }, { "epoch": 0.7880042662651359, "grad_norm": 0.07872291654348373, "learning_rate": 1.381932153491889e-05, "loss": 0.0864, "step": 3140 }, { "epoch": 0.7892590501286153, "grad_norm": 0.26927658915519714, "learning_rate": 1.3800528528345074e-05, "loss": 0.0928, "step": 3145 }, { "epoch": 0.7905138339920948, "grad_norm": 0.4041297733783722, "learning_rate": 1.378171981997367e-05, "loss": 0.0891, "step": 3150 }, { "epoch": 0.7917686178555744, "grad_norm": 0.29741108417510986, "learning_rate": 1.3762895487512426e-05, "loss": 0.0827, "step": 3155 }, { "epoch": 0.7930234017190539, "grad_norm": 0.11423831433057785, "learning_rate": 1.3744055608733654e-05, "loss": 0.0853, "step": 3160 }, { "epoch": 0.7942781855825334, "grad_norm": 0.07605528086423874, "learning_rate": 1.3725200261473879e-05, "loss": 0.0958, "step": 3165 }, { "epoch": 0.7955329694460129, "grad_norm": 0.27038949728012085, "learning_rate": 1.3706329523633546e-05, "loss": 0.0866, "step": 3170 }, { "epoch": 0.7967877533094925, "grad_norm": 0.17523351311683655, "learning_rate": 1.3687443473176678e-05, "loss": 0.092, "step": 3175 }, { "epoch": 0.7980425371729719, "grad_norm": 0.14444954693317413, "learning_rate": 1.3668542188130567e-05, "loss": 0.0937, "step": 3180 }, { "epoch": 0.7992973210364515, "grad_norm": 0.09110607951879501, "learning_rate": 1.3649625746585442e-05, "loss": 0.0894, "step": 3185 }, { "epoch": 0.800552104899931, "grad_norm": 0.24283112585544586, "learning_rate": 1.3630694226694159e-05, "loss": 0.0879, "step": 3190 }, { "epoch": 0.8018068887634106, "grad_norm": 0.2000175565481186, "learning_rate": 1.3611747706671859e-05, "loss": 0.0809, "step": 3195 }, { "epoch": 0.80306167262689, "grad_norm": 0.13179725408554077, "learning_rate": 1.3592786264795659e-05, "loss": 0.0792, "step": 3200 }, { "epoch": 0.8043164564903695, "grad_norm": 0.17673130333423615, "learning_rate": 1.357380997940433e-05, "loss": 0.0819, "step": 3205 }, { "epoch": 0.8055712403538491, "grad_norm": 0.1775507628917694, "learning_rate": 1.3554818928897965e-05, "loss": 0.094, "step": 3210 }, { "epoch": 0.8068260242173285, "grad_norm": 0.10443057119846344, "learning_rate": 1.3535813191737663e-05, "loss": 0.0921, "step": 3215 }, { "epoch": 0.8080808080808081, "grad_norm": 0.45298513770103455, "learning_rate": 1.351679284644519e-05, "loss": 0.092, "step": 3220 }, { "epoch": 0.8093355919442876, "grad_norm": 0.1088099479675293, "learning_rate": 1.3497757971602677e-05, "loss": 0.0945, "step": 3225 }, { "epoch": 0.8105903758077672, "grad_norm": 0.11513658612966537, "learning_rate": 1.3478708645852272e-05, "loss": 0.0906, "step": 3230 }, { "epoch": 0.8118451596712466, "grad_norm": 0.15573541820049286, "learning_rate": 1.3459644947895844e-05, "loss": 0.0895, "step": 3235 }, { "epoch": 0.8130999435347261, "grad_norm": 0.548417866230011, "learning_rate": 1.344056695649462e-05, "loss": 0.0957, "step": 3240 }, { "epoch": 0.8143547273982057, "grad_norm": 0.24571377038955688, "learning_rate": 1.3421474750468893e-05, "loss": 0.0852, "step": 3245 }, { "epoch": 0.8156095112616851, "grad_norm": 0.1693846732378006, "learning_rate": 1.3402368408697681e-05, "loss": 0.0962, "step": 3250 }, { "epoch": 0.8168642951251647, "grad_norm": 0.12856455147266388, "learning_rate": 1.3383248010118404e-05, "loss": 0.0926, "step": 3255 }, { "epoch": 0.8181190789886442, "grad_norm": 0.3115483820438385, "learning_rate": 1.336411363372655e-05, "loss": 0.0943, "step": 3260 }, { "epoch": 0.8193738628521238, "grad_norm": 0.44211283326148987, "learning_rate": 1.3344965358575368e-05, "loss": 0.102, "step": 3265 }, { "epoch": 0.8206286467156032, "grad_norm": 0.1086539626121521, "learning_rate": 1.3325803263775521e-05, "loss": 0.0968, "step": 3270 }, { "epoch": 0.8218834305790828, "grad_norm": 0.24857747554779053, "learning_rate": 1.3306627428494769e-05, "loss": 0.097, "step": 3275 }, { "epoch": 0.8231382144425623, "grad_norm": 0.5873998403549194, "learning_rate": 1.3287437931957642e-05, "loss": 0.0848, "step": 3280 }, { "epoch": 0.8243929983060417, "grad_norm": 0.2507260739803314, "learning_rate": 1.3268234853445113e-05, "loss": 0.0917, "step": 3285 }, { "epoch": 0.8256477821695213, "grad_norm": 0.2204979509115219, "learning_rate": 1.3249018272294261e-05, "loss": 0.083, "step": 3290 }, { "epoch": 0.8269025660330008, "grad_norm": 0.19563211500644684, "learning_rate": 1.3229788267897958e-05, "loss": 0.0803, "step": 3295 }, { "epoch": 0.8281573498964804, "grad_norm": 0.24326251447200775, "learning_rate": 1.3210544919704539e-05, "loss": 0.078, "step": 3300 }, { "epoch": 0.8294121337599598, "grad_norm": 0.08204273879528046, "learning_rate": 1.319128830721745e-05, "loss": 0.0866, "step": 3305 }, { "epoch": 0.8306669176234394, "grad_norm": 0.3777156174182892, "learning_rate": 1.317201850999496e-05, "loss": 0.0892, "step": 3310 }, { "epoch": 0.8319217014869189, "grad_norm": 0.15763606131076813, "learning_rate": 1.315273560764979e-05, "loss": 0.0947, "step": 3315 }, { "epoch": 0.8331764853503983, "grad_norm": 0.4323311448097229, "learning_rate": 1.3133439679848824e-05, "loss": 0.091, "step": 3320 }, { "epoch": 0.8344312692138779, "grad_norm": 0.51838618516922, "learning_rate": 1.3114130806312744e-05, "loss": 0.0783, "step": 3325 }, { "epoch": 0.8356860530773574, "grad_norm": 0.2101561725139618, "learning_rate": 1.3094809066815731e-05, "loss": 0.0797, "step": 3330 }, { "epoch": 0.836940836940837, "grad_norm": 0.35820868611335754, "learning_rate": 1.3075474541185104e-05, "loss": 0.0904, "step": 3335 }, { "epoch": 0.8381956208043164, "grad_norm": 0.17395086586475372, "learning_rate": 1.3056127309301027e-05, "loss": 0.0773, "step": 3340 }, { "epoch": 0.839450404667796, "grad_norm": 0.22628264129161835, "learning_rate": 1.3036767451096148e-05, "loss": 0.0833, "step": 3345 }, { "epoch": 0.8407051885312755, "grad_norm": 0.14669981598854065, "learning_rate": 1.3017395046555284e-05, "loss": 0.0866, "step": 3350 }, { "epoch": 0.841959972394755, "grad_norm": 0.45152294635772705, "learning_rate": 1.2998010175715081e-05, "loss": 0.0955, "step": 3355 }, { "epoch": 0.8432147562582345, "grad_norm": 0.14131848514080048, "learning_rate": 1.2978612918663702e-05, "loss": 0.0805, "step": 3360 }, { "epoch": 0.844469540121714, "grad_norm": 0.4170994460582733, "learning_rate": 1.2959203355540466e-05, "loss": 0.0885, "step": 3365 }, { "epoch": 0.8457243239851936, "grad_norm": 0.2687641382217407, "learning_rate": 1.2939781566535551e-05, "loss": 0.0859, "step": 3370 }, { "epoch": 0.846979107848673, "grad_norm": 0.15929897129535675, "learning_rate": 1.2920347631889637e-05, "loss": 0.0876, "step": 3375 }, { "epoch": 0.8482338917121526, "grad_norm": 0.1964198797941208, "learning_rate": 1.2900901631893585e-05, "loss": 0.0966, "step": 3380 }, { "epoch": 0.8494886755756321, "grad_norm": 0.21609993278980255, "learning_rate": 1.28814436468881e-05, "loss": 0.0811, "step": 3385 }, { "epoch": 0.8507434594391116, "grad_norm": 0.24047359824180603, "learning_rate": 1.2861973757263416e-05, "loss": 0.0858, "step": 3390 }, { "epoch": 0.8519982433025911, "grad_norm": 0.25712600350379944, "learning_rate": 1.2842492043458929e-05, "loss": 0.0793, "step": 3395 }, { "epoch": 0.8532530271660707, "grad_norm": 0.1645137518644333, "learning_rate": 1.2822998585962909e-05, "loss": 0.0801, "step": 3400 }, { "epoch": 0.8545078110295502, "grad_norm": 0.601332426071167, "learning_rate": 1.280349346531213e-05, "loss": 0.0891, "step": 3405 }, { "epoch": 0.8557625948930296, "grad_norm": 0.3988315761089325, "learning_rate": 1.2783976762091554e-05, "loss": 0.0885, "step": 3410 }, { "epoch": 0.8570173787565092, "grad_norm": 0.37066715955734253, "learning_rate": 1.2764448556934001e-05, "loss": 0.0935, "step": 3415 }, { "epoch": 0.8582721626199887, "grad_norm": 0.1261286735534668, "learning_rate": 1.274490893051981e-05, "loss": 0.0824, "step": 3420 }, { "epoch": 0.8595269464834682, "grad_norm": 0.11686058342456818, "learning_rate": 1.2725357963576506e-05, "loss": 0.0963, "step": 3425 }, { "epoch": 0.8607817303469477, "grad_norm": 0.1499367356300354, "learning_rate": 1.2705795736878461e-05, "loss": 0.0899, "step": 3430 }, { "epoch": 0.8620365142104273, "grad_norm": 0.25637149810791016, "learning_rate": 1.268622233124658e-05, "loss": 0.09, "step": 3435 }, { "epoch": 0.8632912980739068, "grad_norm": 0.2606950104236603, "learning_rate": 1.2666637827547935e-05, "loss": 0.0928, "step": 3440 }, { "epoch": 0.8645460819373862, "grad_norm": 0.17637059092521667, "learning_rate": 1.264704230669547e-05, "loss": 0.0999, "step": 3445 }, { "epoch": 0.8658008658008658, "grad_norm": 0.21137331426143646, "learning_rate": 1.2627435849647629e-05, "loss": 0.0871, "step": 3450 }, { "epoch": 0.8670556496643453, "grad_norm": 0.40598878264427185, "learning_rate": 1.2607818537408047e-05, "loss": 0.0821, "step": 3455 }, { "epoch": 0.8683104335278249, "grad_norm": 0.11776316165924072, "learning_rate": 1.2588190451025209e-05, "loss": 0.0987, "step": 3460 }, { "epoch": 0.8695652173913043, "grad_norm": 0.11056658625602722, "learning_rate": 1.2568551671592106e-05, "loss": 0.0863, "step": 3465 }, { "epoch": 0.8708200012547839, "grad_norm": 0.29835495352745056, "learning_rate": 1.2548902280245909e-05, "loss": 0.0876, "step": 3470 }, { "epoch": 0.8720747851182634, "grad_norm": 0.3364979922771454, "learning_rate": 1.252924235816764e-05, "loss": 0.0788, "step": 3475 }, { "epoch": 0.8733295689817429, "grad_norm": 0.1180335134267807, "learning_rate": 1.2509571986581814e-05, "loss": 0.0799, "step": 3480 }, { "epoch": 0.8745843528452224, "grad_norm": 0.2995622754096985, "learning_rate": 1.2489891246756131e-05, "loss": 0.0964, "step": 3485 }, { "epoch": 0.875839136708702, "grad_norm": 0.23864442110061646, "learning_rate": 1.2470200220001122e-05, "loss": 0.0982, "step": 3490 }, { "epoch": 0.8770939205721815, "grad_norm": 0.6277647018432617, "learning_rate": 1.245049898766982e-05, "loss": 0.0722, "step": 3495 }, { "epoch": 0.8783487044356609, "grad_norm": 0.4860200881958008, "learning_rate": 1.2430787631157414e-05, "loss": 0.0952, "step": 3500 }, { "epoch": 0.8796034882991405, "grad_norm": 0.30421602725982666, "learning_rate": 1.2411066231900935e-05, "loss": 0.0901, "step": 3505 }, { "epoch": 0.88085827216262, "grad_norm": 0.20121634006500244, "learning_rate": 1.239133487137889e-05, "loss": 0.0833, "step": 3510 }, { "epoch": 0.8821130560260995, "grad_norm": 0.3852861225605011, "learning_rate": 1.2371593631110953e-05, "loss": 0.0862, "step": 3515 }, { "epoch": 0.883367839889579, "grad_norm": 0.45018061995506287, "learning_rate": 1.2351842592657612e-05, "loss": 0.098, "step": 3520 }, { "epoch": 0.8846226237530586, "grad_norm": 0.40172022581100464, "learning_rate": 1.2332081837619836e-05, "loss": 0.0899, "step": 3525 }, { "epoch": 0.8858774076165381, "grad_norm": 0.2906516492366791, "learning_rate": 1.2312311447638731e-05, "loss": 0.0852, "step": 3530 }, { "epoch": 0.8871321914800175, "grad_norm": 0.39876627922058105, "learning_rate": 1.2292531504395223e-05, "loss": 0.0938, "step": 3535 }, { "epoch": 0.8883869753434971, "grad_norm": 0.1180957704782486, "learning_rate": 1.2272742089609694e-05, "loss": 0.0862, "step": 3540 }, { "epoch": 0.8896417592069766, "grad_norm": 0.1698416918516159, "learning_rate": 1.2252943285041662e-05, "loss": 0.0873, "step": 3545 }, { "epoch": 0.8908965430704561, "grad_norm": 0.3837547302246094, "learning_rate": 1.2233135172489453e-05, "loss": 0.0888, "step": 3550 }, { "epoch": 0.8921513269339356, "grad_norm": 0.09981489181518555, "learning_rate": 1.221331783378982e-05, "loss": 0.0802, "step": 3555 }, { "epoch": 0.8934061107974152, "grad_norm": 0.21812370419502258, "learning_rate": 1.2193491350817657e-05, "loss": 0.0749, "step": 3560 }, { "epoch": 0.8946608946608947, "grad_norm": 0.6983737349510193, "learning_rate": 1.2173655805485627e-05, "loss": 0.0881, "step": 3565 }, { "epoch": 0.8959156785243741, "grad_norm": 0.3287610411643982, "learning_rate": 1.2153811279743841e-05, "loss": 0.0859, "step": 3570 }, { "epoch": 0.8971704623878537, "grad_norm": 0.3805353045463562, "learning_rate": 1.2133957855579501e-05, "loss": 0.0874, "step": 3575 }, { "epoch": 0.8984252462513332, "grad_norm": 0.3398018181324005, "learning_rate": 1.2114095615016585e-05, "loss": 0.0892, "step": 3580 }, { "epoch": 0.8996800301148127, "grad_norm": 0.2726369798183441, "learning_rate": 1.2094224640115488e-05, "loss": 0.0846, "step": 3585 }, { "epoch": 0.9009348139782922, "grad_norm": 0.2575783133506775, "learning_rate": 1.2074345012972694e-05, "loss": 0.0863, "step": 3590 }, { "epoch": 0.9021895978417718, "grad_norm": 0.18294771015644073, "learning_rate": 1.2054456815720432e-05, "loss": 0.0792, "step": 3595 }, { "epoch": 0.9034443817052513, "grad_norm": 0.2737415134906769, "learning_rate": 1.2034560130526341e-05, "loss": 0.0811, "step": 3600 }, { "epoch": 0.9046991655687308, "grad_norm": 0.22107063233852386, "learning_rate": 1.2014655039593119e-05, "loss": 0.0937, "step": 3605 }, { "epoch": 0.9059539494322103, "grad_norm": 0.37395817041397095, "learning_rate": 1.1994741625158206e-05, "loss": 0.0813, "step": 3610 }, { "epoch": 0.9072087332956899, "grad_norm": 0.3313354253768921, "learning_rate": 1.1974819969493421e-05, "loss": 0.0789, "step": 3615 }, { "epoch": 0.9084635171591693, "grad_norm": 0.3608105182647705, "learning_rate": 1.195489015490463e-05, "loss": 0.0859, "step": 3620 }, { "epoch": 0.9097183010226488, "grad_norm": 0.37519270181655884, "learning_rate": 1.1934952263731411e-05, "loss": 0.0919, "step": 3625 }, { "epoch": 0.9109730848861284, "grad_norm": 0.5066865086555481, "learning_rate": 1.1915006378346719e-05, "loss": 0.1, "step": 3630 }, { "epoch": 0.9122278687496079, "grad_norm": 0.22127996385097504, "learning_rate": 1.1895052581156516e-05, "loss": 0.0854, "step": 3635 }, { "epoch": 0.9134826526130874, "grad_norm": 0.18663233518600464, "learning_rate": 1.1875090954599472e-05, "loss": 0.0962, "step": 3640 }, { "epoch": 0.9147374364765669, "grad_norm": 0.5338375568389893, "learning_rate": 1.1855121581146591e-05, "loss": 0.086, "step": 3645 }, { "epoch": 0.9159922203400465, "grad_norm": 0.10081567615270615, "learning_rate": 1.183514454330089e-05, "loss": 0.0863, "step": 3650 }, { "epoch": 0.9172470042035259, "grad_norm": 0.09096982330083847, "learning_rate": 1.1815159923597044e-05, "loss": 0.084, "step": 3655 }, { "epoch": 0.9185017880670054, "grad_norm": 0.2180848866701126, "learning_rate": 1.1795167804601062e-05, "loss": 0.0916, "step": 3660 }, { "epoch": 0.919756571930485, "grad_norm": 0.2790067791938782, "learning_rate": 1.177516826890993e-05, "loss": 0.0914, "step": 3665 }, { "epoch": 0.9210113557939645, "grad_norm": 0.26915568113327026, "learning_rate": 1.1755161399151277e-05, "loss": 0.0887, "step": 3670 }, { "epoch": 0.922266139657444, "grad_norm": 0.050155360251665115, "learning_rate": 1.1735147277983027e-05, "loss": 0.0957, "step": 3675 }, { "epoch": 0.9235209235209235, "grad_norm": 0.07628196477890015, "learning_rate": 1.1715125988093075e-05, "loss": 0.0895, "step": 3680 }, { "epoch": 0.9247757073844031, "grad_norm": 0.23782391846179962, "learning_rate": 1.1695097612198929e-05, "loss": 0.0878, "step": 3685 }, { "epoch": 0.9260304912478825, "grad_norm": 0.28425681591033936, "learning_rate": 1.1675062233047365e-05, "loss": 0.0928, "step": 3690 }, { "epoch": 0.927285275111362, "grad_norm": 0.36293289065361023, "learning_rate": 1.16550199334141e-05, "loss": 0.0976, "step": 3695 }, { "epoch": 0.9285400589748416, "grad_norm": 0.5268048048019409, "learning_rate": 1.1634970796103442e-05, "loss": 0.0995, "step": 3700 }, { "epoch": 0.9297948428383211, "grad_norm": 0.18220224976539612, "learning_rate": 1.1614914903947952e-05, "loss": 0.0895, "step": 3705 }, { "epoch": 0.9310496267018006, "grad_norm": 0.18643346428871155, "learning_rate": 1.1594852339808082e-05, "loss": 0.0977, "step": 3710 }, { "epoch": 0.9323044105652801, "grad_norm": 0.1516513228416443, "learning_rate": 1.1574783186571876e-05, "loss": 0.0947, "step": 3715 }, { "epoch": 0.9335591944287597, "grad_norm": 0.27426013350486755, "learning_rate": 1.155470752715458e-05, "loss": 0.0885, "step": 3720 }, { "epoch": 0.9348139782922391, "grad_norm": 0.2292599081993103, "learning_rate": 1.1534625444498325e-05, "loss": 0.0774, "step": 3725 }, { "epoch": 0.9360687621557187, "grad_norm": 0.2502307593822479, "learning_rate": 1.1514537021571784e-05, "loss": 0.0816, "step": 3730 }, { "epoch": 0.9373235460191982, "grad_norm": 0.1261693835258484, "learning_rate": 1.1494442341369819e-05, "loss": 0.0859, "step": 3735 }, { "epoch": 0.9385783298826778, "grad_norm": 0.18473610281944275, "learning_rate": 1.1474341486913146e-05, "loss": 0.0804, "step": 3740 }, { "epoch": 0.9398331137461572, "grad_norm": 0.1409272700548172, "learning_rate": 1.1454234541247995e-05, "loss": 0.0985, "step": 3745 }, { "epoch": 0.9410878976096367, "grad_norm": 0.32273778319358826, "learning_rate": 1.1434121587445752e-05, "loss": 0.0901, "step": 3750 }, { "epoch": 0.9423426814731163, "grad_norm": 0.12227199226617813, "learning_rate": 1.1414002708602632e-05, "loss": 0.0928, "step": 3755 }, { "epoch": 0.9435974653365957, "grad_norm": 0.2481284737586975, "learning_rate": 1.1393877987839329e-05, "loss": 0.0853, "step": 3760 }, { "epoch": 0.9448522492000753, "grad_norm": 0.25302889943122864, "learning_rate": 1.1373747508300668e-05, "loss": 0.0729, "step": 3765 }, { "epoch": 0.9461070330635548, "grad_norm": 0.28753232955932617, "learning_rate": 1.1353611353155272e-05, "loss": 0.0916, "step": 3770 }, { "epoch": 0.9473618169270344, "grad_norm": 0.308005154132843, "learning_rate": 1.133346960559521e-05, "loss": 0.0896, "step": 3775 }, { "epoch": 0.9486166007905138, "grad_norm": 0.22970429062843323, "learning_rate": 1.1313322348835658e-05, "loss": 0.0871, "step": 3780 }, { "epoch": 0.9498713846539933, "grad_norm": 0.3094078600406647, "learning_rate": 1.1293169666114546e-05, "loss": 0.0929, "step": 3785 }, { "epoch": 0.9511261685174729, "grad_norm": 0.17183607816696167, "learning_rate": 1.127301164069223e-05, "loss": 0.0901, "step": 3790 }, { "epoch": 0.9523809523809523, "grad_norm": 0.3873693346977234, "learning_rate": 1.1252848355851136e-05, "loss": 0.0829, "step": 3795 }, { "epoch": 0.9536357362444319, "grad_norm": 0.42757073044776917, "learning_rate": 1.1232679894895417e-05, "loss": 0.0855, "step": 3800 }, { "epoch": 0.9548905201079114, "grad_norm": 0.49850013852119446, "learning_rate": 1.1212506341150615e-05, "loss": 0.0834, "step": 3805 }, { "epoch": 0.956145303971391, "grad_norm": 0.14661534130573273, "learning_rate": 1.1192327777963313e-05, "loss": 0.0798, "step": 3810 }, { "epoch": 0.9574000878348704, "grad_norm": 0.29823562502861023, "learning_rate": 1.117214428870078e-05, "loss": 0.0786, "step": 3815 }, { "epoch": 0.95865487169835, "grad_norm": 0.12722723186016083, "learning_rate": 1.1151955956750652e-05, "loss": 0.0807, "step": 3820 }, { "epoch": 0.9599096555618295, "grad_norm": 0.10096994042396545, "learning_rate": 1.1131762865520566e-05, "loss": 0.0847, "step": 3825 }, { "epoch": 0.9611644394253089, "grad_norm": 0.1767021268606186, "learning_rate": 1.1111565098437815e-05, "loss": 0.1014, "step": 3830 }, { "epoch": 0.9624192232887885, "grad_norm": 0.2014637440443039, "learning_rate": 1.1091362738949024e-05, "loss": 0.091, "step": 3835 }, { "epoch": 0.963674007152268, "grad_norm": 0.2271222472190857, "learning_rate": 1.1071155870519777e-05, "loss": 0.0887, "step": 3840 }, { "epoch": 0.9649287910157476, "grad_norm": 0.15430879592895508, "learning_rate": 1.1050944576634298e-05, "loss": 0.0842, "step": 3845 }, { "epoch": 0.966183574879227, "grad_norm": 0.14442376792430878, "learning_rate": 1.1030728940795087e-05, "loss": 0.0935, "step": 3850 }, { "epoch": 0.9674383587427066, "grad_norm": 0.1097768247127533, "learning_rate": 1.101050904652259e-05, "loss": 0.0842, "step": 3855 }, { "epoch": 0.9686931426061861, "grad_norm": 0.14131766557693481, "learning_rate": 1.0990284977354841e-05, "loss": 0.0848, "step": 3860 }, { "epoch": 0.9699479264696657, "grad_norm": 0.2724064588546753, "learning_rate": 1.097005681684712e-05, "loss": 0.0792, "step": 3865 }, { "epoch": 0.9712027103331451, "grad_norm": 0.15965338051319122, "learning_rate": 1.094982464857162e-05, "loss": 0.0699, "step": 3870 }, { "epoch": 0.9724574941966246, "grad_norm": 0.25648635625839233, "learning_rate": 1.0929588556117086e-05, "loss": 0.0869, "step": 3875 }, { "epoch": 0.9737122780601042, "grad_norm": 0.1974741518497467, "learning_rate": 1.0909348623088472e-05, "loss": 0.0814, "step": 3880 }, { "epoch": 0.9749670619235836, "grad_norm": 0.1373465359210968, "learning_rate": 1.0889104933106604e-05, "loss": 0.0868, "step": 3885 }, { "epoch": 0.9762218457870632, "grad_norm": 0.2142958790063858, "learning_rate": 1.0868857569807831e-05, "loss": 0.0888, "step": 3890 }, { "epoch": 0.9774766296505427, "grad_norm": 0.48555663228034973, "learning_rate": 1.0848606616843673e-05, "loss": 0.0839, "step": 3895 }, { "epoch": 0.9787314135140223, "grad_norm": 0.2687907814979553, "learning_rate": 1.0828352157880489e-05, "loss": 0.0737, "step": 3900 }, { "epoch": 0.9799861973775017, "grad_norm": 0.18711687624454498, "learning_rate": 1.0808094276599113e-05, "loss": 0.0958, "step": 3905 }, { "epoch": 0.9812409812409812, "grad_norm": 0.09380491077899933, "learning_rate": 1.0787833056694526e-05, "loss": 0.0839, "step": 3910 }, { "epoch": 0.9824957651044608, "grad_norm": 0.2556704878807068, "learning_rate": 1.0767568581875494e-05, "loss": 0.0801, "step": 3915 }, { "epoch": 0.9837505489679402, "grad_norm": 0.17845787107944489, "learning_rate": 1.0747300935864245e-05, "loss": 0.0906, "step": 3920 }, { "epoch": 0.9850053328314198, "grad_norm": 0.20369689166545868, "learning_rate": 1.0727030202396091e-05, "loss": 0.0813, "step": 3925 }, { "epoch": 0.9862601166948993, "grad_norm": 0.3633408546447754, "learning_rate": 1.0706756465219114e-05, "loss": 0.091, "step": 3930 }, { "epoch": 0.9875149005583789, "grad_norm": 0.14127115905284882, "learning_rate": 1.0686479808093798e-05, "loss": 0.1013, "step": 3935 }, { "epoch": 0.9887696844218583, "grad_norm": 0.43319714069366455, "learning_rate": 1.0666200314792695e-05, "loss": 0.0854, "step": 3940 }, { "epoch": 0.9900244682853379, "grad_norm": 0.5234352946281433, "learning_rate": 1.064591806910007e-05, "loss": 0.0799, "step": 3945 }, { "epoch": 0.9912792521488174, "grad_norm": 0.10436911880970001, "learning_rate": 1.062563315481156e-05, "loss": 0.0774, "step": 3950 }, { "epoch": 0.9925340360122968, "grad_norm": 0.18167732656002045, "learning_rate": 1.0605345655733839e-05, "loss": 0.0922, "step": 3955 }, { "epoch": 0.9937888198757764, "grad_norm": 0.08805394172668457, "learning_rate": 1.058505565568424e-05, "loss": 0.0908, "step": 3960 }, { "epoch": 0.9950436037392559, "grad_norm": 0.186781644821167, "learning_rate": 1.056476323849044e-05, "loss": 0.1014, "step": 3965 }, { "epoch": 0.9962983876027355, "grad_norm": 0.3317610025405884, "learning_rate": 1.0544468487990105e-05, "loss": 0.0959, "step": 3970 }, { "epoch": 0.9975531714662149, "grad_norm": 0.159254252910614, "learning_rate": 1.0524171488030537e-05, "loss": 0.0961, "step": 3975 }, { "epoch": 0.9988079553296945, "grad_norm": 0.5758376717567444, "learning_rate": 1.0503872322468331e-05, "loss": 0.0859, "step": 3980 }, { "epoch": 1.000062739193174, "grad_norm": 0.1497868299484253, "learning_rate": 1.048357107516903e-05, "loss": 0.0865, "step": 3985 }, { "epoch": 1.0013175230566536, "grad_norm": 0.3601112365722656, "learning_rate": 1.0463267830006779e-05, "loss": 0.0877, "step": 3990 }, { "epoch": 1.002572306920133, "grad_norm": 0.06848477572202682, "learning_rate": 1.0442962670863971e-05, "loss": 0.0809, "step": 3995 }, { "epoch": 1.0038270907836124, "grad_norm": 0.40660738945007324, "learning_rate": 1.0422655681630917e-05, "loss": 0.0925, "step": 4000 }, { "epoch": 1.005081874647092, "grad_norm": 0.12791000306606293, "learning_rate": 1.040234694620548e-05, "loss": 0.0877, "step": 4005 }, { "epoch": 1.0063366585105715, "grad_norm": 0.445340096950531, "learning_rate": 1.0382036548492743e-05, "loss": 0.0683, "step": 4010 }, { "epoch": 1.007591442374051, "grad_norm": 0.29247570037841797, "learning_rate": 1.0361724572404654e-05, "loss": 0.0869, "step": 4015 }, { "epoch": 1.0088462262375306, "grad_norm": 0.10048985481262207, "learning_rate": 1.034141110185968e-05, "loss": 0.072, "step": 4020 }, { "epoch": 1.0101010101010102, "grad_norm": 0.19421495497226715, "learning_rate": 1.0321096220782469e-05, "loss": 0.0859, "step": 4025 }, { "epoch": 1.0113557939644897, "grad_norm": 0.34872177243232727, "learning_rate": 1.0300780013103488e-05, "loss": 0.09, "step": 4030 }, { "epoch": 1.012610577827969, "grad_norm": 0.3325657844543457, "learning_rate": 1.028046256275869e-05, "loss": 0.097, "step": 4035 }, { "epoch": 1.0138653616914486, "grad_norm": 0.3407081365585327, "learning_rate": 1.0260143953689165e-05, "loss": 0.0872, "step": 4040 }, { "epoch": 1.0151201455549281, "grad_norm": 0.18198151886463165, "learning_rate": 1.0239824269840784e-05, "loss": 0.0884, "step": 4045 }, { "epoch": 1.0163749294184077, "grad_norm": 0.08795715123414993, "learning_rate": 1.0219503595163857e-05, "loss": 0.09, "step": 4050 }, { "epoch": 1.0176297132818872, "grad_norm": 0.07186929136514664, "learning_rate": 1.0199182013612797e-05, "loss": 0.0783, "step": 4055 }, { "epoch": 1.0188844971453668, "grad_norm": 0.36318737268447876, "learning_rate": 1.017885960914576e-05, "loss": 0.0852, "step": 4060 }, { "epoch": 1.0201392810088463, "grad_norm": 0.350274920463562, "learning_rate": 1.0158536465724291e-05, "loss": 0.0723, "step": 4065 }, { "epoch": 1.0213940648723256, "grad_norm": 0.3536963164806366, "learning_rate": 1.0138212667313003e-05, "loss": 0.0898, "step": 4070 }, { "epoch": 1.0226488487358052, "grad_norm": 0.37890106439590454, "learning_rate": 1.011788829787921e-05, "loss": 0.0871, "step": 4075 }, { "epoch": 1.0239036325992847, "grad_norm": 0.2589329183101654, "learning_rate": 1.0097563441392582e-05, "loss": 0.0964, "step": 4080 }, { "epoch": 1.0251584164627643, "grad_norm": 0.3065876066684723, "learning_rate": 1.0077238181824804e-05, "loss": 0.0933, "step": 4085 }, { "epoch": 1.0264132003262438, "grad_norm": 0.11984225362539291, "learning_rate": 1.0056912603149229e-05, "loss": 0.0845, "step": 4090 }, { "epoch": 1.0276679841897234, "grad_norm": 0.19489920139312744, "learning_rate": 1.0036586789340518e-05, "loss": 0.071, "step": 4095 }, { "epoch": 1.028922768053203, "grad_norm": 0.4552467167377472, "learning_rate": 1.001626082437432e-05, "loss": 0.0869, "step": 4100 }, { "epoch": 1.0301775519166823, "grad_norm": 0.2607716917991638, "learning_rate": 9.995934792226892e-06, "loss": 0.0903, "step": 4105 }, { "epoch": 1.0314323357801618, "grad_norm": 0.2067851722240448, "learning_rate": 9.975608776874775e-06, "loss": 0.0818, "step": 4110 }, { "epoch": 1.0326871196436413, "grad_norm": 0.1449105590581894, "learning_rate": 9.955282862294447e-06, "loss": 0.0903, "step": 4115 }, { "epoch": 1.033941903507121, "grad_norm": 0.3527059853076935, "learning_rate": 9.93495713246196e-06, "loss": 0.0982, "step": 4120 }, { "epoch": 1.0351966873706004, "grad_norm": 0.38260769844055176, "learning_rate": 9.91463167135261e-06, "loss": 0.0798, "step": 4125 }, { "epoch": 1.03645147123408, "grad_norm": 0.19611142575740814, "learning_rate": 9.894306562940576e-06, "loss": 0.0771, "step": 4130 }, { "epoch": 1.0377062550975595, "grad_norm": 0.09190942347049713, "learning_rate": 9.873981891198585e-06, "loss": 0.0887, "step": 4135 }, { "epoch": 1.0389610389610389, "grad_norm": 0.3059125244617462, "learning_rate": 9.853657740097558e-06, "loss": 0.0804, "step": 4140 }, { "epoch": 1.0402158228245184, "grad_norm": 0.2917691171169281, "learning_rate": 9.833334193606266e-06, "loss": 0.0876, "step": 4145 }, { "epoch": 1.041470606687998, "grad_norm": 0.10409116744995117, "learning_rate": 9.81301133569098e-06, "loss": 0.0951, "step": 4150 }, { "epoch": 1.0427253905514775, "grad_norm": 0.23820455372333527, "learning_rate": 9.792689250315126e-06, "loss": 0.0873, "step": 4155 }, { "epoch": 1.043980174414957, "grad_norm": 0.3840187191963196, "learning_rate": 9.772368021438943e-06, "loss": 0.0859, "step": 4160 }, { "epoch": 1.0452349582784366, "grad_norm": 0.2977546751499176, "learning_rate": 9.752047733019132e-06, "loss": 0.0833, "step": 4165 }, { "epoch": 1.0464897421419161, "grad_norm": 0.3379094898700714, "learning_rate": 9.731728469008493e-06, "loss": 0.0799, "step": 4170 }, { "epoch": 1.0477445260053955, "grad_norm": 0.18591825664043427, "learning_rate": 9.711410313355614e-06, "loss": 0.0772, "step": 4175 }, { "epoch": 1.048999309868875, "grad_norm": 0.1972389817237854, "learning_rate": 9.691093350004492e-06, "loss": 0.0912, "step": 4180 }, { "epoch": 1.0502540937323546, "grad_norm": 0.2270268350839615, "learning_rate": 9.670777662894205e-06, "loss": 0.0733, "step": 4185 }, { "epoch": 1.0515088775958341, "grad_norm": 0.3008296489715576, "learning_rate": 9.650463335958551e-06, "loss": 0.0821, "step": 4190 }, { "epoch": 1.0527636614593137, "grad_norm": 0.11666283011436462, "learning_rate": 9.630150453125711e-06, "loss": 0.0782, "step": 4195 }, { "epoch": 1.0540184453227932, "grad_norm": 0.1857229471206665, "learning_rate": 9.609839098317902e-06, "loss": 0.0811, "step": 4200 }, { "epoch": 1.0552732291862728, "grad_norm": 0.24505260586738586, "learning_rate": 9.589529355451028e-06, "loss": 0.0747, "step": 4205 }, { "epoch": 1.056528013049752, "grad_norm": 0.12762409448623657, "learning_rate": 9.569221308434336e-06, "loss": 0.0813, "step": 4210 }, { "epoch": 1.0577827969132316, "grad_norm": 0.14525793492794037, "learning_rate": 9.548915041170049e-06, "loss": 0.0814, "step": 4215 }, { "epoch": 1.0590375807767112, "grad_norm": 0.4550996422767639, "learning_rate": 9.528610637553063e-06, "loss": 0.087, "step": 4220 }, { "epoch": 1.0602923646401907, "grad_norm": 0.11074529588222504, "learning_rate": 9.508308181470556e-06, "loss": 0.0885, "step": 4225 }, { "epoch": 1.0615471485036703, "grad_norm": 0.2155328094959259, "learning_rate": 9.488007756801672e-06, "loss": 0.0873, "step": 4230 }, { "epoch": 1.0628019323671498, "grad_norm": 0.0720556378364563, "learning_rate": 9.467709447417149e-06, "loss": 0.0787, "step": 4235 }, { "epoch": 1.0640567162306294, "grad_norm": 0.5692139267921448, "learning_rate": 9.447413337178994e-06, "loss": 0.0746, "step": 4240 }, { "epoch": 1.0653115000941087, "grad_norm": 0.42540258169174194, "learning_rate": 9.42711950994013e-06, "loss": 0.0977, "step": 4245 }, { "epoch": 1.0665662839575882, "grad_norm": 0.13704167306423187, "learning_rate": 9.406828049544046e-06, "loss": 0.0815, "step": 4250 }, { "epoch": 1.0678210678210678, "grad_norm": 0.11324842274188995, "learning_rate": 9.386539039824446e-06, "loss": 0.0853, "step": 4255 }, { "epoch": 1.0690758516845473, "grad_norm": 0.09248417615890503, "learning_rate": 9.366252564604914e-06, "loss": 0.0751, "step": 4260 }, { "epoch": 1.0703306355480269, "grad_norm": 0.11014354974031448, "learning_rate": 9.34596870769857e-06, "loss": 0.0888, "step": 4265 }, { "epoch": 1.0715854194115064, "grad_norm": 0.2357379049062729, "learning_rate": 9.325687552907708e-06, "loss": 0.0806, "step": 4270 }, { "epoch": 1.072840203274986, "grad_norm": 0.23039399087429047, "learning_rate": 9.305409184023455e-06, "loss": 0.0899, "step": 4275 }, { "epoch": 1.0740949871384653, "grad_norm": 0.3211176097393036, "learning_rate": 9.285133684825435e-06, "loss": 0.0921, "step": 4280 }, { "epoch": 1.0753497710019448, "grad_norm": 0.08523392677307129, "learning_rate": 9.264861139081417e-06, "loss": 0.0742, "step": 4285 }, { "epoch": 1.0766045548654244, "grad_norm": 0.446898490190506, "learning_rate": 9.244591630546964e-06, "loss": 0.0888, "step": 4290 }, { "epoch": 1.077859338728904, "grad_norm": 0.30059829354286194, "learning_rate": 9.224325242965088e-06, "loss": 0.0796, "step": 4295 }, { "epoch": 1.0791141225923835, "grad_norm": 0.1692143827676773, "learning_rate": 9.204062060065915e-06, "loss": 0.0988, "step": 4300 }, { "epoch": 1.080368906455863, "grad_norm": 0.5262351632118225, "learning_rate": 9.18380216556632e-06, "loss": 0.0817, "step": 4305 }, { "epoch": 1.0816236903193426, "grad_norm": 0.2138439118862152, "learning_rate": 9.163545643169607e-06, "loss": 0.0921, "step": 4310 }, { "epoch": 1.082878474182822, "grad_norm": 0.2504398822784424, "learning_rate": 9.143292576565142e-06, "loss": 0.0747, "step": 4315 }, { "epoch": 1.0841332580463015, "grad_norm": 0.2210868000984192, "learning_rate": 9.123043049427996e-06, "loss": 0.0843, "step": 4320 }, { "epoch": 1.085388041909781, "grad_norm": 0.20180197060108185, "learning_rate": 9.102797145418644e-06, "loss": 0.0854, "step": 4325 }, { "epoch": 1.0866428257732605, "grad_norm": 0.3638245761394501, "learning_rate": 9.082554948182577e-06, "loss": 0.0809, "step": 4330 }, { "epoch": 1.08789760963674, "grad_norm": 0.28940561413764954, "learning_rate": 9.062316541349978e-06, "loss": 0.0806, "step": 4335 }, { "epoch": 1.0891523935002196, "grad_norm": 0.15019264817237854, "learning_rate": 9.042082008535361e-06, "loss": 0.081, "step": 4340 }, { "epoch": 1.0904071773636992, "grad_norm": 0.4596641957759857, "learning_rate": 9.021851433337243e-06, "loss": 0.0886, "step": 4345 }, { "epoch": 1.0916619612271785, "grad_norm": 0.3484109342098236, "learning_rate": 9.001624899337785e-06, "loss": 0.0997, "step": 4350 }, { "epoch": 1.092916745090658, "grad_norm": 0.4524548053741455, "learning_rate": 8.981402490102464e-06, "loss": 0.0719, "step": 4355 }, { "epoch": 1.0941715289541376, "grad_norm": 0.18216001987457275, "learning_rate": 8.961184289179695e-06, "loss": 0.0866, "step": 4360 }, { "epoch": 1.0954263128176172, "grad_norm": 0.3416767120361328, "learning_rate": 8.94097038010052e-06, "loss": 0.0759, "step": 4365 }, { "epoch": 1.0966810966810967, "grad_norm": 0.549856960773468, "learning_rate": 8.920760846378248e-06, "loss": 0.0875, "step": 4370 }, { "epoch": 1.0979358805445762, "grad_norm": 0.12520906329154968, "learning_rate": 8.900555771508114e-06, "loss": 0.086, "step": 4375 }, { "epoch": 1.0991906644080558, "grad_norm": 0.17632810771465302, "learning_rate": 8.880355238966923e-06, "loss": 0.084, "step": 4380 }, { "epoch": 1.1004454482715351, "grad_norm": 0.28502127528190613, "learning_rate": 8.860159332212719e-06, "loss": 0.0813, "step": 4385 }, { "epoch": 1.1017002321350147, "grad_norm": 0.43483448028564453, "learning_rate": 8.83996813468443e-06, "loss": 0.0871, "step": 4390 }, { "epoch": 1.1029550159984942, "grad_norm": 0.28160524368286133, "learning_rate": 8.81978172980154e-06, "loss": 0.0886, "step": 4395 }, { "epoch": 1.1042097998619738, "grad_norm": 0.4156465232372284, "learning_rate": 8.799600200963716e-06, "loss": 0.083, "step": 4400 }, { "epoch": 1.1054645837254533, "grad_norm": 0.24867035448551178, "learning_rate": 8.77942363155049e-06, "loss": 0.0785, "step": 4405 }, { "epoch": 1.1067193675889329, "grad_norm": 0.3078720271587372, "learning_rate": 8.7592521049209e-06, "loss": 0.0817, "step": 4410 }, { "epoch": 1.1079741514524124, "grad_norm": 0.12757770717144012, "learning_rate": 8.739085704413161e-06, "loss": 0.0773, "step": 4415 }, { "epoch": 1.1092289353158917, "grad_norm": 0.2089099884033203, "learning_rate": 8.718924513344288e-06, "loss": 0.0826, "step": 4420 }, { "epoch": 1.1104837191793713, "grad_norm": 0.28598687052726746, "learning_rate": 8.698768615009789e-06, "loss": 0.0898, "step": 4425 }, { "epoch": 1.1117385030428508, "grad_norm": 0.16409343481063843, "learning_rate": 8.678618092683307e-06, "loss": 0.0904, "step": 4430 }, { "epoch": 1.1129932869063304, "grad_norm": 0.17756353318691254, "learning_rate": 8.658473029616264e-06, "loss": 0.077, "step": 4435 }, { "epoch": 1.11424807076981, "grad_norm": 0.27898117899894714, "learning_rate": 8.638333509037537e-06, "loss": 0.0785, "step": 4440 }, { "epoch": 1.1155028546332895, "grad_norm": 0.12959200143814087, "learning_rate": 8.61819961415309e-06, "loss": 0.0752, "step": 4445 }, { "epoch": 1.116757638496769, "grad_norm": 0.0896739810705185, "learning_rate": 8.598071428145663e-06, "loss": 0.0962, "step": 4450 }, { "epoch": 1.1180124223602483, "grad_norm": 0.37898027896881104, "learning_rate": 8.577949034174395e-06, "loss": 0.0864, "step": 4455 }, { "epoch": 1.1192672062237279, "grad_norm": 0.19253282248973846, "learning_rate": 8.55783251537451e-06, "loss": 0.0883, "step": 4460 }, { "epoch": 1.1205219900872074, "grad_norm": 0.35812661051750183, "learning_rate": 8.537721954856942e-06, "loss": 0.0839, "step": 4465 }, { "epoch": 1.121776773950687, "grad_norm": 0.3062339127063751, "learning_rate": 8.517617435708011e-06, "loss": 0.0808, "step": 4470 }, { "epoch": 1.1230315578141665, "grad_norm": 0.13663077354431152, "learning_rate": 8.497519040989096e-06, "loss": 0.0961, "step": 4475 }, { "epoch": 1.124286341677646, "grad_norm": 0.2609081566333771, "learning_rate": 8.477426853736257e-06, "loss": 0.0929, "step": 4480 }, { "epoch": 1.1255411255411256, "grad_norm": 0.12208747863769531, "learning_rate": 8.457340956959905e-06, "loss": 0.0911, "step": 4485 }, { "epoch": 1.126795909404605, "grad_norm": 0.19168135523796082, "learning_rate": 8.437261433644472e-06, "loss": 0.0863, "step": 4490 }, { "epoch": 1.1280506932680845, "grad_norm": 0.20487892627716064, "learning_rate": 8.417188366748051e-06, "loss": 0.0883, "step": 4495 }, { "epoch": 1.129305477131564, "grad_norm": 0.1829683482646942, "learning_rate": 8.397121839202069e-06, "loss": 0.0905, "step": 4500 }, { "epoch": 1.1305602609950436, "grad_norm": 0.13941001892089844, "learning_rate": 8.377061933910924e-06, "loss": 0.078, "step": 4505 }, { "epoch": 1.1318150448585231, "grad_norm": 0.1624182164669037, "learning_rate": 8.357008733751664e-06, "loss": 0.0877, "step": 4510 }, { "epoch": 1.1330698287220027, "grad_norm": 0.44617176055908203, "learning_rate": 8.33696232157363e-06, "loss": 0.0875, "step": 4515 }, { "epoch": 1.1343246125854822, "grad_norm": 0.1490097939968109, "learning_rate": 8.316922780198126e-06, "loss": 0.0755, "step": 4520 }, { "epoch": 1.1355793964489616, "grad_norm": 0.2881050109863281, "learning_rate": 8.296890192418052e-06, "loss": 0.0865, "step": 4525 }, { "epoch": 1.136834180312441, "grad_norm": 0.19701127707958221, "learning_rate": 8.276864640997602e-06, "loss": 0.0837, "step": 4530 }, { "epoch": 1.1380889641759206, "grad_norm": 0.4736967086791992, "learning_rate": 8.256846208671882e-06, "loss": 0.0843, "step": 4535 }, { "epoch": 1.1393437480394002, "grad_norm": 0.22454383969306946, "learning_rate": 8.236834978146597e-06, "loss": 0.0805, "step": 4540 }, { "epoch": 1.1405985319028797, "grad_norm": 0.1956356167793274, "learning_rate": 8.216831032097689e-06, "loss": 0.0877, "step": 4545 }, { "epoch": 1.1418533157663593, "grad_norm": 0.3192068338394165, "learning_rate": 8.196834453171008e-06, "loss": 0.0773, "step": 4550 }, { "epoch": 1.1431080996298388, "grad_norm": 0.21237006783485413, "learning_rate": 8.17684532398197e-06, "loss": 0.0799, "step": 4555 }, { "epoch": 1.1443628834933182, "grad_norm": 0.3464091718196869, "learning_rate": 8.15686372711521e-06, "loss": 0.0946, "step": 4560 }, { "epoch": 1.1456176673567977, "grad_norm": 0.44841843843460083, "learning_rate": 8.136889745124241e-06, "loss": 0.0937, "step": 4565 }, { "epoch": 1.1468724512202773, "grad_norm": 0.12974920868873596, "learning_rate": 8.116923460531117e-06, "loss": 0.0866, "step": 4570 }, { "epoch": 1.1481272350837568, "grad_norm": 0.2840186655521393, "learning_rate": 8.09696495582609e-06, "loss": 0.0759, "step": 4575 }, { "epoch": 1.1493820189472364, "grad_norm": 0.2896987497806549, "learning_rate": 8.077014313467274e-06, "loss": 0.0905, "step": 4580 }, { "epoch": 1.150636802810716, "grad_norm": 0.08831676840782166, "learning_rate": 8.057071615880297e-06, "loss": 0.0855, "step": 4585 }, { "epoch": 1.1518915866741954, "grad_norm": 0.310893714427948, "learning_rate": 8.037136945457959e-06, "loss": 0.0868, "step": 4590 }, { "epoch": 1.1531463705376748, "grad_norm": 0.2327936589717865, "learning_rate": 8.017210384559901e-06, "loss": 0.0661, "step": 4595 }, { "epoch": 1.1544011544011543, "grad_norm": 0.23894554376602173, "learning_rate": 7.997292015512257e-06, "loss": 0.0816, "step": 4600 }, { "epoch": 1.1556559382646339, "grad_norm": 0.2750934064388275, "learning_rate": 7.977381920607324e-06, "loss": 0.0724, "step": 4605 }, { "epoch": 1.1569107221281134, "grad_norm": 0.2656705677509308, "learning_rate": 7.957480182103198e-06, "loss": 0.0839, "step": 4610 }, { "epoch": 1.158165505991593, "grad_norm": 0.08596844226121902, "learning_rate": 7.93758688222347e-06, "loss": 0.0873, "step": 4615 }, { "epoch": 1.1594202898550725, "grad_norm": 0.2484734207391739, "learning_rate": 7.91770210315685e-06, "loss": 0.084, "step": 4620 }, { "epoch": 1.160675073718552, "grad_norm": 0.3806692361831665, "learning_rate": 7.897825927056865e-06, "loss": 0.0856, "step": 4625 }, { "epoch": 1.1619298575820314, "grad_norm": 0.26702991127967834, "learning_rate": 7.877958436041475e-06, "loss": 0.0804, "step": 4630 }, { "epoch": 1.163184641445511, "grad_norm": 0.19116578996181488, "learning_rate": 7.858099712192774e-06, "loss": 0.0882, "step": 4635 }, { "epoch": 1.1644394253089905, "grad_norm": 0.08863260596990585, "learning_rate": 7.83824983755663e-06, "loss": 0.0761, "step": 4640 }, { "epoch": 1.16569420917247, "grad_norm": 0.22460100054740906, "learning_rate": 7.818408894142351e-06, "loss": 0.0905, "step": 4645 }, { "epoch": 1.1669489930359496, "grad_norm": 0.2022630125284195, "learning_rate": 7.798576963922347e-06, "loss": 0.086, "step": 4650 }, { "epoch": 1.1682037768994291, "grad_norm": 0.17606490850448608, "learning_rate": 7.778754128831782e-06, "loss": 0.0742, "step": 4655 }, { "epoch": 1.1694585607629087, "grad_norm": 0.1387161910533905, "learning_rate": 7.75894047076826e-06, "loss": 0.0817, "step": 4660 }, { "epoch": 1.170713344626388, "grad_norm": 0.09419666230678558, "learning_rate": 7.739136071591455e-06, "loss": 0.0918, "step": 4665 }, { "epoch": 1.1719681284898675, "grad_norm": 0.30808964371681213, "learning_rate": 7.719341013122795e-06, "loss": 0.0724, "step": 4670 }, { "epoch": 1.173222912353347, "grad_norm": 0.43276599049568176, "learning_rate": 7.699555377145113e-06, "loss": 0.09, "step": 4675 }, { "epoch": 1.1744776962168266, "grad_norm": 0.1630372256040573, "learning_rate": 7.679779245402321e-06, "loss": 0.0795, "step": 4680 }, { "epoch": 1.1757324800803062, "grad_norm": 0.1593324989080429, "learning_rate": 7.660012699599062e-06, "loss": 0.0891, "step": 4685 }, { "epoch": 1.1769872639437857, "grad_norm": 0.19340068101882935, "learning_rate": 7.640255821400364e-06, "loss": 0.098, "step": 4690 }, { "epoch": 1.1782420478072653, "grad_norm": 0.10026690363883972, "learning_rate": 7.620508692431327e-06, "loss": 0.0809, "step": 4695 }, { "epoch": 1.1794968316707446, "grad_norm": 0.24709242582321167, "learning_rate": 7.600771394276767e-06, "loss": 0.09, "step": 4700 }, { "epoch": 1.1807516155342241, "grad_norm": 0.10687977075576782, "learning_rate": 7.5810440084808855e-06, "loss": 0.0948, "step": 4705 }, { "epoch": 1.1820063993977037, "grad_norm": 0.5234541893005371, "learning_rate": 7.561326616546932e-06, "loss": 0.0857, "step": 4710 }, { "epoch": 1.1832611832611832, "grad_norm": 0.5169919729232788, "learning_rate": 7.541619299936859e-06, "loss": 0.0716, "step": 4715 }, { "epoch": 1.1845159671246628, "grad_norm": 0.13368743658065796, "learning_rate": 7.521922140071003e-06, "loss": 0.082, "step": 4720 }, { "epoch": 1.1857707509881423, "grad_norm": 0.32512664794921875, "learning_rate": 7.50223521832773e-06, "loss": 0.0923, "step": 4725 }, { "epoch": 1.1870255348516219, "grad_norm": 0.15283085405826569, "learning_rate": 7.482558616043123e-06, "loss": 0.0913, "step": 4730 }, { "epoch": 1.1882803187151012, "grad_norm": 0.1149832084774971, "learning_rate": 7.462892414510605e-06, "loss": 0.0795, "step": 4735 }, { "epoch": 1.1895351025785807, "grad_norm": 0.1783827543258667, "learning_rate": 7.443236694980649e-06, "loss": 0.0922, "step": 4740 }, { "epoch": 1.1907898864420603, "grad_norm": 0.3042134940624237, "learning_rate": 7.423591538660416e-06, "loss": 0.1, "step": 4745 }, { "epoch": 1.1920446703055398, "grad_norm": 0.32708263397216797, "learning_rate": 7.4039570267134266e-06, "loss": 0.0874, "step": 4750 }, { "epoch": 1.1932994541690194, "grad_norm": 0.3249336779117584, "learning_rate": 7.384333240259216e-06, "loss": 0.0855, "step": 4755 }, { "epoch": 1.194554238032499, "grad_norm": 0.3625956177711487, "learning_rate": 7.364720260373017e-06, "loss": 0.0819, "step": 4760 }, { "epoch": 1.1958090218959785, "grad_norm": 0.13294678926467896, "learning_rate": 7.345118168085412e-06, "loss": 0.0896, "step": 4765 }, { "epoch": 1.1970638057594578, "grad_norm": 0.29470571875572205, "learning_rate": 7.325527044382004e-06, "loss": 0.0828, "step": 4770 }, { "epoch": 1.1983185896229374, "grad_norm": 0.23991379141807556, "learning_rate": 7.3059469702030725e-06, "loss": 0.0769, "step": 4775 }, { "epoch": 1.199573373486417, "grad_norm": 0.30648887157440186, "learning_rate": 7.286378026443252e-06, "loss": 0.0908, "step": 4780 }, { "epoch": 1.2008281573498965, "grad_norm": 0.10455156862735748, "learning_rate": 7.2668202939511946e-06, "loss": 0.0842, "step": 4785 }, { "epoch": 1.202082941213376, "grad_norm": 0.22708240151405334, "learning_rate": 7.2472738535292295e-06, "loss": 0.0885, "step": 4790 }, { "epoch": 1.2033377250768555, "grad_norm": 0.22052530944347382, "learning_rate": 7.227738785933025e-06, "loss": 0.0749, "step": 4795 }, { "epoch": 1.204592508940335, "grad_norm": 0.2625311315059662, "learning_rate": 7.208215171871277e-06, "loss": 0.0835, "step": 4800 }, { "epoch": 1.2058472928038144, "grad_norm": 0.20223468542099, "learning_rate": 7.188703092005353e-06, "loss": 0.0855, "step": 4805 }, { "epoch": 1.207102076667294, "grad_norm": 0.17523938417434692, "learning_rate": 7.169202626948973e-06, "loss": 0.0833, "step": 4810 }, { "epoch": 1.2083568605307735, "grad_norm": 0.11394723504781723, "learning_rate": 7.149713857267862e-06, "loss": 0.0769, "step": 4815 }, { "epoch": 1.209611644394253, "grad_norm": 0.28187689185142517, "learning_rate": 7.130236863479434e-06, "loss": 0.0908, "step": 4820 }, { "epoch": 1.2108664282577326, "grad_norm": 0.22345809638500214, "learning_rate": 7.110771726052446e-06, "loss": 0.087, "step": 4825 }, { "epoch": 1.2121212121212122, "grad_norm": 0.19860218465328217, "learning_rate": 7.091318525406671e-06, "loss": 0.078, "step": 4830 }, { "epoch": 1.2133759959846917, "grad_norm": 0.3908574879169464, "learning_rate": 7.071877341912576e-06, "loss": 0.0926, "step": 4835 }, { "epoch": 1.2146307798481712, "grad_norm": 0.18266786634922028, "learning_rate": 7.052448255890958e-06, "loss": 0.0858, "step": 4840 }, { "epoch": 1.2158855637116506, "grad_norm": 0.2580263316631317, "learning_rate": 7.033031347612655e-06, "loss": 0.0777, "step": 4845 }, { "epoch": 1.2171403475751301, "grad_norm": 0.22712433338165283, "learning_rate": 7.013626697298182e-06, "loss": 0.0771, "step": 4850 }, { "epoch": 1.2183951314386097, "grad_norm": 0.5155333876609802, "learning_rate": 6.994234385117414e-06, "loss": 0.0997, "step": 4855 }, { "epoch": 1.2196499153020892, "grad_norm": 0.16782841086387634, "learning_rate": 6.974854491189243e-06, "loss": 0.0891, "step": 4860 }, { "epoch": 1.2209046991655688, "grad_norm": 0.2045108526945114, "learning_rate": 6.95548709558127e-06, "loss": 0.0846, "step": 4865 }, { "epoch": 1.2221594830290483, "grad_norm": 0.46996012330055237, "learning_rate": 6.9361322783094465e-06, "loss": 0.0845, "step": 4870 }, { "epoch": 1.2234142668925279, "grad_norm": 0.3139326870441437, "learning_rate": 6.916790119337766e-06, "loss": 0.0842, "step": 4875 }, { "epoch": 1.2246690507560072, "grad_norm": 0.27236974239349365, "learning_rate": 6.897460698577918e-06, "loss": 0.0929, "step": 4880 }, { "epoch": 1.2259238346194867, "grad_norm": 0.3481730818748474, "learning_rate": 6.878144095888964e-06, "loss": 0.0832, "step": 4885 }, { "epoch": 1.2271786184829663, "grad_norm": 0.11182388663291931, "learning_rate": 6.858840391077017e-06, "loss": 0.0888, "step": 4890 }, { "epoch": 1.2284334023464458, "grad_norm": 0.5058387517929077, "learning_rate": 6.839549663894897e-06, "loss": 0.0784, "step": 4895 }, { "epoch": 1.2296881862099254, "grad_norm": 0.3258366584777832, "learning_rate": 6.820271994041796e-06, "loss": 0.0834, "step": 4900 }, { "epoch": 1.230942970073405, "grad_norm": 0.11910035461187363, "learning_rate": 6.8010074611629815e-06, "loss": 0.0806, "step": 4905 }, { "epoch": 1.2321977539368845, "grad_norm": 0.10795161128044128, "learning_rate": 6.781756144849431e-06, "loss": 0.0877, "step": 4910 }, { "epoch": 1.2334525378003638, "grad_norm": 0.07859393954277039, "learning_rate": 6.762518124637525e-06, "loss": 0.0789, "step": 4915 }, { "epoch": 1.2347073216638433, "grad_norm": 0.36102941632270813, "learning_rate": 6.743293480008703e-06, "loss": 0.0973, "step": 4920 }, { "epoch": 1.2359621055273229, "grad_norm": 0.07983565330505371, "learning_rate": 6.724082290389151e-06, "loss": 0.0922, "step": 4925 }, { "epoch": 1.2372168893908024, "grad_norm": 0.18033474683761597, "learning_rate": 6.704884635149467e-06, "loss": 0.083, "step": 4930 }, { "epoch": 1.238471673254282, "grad_norm": 0.33377745747566223, "learning_rate": 6.685700593604329e-06, "loss": 0.0769, "step": 4935 }, { "epoch": 1.2397264571177615, "grad_norm": 0.09846628457307816, "learning_rate": 6.666530245012168e-06, "loss": 0.0848, "step": 4940 }, { "epoch": 1.240981240981241, "grad_norm": 0.28946545720100403, "learning_rate": 6.647373668574841e-06, "loss": 0.0846, "step": 4945 }, { "epoch": 1.2422360248447206, "grad_norm": 0.2697726786136627, "learning_rate": 6.628230943437319e-06, "loss": 0.0741, "step": 4950 }, { "epoch": 1.2434908087082, "grad_norm": 0.24468262493610382, "learning_rate": 6.609102148687333e-06, "loss": 0.084, "step": 4955 }, { "epoch": 1.2447455925716795, "grad_norm": 0.7213346362113953, "learning_rate": 6.589987363355068e-06, "loss": 0.0909, "step": 4960 }, { "epoch": 1.246000376435159, "grad_norm": 0.1924159675836563, "learning_rate": 6.570886666412823e-06, "loss": 0.0967, "step": 4965 }, { "epoch": 1.2472551602986386, "grad_norm": 0.2051101177930832, "learning_rate": 6.551800136774697e-06, "loss": 0.0744, "step": 4970 }, { "epoch": 1.2485099441621181, "grad_norm": 0.2706952393054962, "learning_rate": 6.532727853296257e-06, "loss": 0.0763, "step": 4975 }, { "epoch": 1.2497647280255977, "grad_norm": 0.21088339388370514, "learning_rate": 6.513669894774209e-06, "loss": 0.0879, "step": 4980 }, { "epoch": 1.2510195118890772, "grad_norm": 0.339167982339859, "learning_rate": 6.494626339946075e-06, "loss": 0.0867, "step": 4985 }, { "epoch": 1.2522742957525566, "grad_norm": 0.2325342744588852, "learning_rate": 6.47559726748987e-06, "loss": 0.088, "step": 4990 }, { "epoch": 1.253529079616036, "grad_norm": 0.24122120440006256, "learning_rate": 6.456582756023781e-06, "loss": 0.082, "step": 4995 }, { "epoch": 1.2547838634795156, "grad_norm": 0.4205479025840759, "learning_rate": 6.437582884105835e-06, "loss": 0.0825, "step": 5000 }, { "epoch": 1.2560386473429952, "grad_norm": 0.4158347547054291, "learning_rate": 6.41859773023356e-06, "loss": 0.0845, "step": 5005 }, { "epoch": 1.2572934312064747, "grad_norm": 0.14699125289916992, "learning_rate": 6.399627372843699e-06, "loss": 0.0871, "step": 5010 }, { "epoch": 1.258548215069954, "grad_norm": 0.15239457786083221, "learning_rate": 6.380671890311852e-06, "loss": 0.0886, "step": 5015 }, { "epoch": 1.2598029989334338, "grad_norm": 0.35437726974487305, "learning_rate": 6.361731360952169e-06, "loss": 0.0941, "step": 5020 }, { "epoch": 1.2610577827969132, "grad_norm": 0.19353091716766357, "learning_rate": 6.342805863017012e-06, "loss": 0.0911, "step": 5025 }, { "epoch": 1.2623125666603927, "grad_norm": 0.19099637866020203, "learning_rate": 6.323895474696651e-06, "loss": 0.0929, "step": 5030 }, { "epoch": 1.2635673505238723, "grad_norm": 0.25522175431251526, "learning_rate": 6.305000274118926e-06, "loss": 0.0896, "step": 5035 }, { "epoch": 1.2648221343873518, "grad_norm": 0.35186290740966797, "learning_rate": 6.286120339348935e-06, "loss": 0.0717, "step": 5040 }, { "epoch": 1.2660769182508314, "grad_norm": 0.2007788121700287, "learning_rate": 6.267255748388697e-06, "loss": 0.0861, "step": 5045 }, { "epoch": 1.2673317021143107, "grad_norm": 0.07917312532663345, "learning_rate": 6.248406579176838e-06, "loss": 0.0829, "step": 5050 }, { "epoch": 1.2685864859777904, "grad_norm": 0.2649421989917755, "learning_rate": 6.229572909588282e-06, "loss": 0.0802, "step": 5055 }, { "epoch": 1.2698412698412698, "grad_norm": 0.13473555445671082, "learning_rate": 6.2107548174339085e-06, "loss": 0.0731, "step": 5060 }, { "epoch": 1.2710960537047493, "grad_norm": 0.3079237639904022, "learning_rate": 6.1919523804602335e-06, "loss": 0.0874, "step": 5065 }, { "epoch": 1.2723508375682289, "grad_norm": 0.1939849853515625, "learning_rate": 6.173165676349103e-06, "loss": 0.0782, "step": 5070 }, { "epoch": 1.2736056214317084, "grad_norm": 0.3249664902687073, "learning_rate": 6.15439478271736e-06, "loss": 0.078, "step": 5075 }, { "epoch": 1.274860405295188, "grad_norm": 0.21080322563648224, "learning_rate": 6.135639777116526e-06, "loss": 0.0777, "step": 5080 }, { "epoch": 1.2761151891586673, "grad_norm": 0.24793751537799835, "learning_rate": 6.116900737032484e-06, "loss": 0.0737, "step": 5085 }, { "epoch": 1.277369973022147, "grad_norm": 0.2202165126800537, "learning_rate": 6.0981777398851504e-06, "loss": 0.0823, "step": 5090 }, { "epoch": 1.2786247568856264, "grad_norm": 0.1270621120929718, "learning_rate": 6.079470863028164e-06, "loss": 0.0787, "step": 5095 }, { "epoch": 1.279879540749106, "grad_norm": 0.1102348044514656, "learning_rate": 6.0607801837485665e-06, "loss": 0.0749, "step": 5100 }, { "epoch": 1.2811343246125855, "grad_norm": 0.21010182797908783, "learning_rate": 6.042105779266479e-06, "loss": 0.0831, "step": 5105 }, { "epoch": 1.282389108476065, "grad_norm": 0.6771642565727234, "learning_rate": 6.023447726734771e-06, "loss": 0.0867, "step": 5110 }, { "epoch": 1.2836438923395446, "grad_norm": 0.22212505340576172, "learning_rate": 6.004806103238771e-06, "loss": 0.0935, "step": 5115 }, { "epoch": 1.284898676203024, "grad_norm": 0.18700364232063293, "learning_rate": 5.986180985795927e-06, "loss": 0.0867, "step": 5120 }, { "epoch": 1.2861534600665037, "grad_norm": 0.13250069320201874, "learning_rate": 5.967572451355486e-06, "loss": 0.0897, "step": 5125 }, { "epoch": 1.287408243929983, "grad_norm": 0.4712745249271393, "learning_rate": 5.9489805767981845e-06, "loss": 0.085, "step": 5130 }, { "epoch": 1.2886630277934625, "grad_norm": 0.1509675532579422, "learning_rate": 5.9304054389359354e-06, "loss": 0.082, "step": 5135 }, { "epoch": 1.289917811656942, "grad_norm": 0.37453317642211914, "learning_rate": 5.911847114511497e-06, "loss": 0.0786, "step": 5140 }, { "epoch": 1.2911725955204216, "grad_norm": 0.18769773840904236, "learning_rate": 5.893305680198175e-06, "loss": 0.0829, "step": 5145 }, { "epoch": 1.2924273793839012, "grad_norm": 0.4405484199523926, "learning_rate": 5.874781212599475e-06, "loss": 0.0804, "step": 5150 }, { "epoch": 1.2936821632473805, "grad_norm": 0.20101556181907654, "learning_rate": 5.856273788248819e-06, "loss": 0.0827, "step": 5155 }, { "epoch": 1.2949369471108603, "grad_norm": 0.2524980902671814, "learning_rate": 5.837783483609214e-06, "loss": 0.0766, "step": 5160 }, { "epoch": 1.2961917309743396, "grad_norm": 0.2875005304813385, "learning_rate": 5.819310375072935e-06, "loss": 0.0839, "step": 5165 }, { "epoch": 1.2974465148378191, "grad_norm": 0.3446800410747528, "learning_rate": 5.800854538961213e-06, "loss": 0.0906, "step": 5170 }, { "epoch": 1.2987012987012987, "grad_norm": 0.15456603467464447, "learning_rate": 5.782416051523909e-06, "loss": 0.0811, "step": 5175 }, { "epoch": 1.2999560825647782, "grad_norm": 0.3075577914714813, "learning_rate": 5.763994988939223e-06, "loss": 0.0808, "step": 5180 }, { "epoch": 1.3012108664282578, "grad_norm": 0.4799324572086334, "learning_rate": 5.745591427313365e-06, "loss": 0.0871, "step": 5185 }, { "epoch": 1.3024656502917373, "grad_norm": 0.1860457956790924, "learning_rate": 5.727205442680218e-06, "loss": 0.0799, "step": 5190 }, { "epoch": 1.3037204341552169, "grad_norm": 0.2134033590555191, "learning_rate": 5.708837111001069e-06, "loss": 0.0885, "step": 5195 }, { "epoch": 1.3049752180186962, "grad_norm": 0.40847694873809814, "learning_rate": 5.690486508164268e-06, "loss": 0.084, "step": 5200 }, { "epoch": 1.3062300018821758, "grad_norm": 0.21664460003376007, "learning_rate": 5.672153709984909e-06, "loss": 0.085, "step": 5205 }, { "epoch": 1.3074847857456553, "grad_norm": 0.22716949880123138, "learning_rate": 5.653838792204538e-06, "loss": 0.0807, "step": 5210 }, { "epoch": 1.3087395696091348, "grad_norm": 0.2670564353466034, "learning_rate": 5.6355418304908226e-06, "loss": 0.086, "step": 5215 }, { "epoch": 1.3099943534726144, "grad_norm": 0.18262840807437897, "learning_rate": 5.617262900437239e-06, "loss": 0.0784, "step": 5220 }, { "epoch": 1.311249137336094, "grad_norm": 0.41220247745513916, "learning_rate": 5.599002077562779e-06, "loss": 0.0788, "step": 5225 }, { "epoch": 1.3125039211995735, "grad_norm": 0.41886112093925476, "learning_rate": 5.580759437311624e-06, "loss": 0.0912, "step": 5230 }, { "epoch": 1.3137587050630528, "grad_norm": 0.30091163516044617, "learning_rate": 5.562535055052818e-06, "loss": 0.084, "step": 5235 }, { "epoch": 1.3150134889265324, "grad_norm": 0.38127401471138, "learning_rate": 5.544329006079987e-06, "loss": 0.0769, "step": 5240 }, { "epoch": 1.316268272790012, "grad_norm": 0.13492530584335327, "learning_rate": 5.526141365611018e-06, "loss": 0.0795, "step": 5245 }, { "epoch": 1.3175230566534915, "grad_norm": 0.27479758858680725, "learning_rate": 5.507972208787728e-06, "loss": 0.0838, "step": 5250 }, { "epoch": 1.318777840516971, "grad_norm": 0.1254061758518219, "learning_rate": 5.489821610675579e-06, "loss": 0.0773, "step": 5255 }, { "epoch": 1.3200326243804505, "grad_norm": 0.15454204380512238, "learning_rate": 5.471689646263358e-06, "loss": 0.0793, "step": 5260 }, { "epoch": 1.32128740824393, "grad_norm": 0.2265045940876007, "learning_rate": 5.453576390462861e-06, "loss": 0.0881, "step": 5265 }, { "epoch": 1.3225421921074094, "grad_norm": 0.14053259789943695, "learning_rate": 5.435481918108603e-06, "loss": 0.0836, "step": 5270 }, { "epoch": 1.323796975970889, "grad_norm": 0.15693189203739166, "learning_rate": 5.41740630395748e-06, "loss": 0.0816, "step": 5275 }, { "epoch": 1.3250517598343685, "grad_norm": 0.15856729447841644, "learning_rate": 5.399349622688479e-06, "loss": 0.0702, "step": 5280 }, { "epoch": 1.326306543697848, "grad_norm": 0.23238258063793182, "learning_rate": 5.3813119489023766e-06, "loss": 0.0839, "step": 5285 }, { "epoch": 1.3275613275613276, "grad_norm": 0.08567748963832855, "learning_rate": 5.363293357121422e-06, "loss": 0.0899, "step": 5290 }, { "epoch": 1.3288161114248072, "grad_norm": 0.0990014597773552, "learning_rate": 5.345293921789e-06, "loss": 0.0812, "step": 5295 }, { "epoch": 1.3300708952882867, "grad_norm": 0.2629724144935608, "learning_rate": 5.32731371726938e-06, "loss": 0.08, "step": 5300 }, { "epoch": 1.331325679151766, "grad_norm": 0.22977909445762634, "learning_rate": 5.309352817847374e-06, "loss": 0.0822, "step": 5305 }, { "epoch": 1.3325804630152456, "grad_norm": 0.1563161015510559, "learning_rate": 5.291411297728027e-06, "loss": 0.0814, "step": 5310 }, { "epoch": 1.3338352468787251, "grad_norm": 0.0872747004032135, "learning_rate": 5.273489231036321e-06, "loss": 0.0872, "step": 5315 }, { "epoch": 1.3350900307422047, "grad_norm": 0.24948804080486298, "learning_rate": 5.255586691816874e-06, "loss": 0.0708, "step": 5320 }, { "epoch": 1.3363448146056842, "grad_norm": 0.17048531770706177, "learning_rate": 5.237703754033616e-06, "loss": 0.0789, "step": 5325 }, { "epoch": 1.3375995984691638, "grad_norm": 0.33402279019355774, "learning_rate": 5.219840491569503e-06, "loss": 0.0754, "step": 5330 }, { "epoch": 1.3388543823326433, "grad_norm": 0.25611111521720886, "learning_rate": 5.2019969782262046e-06, "loss": 0.0792, "step": 5335 }, { "epoch": 1.3401091661961226, "grad_norm": 0.392952561378479, "learning_rate": 5.184173287723782e-06, "loss": 0.0684, "step": 5340 }, { "epoch": 1.3413639500596022, "grad_norm": 0.11270631104707718, "learning_rate": 5.166369493700412e-06, "loss": 0.0853, "step": 5345 }, { "epoch": 1.3426187339230817, "grad_norm": 0.3116929531097412, "learning_rate": 5.148585669712074e-06, "loss": 0.0821, "step": 5350 }, { "epoch": 1.3438735177865613, "grad_norm": 0.0726916715502739, "learning_rate": 5.130821889232228e-06, "loss": 0.092, "step": 5355 }, { "epoch": 1.3451283016500408, "grad_norm": 0.06830952316522598, "learning_rate": 5.113078225651529e-06, "loss": 0.085, "step": 5360 }, { "epoch": 1.3463830855135204, "grad_norm": 0.18756920099258423, "learning_rate": 5.095354752277526e-06, "loss": 0.0887, "step": 5365 }, { "epoch": 1.347637869377, "grad_norm": 0.29784929752349854, "learning_rate": 5.0776515423343445e-06, "loss": 0.0919, "step": 5370 }, { "epoch": 1.3488926532404792, "grad_norm": 0.24415934085845947, "learning_rate": 5.059968668962401e-06, "loss": 0.0904, "step": 5375 }, { "epoch": 1.3501474371039588, "grad_norm": 0.10524599254131317, "learning_rate": 5.042306205218082e-06, "loss": 0.0899, "step": 5380 }, { "epoch": 1.3514022209674383, "grad_norm": 0.13240844011306763, "learning_rate": 5.024664224073454e-06, "loss": 0.0838, "step": 5385 }, { "epoch": 1.3526570048309179, "grad_norm": 0.17067807912826538, "learning_rate": 5.007042798415969e-06, "loss": 0.0824, "step": 5390 }, { "epoch": 1.3539117886943974, "grad_norm": 0.1325388103723526, "learning_rate": 4.989442001048151e-06, "loss": 0.0839, "step": 5395 }, { "epoch": 1.355166572557877, "grad_norm": 0.4756743907928467, "learning_rate": 4.971861904687283e-06, "loss": 0.0775, "step": 5400 }, { "epoch": 1.3564213564213565, "grad_norm": 0.2643389105796814, "learning_rate": 4.954302581965143e-06, "loss": 0.089, "step": 5405 }, { "epoch": 1.3576761402848359, "grad_norm": 0.33239737153053284, "learning_rate": 4.93676410542768e-06, "loss": 0.0945, "step": 5410 }, { "epoch": 1.3589309241483154, "grad_norm": 0.315899133682251, "learning_rate": 4.919246547534709e-06, "loss": 0.0858, "step": 5415 }, { "epoch": 1.360185708011795, "grad_norm": 0.7093632817268372, "learning_rate": 4.901749980659617e-06, "loss": 0.0778, "step": 5420 }, { "epoch": 1.3614404918752745, "grad_norm": 0.17570814490318298, "learning_rate": 4.884274477089085e-06, "loss": 0.0856, "step": 5425 }, { "epoch": 1.362695275738754, "grad_norm": 0.1181860864162445, "learning_rate": 4.866820109022752e-06, "loss": 0.0844, "step": 5430 }, { "epoch": 1.3639500596022336, "grad_norm": 0.18417999148368835, "learning_rate": 4.84938694857295e-06, "loss": 0.0687, "step": 5435 }, { "epoch": 1.3652048434657131, "grad_norm": 0.10456906259059906, "learning_rate": 4.831975067764387e-06, "loss": 0.0765, "step": 5440 }, { "epoch": 1.3664596273291925, "grad_norm": 0.16597416996955872, "learning_rate": 4.814584538533848e-06, "loss": 0.0748, "step": 5445 }, { "epoch": 1.367714411192672, "grad_norm": 0.08436016738414764, "learning_rate": 4.797215432729913e-06, "loss": 0.0748, "step": 5450 }, { "epoch": 1.3689691950561516, "grad_norm": 0.2234387844800949, "learning_rate": 4.779867822112658e-06, "loss": 0.0795, "step": 5455 }, { "epoch": 1.370223978919631, "grad_norm": 0.2501266300678253, "learning_rate": 4.762541778353337e-06, "loss": 0.0785, "step": 5460 }, { "epoch": 1.3714787627831106, "grad_norm": 0.14667633175849915, "learning_rate": 4.745237373034103e-06, "loss": 0.0836, "step": 5465 }, { "epoch": 1.3727335466465902, "grad_norm": 0.12082650512456894, "learning_rate": 4.727954677647724e-06, "loss": 0.0827, "step": 5470 }, { "epoch": 1.3739883305100697, "grad_norm": 0.25682708621025085, "learning_rate": 4.7106937635972565e-06, "loss": 0.0806, "step": 5475 }, { "epoch": 1.375243114373549, "grad_norm": 0.11401407420635223, "learning_rate": 4.693454702195784e-06, "loss": 0.085, "step": 5480 }, { "epoch": 1.3764978982370286, "grad_norm": 0.13626615703105927, "learning_rate": 4.676237564666095e-06, "loss": 0.072, "step": 5485 }, { "epoch": 1.3777526821005082, "grad_norm": 0.274503231048584, "learning_rate": 4.659042422140399e-06, "loss": 0.0869, "step": 5490 }, { "epoch": 1.3790074659639877, "grad_norm": 0.3412606716156006, "learning_rate": 4.6418693456600424e-06, "loss": 0.0894, "step": 5495 }, { "epoch": 1.3802622498274673, "grad_norm": 0.17439858615398407, "learning_rate": 4.62471840617521e-06, "loss": 0.0855, "step": 5500 }, { "epoch": 1.3815170336909468, "grad_norm": 0.2983112037181854, "learning_rate": 4.607589674544603e-06, "loss": 0.0829, "step": 5505 }, { "epoch": 1.3827718175544264, "grad_norm": 0.32758232951164246, "learning_rate": 4.590483221535198e-06, "loss": 0.0889, "step": 5510 }, { "epoch": 1.3840266014179057, "grad_norm": 0.34168171882629395, "learning_rate": 4.573399117821922e-06, "loss": 0.0872, "step": 5515 }, { "epoch": 1.3852813852813852, "grad_norm": 0.21264196932315826, "learning_rate": 4.556337433987359e-06, "loss": 0.0772, "step": 5520 }, { "epoch": 1.3865361691448648, "grad_norm": 0.11897042393684387, "learning_rate": 4.539298240521463e-06, "loss": 0.0796, "step": 5525 }, { "epoch": 1.3877909530083443, "grad_norm": 0.20385603606700897, "learning_rate": 4.522281607821288e-06, "loss": 0.0698, "step": 5530 }, { "epoch": 1.3890457368718239, "grad_norm": 0.3090057671070099, "learning_rate": 4.505287606190658e-06, "loss": 0.0735, "step": 5535 }, { "epoch": 1.3903005207353034, "grad_norm": 0.21723704040050507, "learning_rate": 4.488316305839911e-06, "loss": 0.0877, "step": 5540 }, { "epoch": 1.391555304598783, "grad_norm": 0.2535438537597656, "learning_rate": 4.471367776885589e-06, "loss": 0.0802, "step": 5545 }, { "epoch": 1.3928100884622623, "grad_norm": 0.2798052728176117, "learning_rate": 4.454442089350151e-06, "loss": 0.0828, "step": 5550 }, { "epoch": 1.3940648723257418, "grad_norm": 0.1968078762292862, "learning_rate": 4.437539313161697e-06, "loss": 0.0878, "step": 5555 }, { "epoch": 1.3953196561892214, "grad_norm": 0.3160542845726013, "learning_rate": 4.420659518153667e-06, "loss": 0.0843, "step": 5560 }, { "epoch": 1.396574440052701, "grad_norm": 0.10868985950946808, "learning_rate": 4.403802774064548e-06, "loss": 0.0898, "step": 5565 }, { "epoch": 1.3978292239161805, "grad_norm": 0.18841342628002167, "learning_rate": 4.386969150537593e-06, "loss": 0.0839, "step": 5570 }, { "epoch": 1.39908400777966, "grad_norm": 0.20673134922981262, "learning_rate": 4.370158717120544e-06, "loss": 0.0807, "step": 5575 }, { "epoch": 1.4003387916431396, "grad_norm": 0.2445622682571411, "learning_rate": 4.35337154326532e-06, "loss": 0.0825, "step": 5580 }, { "epoch": 1.401593575506619, "grad_norm": 0.2141215056180954, "learning_rate": 4.336607698327755e-06, "loss": 0.0852, "step": 5585 }, { "epoch": 1.4028483593700984, "grad_norm": 0.2787396311759949, "learning_rate": 4.3198672515672925e-06, "loss": 0.0829, "step": 5590 }, { "epoch": 1.404103143233578, "grad_norm": 0.24689462780952454, "learning_rate": 4.303150272146706e-06, "loss": 0.0864, "step": 5595 }, { "epoch": 1.4053579270970575, "grad_norm": 0.17422537505626678, "learning_rate": 4.286456829131821e-06, "loss": 0.086, "step": 5600 }, { "epoch": 1.406612710960537, "grad_norm": 0.20011769235134125, "learning_rate": 4.269786991491222e-06, "loss": 0.0812, "step": 5605 }, { "epoch": 1.4078674948240166, "grad_norm": 0.2414385825395584, "learning_rate": 4.253140828095964e-06, "loss": 0.0834, "step": 5610 }, { "epoch": 1.4091222786874962, "grad_norm": 0.2885581851005554, "learning_rate": 4.236518407719289e-06, "loss": 0.0765, "step": 5615 }, { "epoch": 1.4103770625509755, "grad_norm": 0.20998211205005646, "learning_rate": 4.219919799036359e-06, "loss": 0.0748, "step": 5620 }, { "epoch": 1.411631846414455, "grad_norm": 0.26928192377090454, "learning_rate": 4.203345070623947e-06, "loss": 0.0923, "step": 5625 }, { "epoch": 1.4128866302779346, "grad_norm": 0.28317445516586304, "learning_rate": 4.186794290960162e-06, "loss": 0.08, "step": 5630 }, { "epoch": 1.4141414141414141, "grad_norm": 0.08973658084869385, "learning_rate": 4.170267528424185e-06, "loss": 0.0775, "step": 5635 }, { "epoch": 1.4153961980048937, "grad_norm": 0.4094654321670532, "learning_rate": 4.153764851295954e-06, "loss": 0.0969, "step": 5640 }, { "epoch": 1.4166509818683732, "grad_norm": 0.17649400234222412, "learning_rate": 4.137286327755913e-06, "loss": 0.0809, "step": 5645 }, { "epoch": 1.4179057657318528, "grad_norm": 0.1127919927239418, "learning_rate": 4.120832025884705e-06, "loss": 0.0782, "step": 5650 }, { "epoch": 1.419160549595332, "grad_norm": 0.5189236998558044, "learning_rate": 4.104402013662901e-06, "loss": 0.0867, "step": 5655 }, { "epoch": 1.4204153334588117, "grad_norm": 0.4165945053100586, "learning_rate": 4.0879963589707305e-06, "loss": 0.0836, "step": 5660 }, { "epoch": 1.4216701173222912, "grad_norm": 0.28989869356155396, "learning_rate": 4.071615129587787e-06, "loss": 0.0818, "step": 5665 }, { "epoch": 1.4229249011857708, "grad_norm": 0.101304791867733, "learning_rate": 4.055258393192746e-06, "loss": 0.0827, "step": 5670 }, { "epoch": 1.4241796850492503, "grad_norm": 0.2473038285970688, "learning_rate": 4.038926217363089e-06, "loss": 0.0805, "step": 5675 }, { "epoch": 1.4254344689127298, "grad_norm": 0.2448158711194992, "learning_rate": 4.022618669574839e-06, "loss": 0.0763, "step": 5680 }, { "epoch": 1.4266892527762094, "grad_norm": 0.5424969792366028, "learning_rate": 4.006335817202256e-06, "loss": 0.0836, "step": 5685 }, { "epoch": 1.4279440366396887, "grad_norm": 0.2771781086921692, "learning_rate": 3.990077727517573e-06, "loss": 0.0783, "step": 5690 }, { "epoch": 1.4291988205031683, "grad_norm": 0.05690968781709671, "learning_rate": 3.973844467690727e-06, "loss": 0.0729, "step": 5695 }, { "epoch": 1.4304536043666478, "grad_norm": 0.26809534430503845, "learning_rate": 3.957636104789056e-06, "loss": 0.0766, "step": 5700 }, { "epoch": 1.4317083882301274, "grad_norm": 0.31916674971580505, "learning_rate": 3.94145270577705e-06, "loss": 0.0988, "step": 5705 }, { "epoch": 1.432963172093607, "grad_norm": 0.5278578996658325, "learning_rate": 3.925294337516051e-06, "loss": 0.0853, "step": 5710 }, { "epoch": 1.4342179559570865, "grad_norm": 0.15941303968429565, "learning_rate": 3.909161066763999e-06, "loss": 0.0872, "step": 5715 }, { "epoch": 1.435472739820566, "grad_norm": 0.46407434344291687, "learning_rate": 3.893052960175128e-06, "loss": 0.0746, "step": 5720 }, { "epoch": 1.4367275236840453, "grad_norm": 0.13292035460472107, "learning_rate": 3.876970084299722e-06, "loss": 0.0829, "step": 5725 }, { "epoch": 1.4379823075475249, "grad_norm": 0.18183240294456482, "learning_rate": 3.860912505583819e-06, "loss": 0.0797, "step": 5730 }, { "epoch": 1.4392370914110044, "grad_norm": 0.16055604815483093, "learning_rate": 3.844880290368935e-06, "loss": 0.0805, "step": 5735 }, { "epoch": 1.440491875274484, "grad_norm": 0.28802794218063354, "learning_rate": 3.828873504891813e-06, "loss": 0.0795, "step": 5740 }, { "epoch": 1.4417466591379635, "grad_norm": 0.11848758161067963, "learning_rate": 3.8128922152841188e-06, "loss": 0.0856, "step": 5745 }, { "epoch": 1.443001443001443, "grad_norm": 0.131498321890831, "learning_rate": 3.7969364875721914e-06, "loss": 0.084, "step": 5750 }, { "epoch": 1.4442562268649226, "grad_norm": 0.1645013391971588, "learning_rate": 3.78100638767676e-06, "loss": 0.0635, "step": 5755 }, { "epoch": 1.445511010728402, "grad_norm": 0.1245434433221817, "learning_rate": 3.7651019814126656e-06, "loss": 0.0708, "step": 5760 }, { "epoch": 1.4467657945918815, "grad_norm": 0.5209866762161255, "learning_rate": 3.7492233344886073e-06, "loss": 0.0801, "step": 5765 }, { "epoch": 1.448020578455361, "grad_norm": 0.19441930949687958, "learning_rate": 3.7333705125068576e-06, "loss": 0.0711, "step": 5770 }, { "epoch": 1.4492753623188406, "grad_norm": 0.27351874113082886, "learning_rate": 3.71754358096299e-06, "loss": 0.071, "step": 5775 }, { "epoch": 1.4505301461823201, "grad_norm": 0.2211633026599884, "learning_rate": 3.7017426052456086e-06, "loss": 0.0878, "step": 5780 }, { "epoch": 1.4517849300457997, "grad_norm": 0.24535594880580902, "learning_rate": 3.685967650636095e-06, "loss": 0.0805, "step": 5785 }, { "epoch": 1.4530397139092792, "grad_norm": 0.24481262266635895, "learning_rate": 3.6702187823083147e-06, "loss": 0.0893, "step": 5790 }, { "epoch": 1.4542944977727585, "grad_norm": 0.16357897222042084, "learning_rate": 3.6544960653283544e-06, "loss": 0.0862, "step": 5795 }, { "epoch": 1.455549281636238, "grad_norm": 0.34867826104164124, "learning_rate": 3.6387995646542727e-06, "loss": 0.0774, "step": 5800 }, { "epoch": 1.4568040654997176, "grad_norm": 0.1586691290140152, "learning_rate": 3.6231293451357994e-06, "loss": 0.0822, "step": 5805 }, { "epoch": 1.4580588493631972, "grad_norm": 0.1658533662557602, "learning_rate": 3.6074854715140983e-06, "loss": 0.0921, "step": 5810 }, { "epoch": 1.4593136332266767, "grad_norm": 0.26845037937164307, "learning_rate": 3.591868008421472e-06, "loss": 0.075, "step": 5815 }, { "epoch": 1.4605684170901563, "grad_norm": 0.2408105581998825, "learning_rate": 3.5762770203811225e-06, "loss": 0.0927, "step": 5820 }, { "epoch": 1.4618232009536358, "grad_norm": 0.15939578413963318, "learning_rate": 3.560712571806858e-06, "loss": 0.0794, "step": 5825 }, { "epoch": 1.4630779848171152, "grad_norm": 0.07994109392166138, "learning_rate": 3.5451747270028527e-06, "loss": 0.0799, "step": 5830 }, { "epoch": 1.4643327686805947, "grad_norm": 0.09967489540576935, "learning_rate": 3.5296635501633558e-06, "loss": 0.08, "step": 5835 }, { "epoch": 1.4655875525440742, "grad_norm": 0.1677519679069519, "learning_rate": 3.5141791053724405e-06, "loss": 0.0809, "step": 5840 }, { "epoch": 1.4668423364075538, "grad_norm": 0.21794256567955017, "learning_rate": 3.4987214566037477e-06, "loss": 0.0792, "step": 5845 }, { "epoch": 1.4680971202710333, "grad_norm": 0.2627575993537903, "learning_rate": 3.483290667720196e-06, "loss": 0.0942, "step": 5850 }, { "epoch": 1.4693519041345129, "grad_norm": 0.15493802726268768, "learning_rate": 3.4678868024737456e-06, "loss": 0.0864, "step": 5855 }, { "epoch": 1.4706066879979924, "grad_norm": 0.44874611496925354, "learning_rate": 3.452509924505113e-06, "loss": 0.0773, "step": 5860 }, { "epoch": 1.4718614718614718, "grad_norm": 0.1353214532136917, "learning_rate": 3.437160097343526e-06, "loss": 0.0789, "step": 5865 }, { "epoch": 1.4731162557249513, "grad_norm": 0.14763125777244568, "learning_rate": 3.4218373844064433e-06, "loss": 0.0824, "step": 5870 }, { "epoch": 1.4743710395884309, "grad_norm": 0.21625354886054993, "learning_rate": 3.4065418489993118e-06, "loss": 0.0736, "step": 5875 }, { "epoch": 1.4756258234519104, "grad_norm": 0.24612949788570404, "learning_rate": 3.3912735543152864e-06, "loss": 0.0773, "step": 5880 }, { "epoch": 1.47688060731539, "grad_norm": 0.1752692461013794, "learning_rate": 3.376032563434979e-06, "loss": 0.0826, "step": 5885 }, { "epoch": 1.4781353911788695, "grad_norm": 0.2644670605659485, "learning_rate": 3.3608189393262037e-06, "loss": 0.0869, "step": 5890 }, { "epoch": 1.479390175042349, "grad_norm": 0.20996436476707458, "learning_rate": 3.345632744843702e-06, "loss": 0.0902, "step": 5895 }, { "epoch": 1.4806449589058284, "grad_norm": 0.2922036647796631, "learning_rate": 3.3304740427288886e-06, "loss": 0.0896, "step": 5900 }, { "epoch": 1.481899742769308, "grad_norm": 0.37115779519081116, "learning_rate": 3.3153428956096046e-06, "loss": 0.0876, "step": 5905 }, { "epoch": 1.4831545266327875, "grad_norm": 0.23147353529930115, "learning_rate": 3.3002393659998357e-06, "loss": 0.0778, "step": 5910 }, { "epoch": 1.484409310496267, "grad_norm": 0.19222880899906158, "learning_rate": 3.2851635162994788e-06, "loss": 0.0884, "step": 5915 }, { "epoch": 1.4856640943597466, "grad_norm": 0.1700582355260849, "learning_rate": 3.27011540879406e-06, "loss": 0.0831, "step": 5920 }, { "epoch": 1.486918878223226, "grad_norm": 0.10918935388326645, "learning_rate": 3.2550951056545e-06, "loss": 0.0816, "step": 5925 }, { "epoch": 1.4881736620867057, "grad_norm": 0.21468310058116913, "learning_rate": 3.2401026689368363e-06, "loss": 0.0807, "step": 5930 }, { "epoch": 1.489428445950185, "grad_norm": 0.3127315044403076, "learning_rate": 3.2251381605819876e-06, "loss": 0.0958, "step": 5935 }, { "epoch": 1.4906832298136645, "grad_norm": 0.21210235357284546, "learning_rate": 3.210201642415477e-06, "loss": 0.0748, "step": 5940 }, { "epoch": 1.491938013677144, "grad_norm": 0.20049948990345, "learning_rate": 3.1952931761471893e-06, "loss": 0.0795, "step": 5945 }, { "epoch": 1.4931927975406236, "grad_norm": 0.15716488659381866, "learning_rate": 3.180412823371123e-06, "loss": 0.0909, "step": 5950 }, { "epoch": 1.4944475814041032, "grad_norm": 0.32477137446403503, "learning_rate": 3.1655606455651134e-06, "loss": 0.0792, "step": 5955 }, { "epoch": 1.4957023652675827, "grad_norm": 0.17082227766513824, "learning_rate": 3.1507367040905943e-06, "loss": 0.0898, "step": 5960 }, { "epoch": 1.4969571491310623, "grad_norm": 0.1650392711162567, "learning_rate": 3.135941060192348e-06, "loss": 0.0843, "step": 5965 }, { "epoch": 1.4982119329945416, "grad_norm": 0.10460920631885529, "learning_rate": 3.121173774998245e-06, "loss": 0.0879, "step": 5970 }, { "epoch": 1.4994667168580211, "grad_norm": 0.13847452402114868, "learning_rate": 3.106434909518985e-06, "loss": 0.0928, "step": 5975 }, { "epoch": 1.5007215007215007, "grad_norm": 0.1215418130159378, "learning_rate": 3.091724524647861e-06, "loss": 0.0807, "step": 5980 }, { "epoch": 1.5019762845849802, "grad_norm": 0.26588910818099976, "learning_rate": 3.0770426811604946e-06, "loss": 0.081, "step": 5985 }, { "epoch": 1.5032310684484598, "grad_norm": 0.4260256290435791, "learning_rate": 3.0623894397145837e-06, "loss": 0.0778, "step": 5990 }, { "epoch": 1.5044858523119393, "grad_norm": 0.20512565970420837, "learning_rate": 3.0477648608496726e-06, "loss": 0.0746, "step": 5995 }, { "epoch": 1.5057406361754189, "grad_norm": 0.3236079812049866, "learning_rate": 3.0331690049868733e-06, "loss": 0.0818, "step": 6000 }, { "epoch": 1.5069954200388982, "grad_norm": 0.5079814791679382, "learning_rate": 3.018601932428632e-06, "loss": 0.0782, "step": 6005 }, { "epoch": 1.508250203902378, "grad_norm": 0.15258589386940002, "learning_rate": 3.004063703358484e-06, "loss": 0.0885, "step": 6010 }, { "epoch": 1.5095049877658573, "grad_norm": 0.3337784707546234, "learning_rate": 2.9895543778407875e-06, "loss": 0.0973, "step": 6015 }, { "epoch": 1.5107597716293368, "grad_norm": 0.1551678329706192, "learning_rate": 2.9750740158205005e-06, "loss": 0.0877, "step": 6020 }, { "epoch": 1.5120145554928164, "grad_norm": 0.2518864870071411, "learning_rate": 2.960622677122903e-06, "loss": 0.0777, "step": 6025 }, { "epoch": 1.513269339356296, "grad_norm": 0.22310376167297363, "learning_rate": 2.9462004214533803e-06, "loss": 0.0812, "step": 6030 }, { "epoch": 1.5145241232197755, "grad_norm": 0.23948289453983307, "learning_rate": 2.9318073083971486e-06, "loss": 0.0879, "step": 6035 }, { "epoch": 1.5157789070832548, "grad_norm": 0.24216221272945404, "learning_rate": 2.9174433974190365e-06, "loss": 0.0858, "step": 6040 }, { "epoch": 1.5170336909467346, "grad_norm": 0.19065602123737335, "learning_rate": 2.9031087478632116e-06, "loss": 0.0868, "step": 6045 }, { "epoch": 1.518288474810214, "grad_norm": 0.2692478597164154, "learning_rate": 2.8888034189529524e-06, "loss": 0.0737, "step": 6050 }, { "epoch": 1.5195432586736934, "grad_norm": 0.20995523035526276, "learning_rate": 2.874527469790408e-06, "loss": 0.0824, "step": 6055 }, { "epoch": 1.520798042537173, "grad_norm": 0.10530667006969452, "learning_rate": 2.860280959356336e-06, "loss": 0.0783, "step": 6060 }, { "epoch": 1.5220528264006525, "grad_norm": 0.39904889464378357, "learning_rate": 2.846063946509868e-06, "loss": 0.0917, "step": 6065 }, { "epoch": 1.523307610264132, "grad_norm": 0.2294001579284668, "learning_rate": 2.8318764899882745e-06, "loss": 0.0794, "step": 6070 }, { "epoch": 1.5245623941276114, "grad_norm": 0.31115418672561646, "learning_rate": 2.8177186484067143e-06, "loss": 0.0803, "step": 6075 }, { "epoch": 1.5258171779910912, "grad_norm": 0.21022425591945648, "learning_rate": 2.803590480257985e-06, "loss": 0.0809, "step": 6080 }, { "epoch": 1.5270719618545705, "grad_norm": 0.4131692051887512, "learning_rate": 2.7894920439122907e-06, "loss": 0.0776, "step": 6085 }, { "epoch": 1.52832674571805, "grad_norm": 0.08908002078533173, "learning_rate": 2.77542339761701e-06, "loss": 0.0797, "step": 6090 }, { "epoch": 1.5295815295815296, "grad_norm": 0.13143645226955414, "learning_rate": 2.7613845994964296e-06, "loss": 0.0838, "step": 6095 }, { "epoch": 1.5308363134450091, "grad_norm": 0.3454774022102356, "learning_rate": 2.7473757075515305e-06, "loss": 0.0785, "step": 6100 }, { "epoch": 1.5320910973084887, "grad_norm": 0.2272334098815918, "learning_rate": 2.7333967796597317e-06, "loss": 0.0773, "step": 6105 }, { "epoch": 1.533345881171968, "grad_norm": 0.2906242311000824, "learning_rate": 2.7194478735746543e-06, "loss": 0.0897, "step": 6110 }, { "epoch": 1.5346006650354478, "grad_norm": 0.21385426819324493, "learning_rate": 2.70552904692589e-06, "loss": 0.0788, "step": 6115 }, { "epoch": 1.5358554488989271, "grad_norm": 0.2976323962211609, "learning_rate": 2.691640357218759e-06, "loss": 0.0921, "step": 6120 }, { "epoch": 1.5371102327624067, "grad_norm": 0.20897260308265686, "learning_rate": 2.6777818618340667e-06, "loss": 0.0831, "step": 6125 }, { "epoch": 1.5383650166258862, "grad_norm": 0.22841490805149078, "learning_rate": 2.663953618027869e-06, "loss": 0.0829, "step": 6130 }, { "epoch": 1.5396198004893658, "grad_norm": 0.34173956513404846, "learning_rate": 2.6501556829312492e-06, "loss": 0.0851, "step": 6135 }, { "epoch": 1.5408745843528453, "grad_norm": 0.10126471519470215, "learning_rate": 2.6363881135500567e-06, "loss": 0.0763, "step": 6140 }, { "epoch": 1.5421293682163246, "grad_norm": 0.1869829148054123, "learning_rate": 2.6226509667646993e-06, "loss": 0.0882, "step": 6145 }, { "epoch": 1.5433841520798044, "grad_norm": 0.08933491259813309, "learning_rate": 2.6089442993298854e-06, "loss": 0.0928, "step": 6150 }, { "epoch": 1.5446389359432837, "grad_norm": 0.2921507656574249, "learning_rate": 2.595268167874396e-06, "loss": 0.0824, "step": 6155 }, { "epoch": 1.5458937198067633, "grad_norm": 0.27281445264816284, "learning_rate": 2.581622628900868e-06, "loss": 0.0822, "step": 6160 }, { "epoch": 1.5471485036702428, "grad_norm": 0.20460334420204163, "learning_rate": 2.568007738785533e-06, "loss": 0.0819, "step": 6165 }, { "epoch": 1.5484032875337224, "grad_norm": 0.5249515175819397, "learning_rate": 2.5544235537779962e-06, "loss": 0.0768, "step": 6170 }, { "epoch": 1.549658071397202, "grad_norm": 0.2648284137248993, "learning_rate": 2.540870130001015e-06, "loss": 0.0843, "step": 6175 }, { "epoch": 1.5509128552606812, "grad_norm": 0.47869521379470825, "learning_rate": 2.5273475234502565e-06, "loss": 0.089, "step": 6180 }, { "epoch": 1.552167639124161, "grad_norm": 0.3596646785736084, "learning_rate": 2.5138557899940595e-06, "loss": 0.0802, "step": 6185 }, { "epoch": 1.5534224229876403, "grad_norm": 0.13940221071243286, "learning_rate": 2.5003949853732135e-06, "loss": 0.0844, "step": 6190 }, { "epoch": 1.5546772068511199, "grad_norm": 0.11128222197294235, "learning_rate": 2.486965165200733e-06, "loss": 0.0806, "step": 6195 }, { "epoch": 1.5559319907145994, "grad_norm": 0.1166953295469284, "learning_rate": 2.4735663849616098e-06, "loss": 0.0896, "step": 6200 }, { "epoch": 1.557186774578079, "grad_norm": 0.2891371250152588, "learning_rate": 2.460198700012608e-06, "loss": 0.0809, "step": 6205 }, { "epoch": 1.5584415584415585, "grad_norm": 0.3150325417518616, "learning_rate": 2.4468621655820125e-06, "loss": 0.0852, "step": 6210 }, { "epoch": 1.5596963423050378, "grad_norm": 0.4029218554496765, "learning_rate": 2.433556836769411e-06, "loss": 0.0751, "step": 6215 }, { "epoch": 1.5609511261685176, "grad_norm": 0.19041700661182404, "learning_rate": 2.420282768545469e-06, "loss": 0.0852, "step": 6220 }, { "epoch": 1.562205910031997, "grad_norm": 0.12898743152618408, "learning_rate": 2.4070400157517036e-06, "loss": 0.0909, "step": 6225 }, { "epoch": 1.5634606938954765, "grad_norm": 0.07892940193414688, "learning_rate": 2.3938286331002458e-06, "loss": 0.0879, "step": 6230 }, { "epoch": 1.564715477758956, "grad_norm": 0.21748192608356476, "learning_rate": 2.380648675173619e-06, "loss": 0.0877, "step": 6235 }, { "epoch": 1.5659702616224356, "grad_norm": 0.13437186181545258, "learning_rate": 2.367500196424529e-06, "loss": 0.0812, "step": 6240 }, { "epoch": 1.5672250454859151, "grad_norm": 0.21154047548770905, "learning_rate": 2.3543832511756113e-06, "loss": 0.0896, "step": 6245 }, { "epoch": 1.5684798293493944, "grad_norm": 0.16429923474788666, "learning_rate": 2.3412978936192343e-06, "loss": 0.0875, "step": 6250 }, { "epoch": 1.5697346132128742, "grad_norm": 0.29867643117904663, "learning_rate": 2.328244177817254e-06, "loss": 0.0784, "step": 6255 }, { "epoch": 1.5709893970763535, "grad_norm": 0.0878303125500679, "learning_rate": 2.315222157700797e-06, "loss": 0.0809, "step": 6260 }, { "epoch": 1.572244180939833, "grad_norm": 0.24482104182243347, "learning_rate": 2.3022318870700533e-06, "loss": 0.0865, "step": 6265 }, { "epoch": 1.5734989648033126, "grad_norm": 0.3083607852458954, "learning_rate": 2.289273419594027e-06, "loss": 0.0895, "step": 6270 }, { "epoch": 1.5747537486667922, "grad_norm": 0.3820037543773651, "learning_rate": 2.2763468088103315e-06, "loss": 0.08, "step": 6275 }, { "epoch": 1.5760085325302717, "grad_norm": 0.09033305943012238, "learning_rate": 2.263452108124968e-06, "loss": 0.0795, "step": 6280 }, { "epoch": 1.577263316393751, "grad_norm": 0.1263839453458786, "learning_rate": 2.250589370812105e-06, "loss": 0.0771, "step": 6285 }, { "epoch": 1.5785181002572308, "grad_norm": 0.45043227076530457, "learning_rate": 2.237758650013847e-06, "loss": 0.075, "step": 6290 }, { "epoch": 1.5797728841207102, "grad_norm": 0.10394462198019028, "learning_rate": 2.2249599987400237e-06, "loss": 0.0894, "step": 6295 }, { "epoch": 1.5810276679841897, "grad_norm": 0.394104540348053, "learning_rate": 2.2121934698679793e-06, "loss": 0.0932, "step": 6300 }, { "epoch": 1.5822824518476692, "grad_norm": 0.12392072379589081, "learning_rate": 2.1994591161423327e-06, "loss": 0.078, "step": 6305 }, { "epoch": 1.5835372357111488, "grad_norm": 0.3277107775211334, "learning_rate": 2.186756990174783e-06, "loss": 0.0815, "step": 6310 }, { "epoch": 1.5847920195746283, "grad_norm": 0.35756587982177734, "learning_rate": 2.174087144443875e-06, "loss": 0.0915, "step": 6315 }, { "epoch": 1.5860468034381077, "grad_norm": 0.3044070303440094, "learning_rate": 2.161449631294785e-06, "loss": 0.0844, "step": 6320 }, { "epoch": 1.5873015873015874, "grad_norm": 0.1297033727169037, "learning_rate": 2.148844502939117e-06, "loss": 0.0834, "step": 6325 }, { "epoch": 1.5885563711650668, "grad_norm": 0.1339099109172821, "learning_rate": 2.1362718114546777e-06, "loss": 0.0886, "step": 6330 }, { "epoch": 1.5898111550285463, "grad_norm": 0.15922382473945618, "learning_rate": 2.1237316087852465e-06, "loss": 0.0965, "step": 6335 }, { "epoch": 1.5910659388920259, "grad_norm": 0.19360999763011932, "learning_rate": 2.111223946740394e-06, "loss": 0.0896, "step": 6340 }, { "epoch": 1.5923207227555054, "grad_norm": 0.11068299412727356, "learning_rate": 2.0987488769952436e-06, "loss": 0.0759, "step": 6345 }, { "epoch": 1.593575506618985, "grad_norm": 0.0986902266740799, "learning_rate": 2.0863064510902586e-06, "loss": 0.0912, "step": 6350 }, { "epoch": 1.5948302904824643, "grad_norm": 0.23040059208869934, "learning_rate": 2.0738967204310455e-06, "loss": 0.0857, "step": 6355 }, { "epoch": 1.596085074345944, "grad_norm": 0.1746888905763626, "learning_rate": 2.0615197362881234e-06, "loss": 0.0814, "step": 6360 }, { "epoch": 1.5973398582094234, "grad_norm": 0.19930033385753632, "learning_rate": 2.0491755497967183e-06, "loss": 0.0848, "step": 6365 }, { "epoch": 1.598594642072903, "grad_norm": 0.17192873358726501, "learning_rate": 2.0368642119565617e-06, "loss": 0.088, "step": 6370 }, { "epoch": 1.5998494259363825, "grad_norm": 0.26457393169403076, "learning_rate": 2.024585773631671e-06, "loss": 0.0843, "step": 6375 }, { "epoch": 1.601104209799862, "grad_norm": 0.15081310272216797, "learning_rate": 2.012340285550126e-06, "loss": 0.0822, "step": 6380 }, { "epoch": 1.6023589936633416, "grad_norm": 0.1679522842168808, "learning_rate": 2.0001277983038904e-06, "loss": 0.0895, "step": 6385 }, { "epoch": 1.6036137775268209, "grad_norm": 0.12190816551446915, "learning_rate": 1.9879483623485786e-06, "loss": 0.0764, "step": 6390 }, { "epoch": 1.6048685613903007, "grad_norm": 0.30287179350852966, "learning_rate": 1.975802028003253e-06, "loss": 0.076, "step": 6395 }, { "epoch": 1.60612334525378, "grad_norm": 0.3039589524269104, "learning_rate": 1.963688845450218e-06, "loss": 0.0883, "step": 6400 }, { "epoch": 1.6073781291172595, "grad_norm": 0.10443169623613358, "learning_rate": 1.9516088647348164e-06, "loss": 0.0792, "step": 6405 }, { "epoch": 1.608632912980739, "grad_norm": 0.10957269370555878, "learning_rate": 1.9395621357652117e-06, "loss": 0.0862, "step": 6410 }, { "epoch": 1.6098876968442186, "grad_norm": 0.14673763513565063, "learning_rate": 1.9275487083121946e-06, "loss": 0.0808, "step": 6415 }, { "epoch": 1.6111424807076982, "grad_norm": 0.32992008328437805, "learning_rate": 1.9155686320089684e-06, "loss": 0.0935, "step": 6420 }, { "epoch": 1.6123972645711775, "grad_norm": 0.17369329929351807, "learning_rate": 1.9036219563509439e-06, "loss": 0.0838, "step": 6425 }, { "epoch": 1.6136520484346573, "grad_norm": 0.22150751948356628, "learning_rate": 1.891708730695544e-06, "loss": 0.0873, "step": 6430 }, { "epoch": 1.6149068322981366, "grad_norm": 0.18616770207881927, "learning_rate": 1.8798290042619949e-06, "loss": 0.0934, "step": 6435 }, { "epoch": 1.6161616161616161, "grad_norm": 0.314390629529953, "learning_rate": 1.8679828261311073e-06, "loss": 0.0769, "step": 6440 }, { "epoch": 1.6174164000250957, "grad_norm": 0.35372182726860046, "learning_rate": 1.8561702452451047e-06, "loss": 0.0808, "step": 6445 }, { "epoch": 1.6186711838885752, "grad_norm": 0.16589049994945526, "learning_rate": 1.8443913104073984e-06, "loss": 0.0827, "step": 6450 }, { "epoch": 1.6199259677520548, "grad_norm": 0.18396617472171783, "learning_rate": 1.83264607028239e-06, "loss": 0.0795, "step": 6455 }, { "epoch": 1.621180751615534, "grad_norm": 0.0726623684167862, "learning_rate": 1.82093457339527e-06, "loss": 0.0816, "step": 6460 }, { "epoch": 1.6224355354790139, "grad_norm": 0.37276962399482727, "learning_rate": 1.809256868131828e-06, "loss": 0.0881, "step": 6465 }, { "epoch": 1.6236903193424932, "grad_norm": 0.2739810645580292, "learning_rate": 1.7976130027382332e-06, "loss": 0.0772, "step": 6470 }, { "epoch": 1.6249451032059727, "grad_norm": 0.18522068858146667, "learning_rate": 1.786003025320856e-06, "loss": 0.0858, "step": 6475 }, { "epoch": 1.6261998870694523, "grad_norm": 0.19277586042881012, "learning_rate": 1.774426983846058e-06, "loss": 0.082, "step": 6480 }, { "epoch": 1.6274546709329318, "grad_norm": 0.36260509490966797, "learning_rate": 1.7628849261399839e-06, "loss": 0.0751, "step": 6485 }, { "epoch": 1.6287094547964114, "grad_norm": 0.2178385704755783, "learning_rate": 1.7513768998883896e-06, "loss": 0.083, "step": 6490 }, { "epoch": 1.6299642386598907, "grad_norm": 0.1693069338798523, "learning_rate": 1.7399029526364254e-06, "loss": 0.0827, "step": 6495 }, { "epoch": 1.6312190225233705, "grad_norm": 0.2321835607290268, "learning_rate": 1.7284631317884448e-06, "loss": 0.0859, "step": 6500 }, { "epoch": 1.6324738063868498, "grad_norm": 0.21040475368499756, "learning_rate": 1.7170574846078037e-06, "loss": 0.0829, "step": 6505 }, { "epoch": 1.6337285902503293, "grad_norm": 0.1375027894973755, "learning_rate": 1.7056860582166823e-06, "loss": 0.0771, "step": 6510 }, { "epoch": 1.634983374113809, "grad_norm": 0.07659853994846344, "learning_rate": 1.6943488995958647e-06, "loss": 0.0859, "step": 6515 }, { "epoch": 1.6362381579772884, "grad_norm": 0.13758735358715057, "learning_rate": 1.6830460555845719e-06, "loss": 0.0846, "step": 6520 }, { "epoch": 1.637492941840768, "grad_norm": 0.2629936933517456, "learning_rate": 1.6717775728802432e-06, "loss": 0.083, "step": 6525 }, { "epoch": 1.6387477257042473, "grad_norm": 0.184475377202034, "learning_rate": 1.6605434980383594e-06, "loss": 0.0836, "step": 6530 }, { "epoch": 1.640002509567727, "grad_norm": 0.4641217887401581, "learning_rate": 1.649343877472248e-06, "loss": 0.0808, "step": 6535 }, { "epoch": 1.6412572934312064, "grad_norm": 0.23128898441791534, "learning_rate": 1.638178757452894e-06, "loss": 0.0745, "step": 6540 }, { "epoch": 1.642512077294686, "grad_norm": 0.09639477729797363, "learning_rate": 1.627048184108726e-06, "loss": 0.0849, "step": 6545 }, { "epoch": 1.6437668611581655, "grad_norm": 0.305663138628006, "learning_rate": 1.6159522034254628e-06, "loss": 0.0756, "step": 6550 }, { "epoch": 1.645021645021645, "grad_norm": 0.14679822325706482, "learning_rate": 1.604890861245898e-06, "loss": 0.0698, "step": 6555 }, { "epoch": 1.6462764288851246, "grad_norm": 0.16359837353229523, "learning_rate": 1.593864203269716e-06, "loss": 0.079, "step": 6560 }, { "epoch": 1.647531212748604, "grad_norm": 0.12374599277973175, "learning_rate": 1.582872275053301e-06, "loss": 0.0822, "step": 6565 }, { "epoch": 1.6487859966120837, "grad_norm": 0.2265704870223999, "learning_rate": 1.5719151220095596e-06, "loss": 0.0856, "step": 6570 }, { "epoch": 1.650040780475563, "grad_norm": 0.1844184547662735, "learning_rate": 1.5609927894077193e-06, "loss": 0.072, "step": 6575 }, { "epoch": 1.6512955643390426, "grad_norm": 0.0777512937784195, "learning_rate": 1.5501053223731532e-06, "loss": 0.0754, "step": 6580 }, { "epoch": 1.6525503482025221, "grad_norm": 0.3023741841316223, "learning_rate": 1.5392527658871813e-06, "loss": 0.0788, "step": 6585 }, { "epoch": 1.6538051320660017, "grad_norm": 0.42789706587791443, "learning_rate": 1.5284351647868956e-06, "loss": 0.0671, "step": 6590 }, { "epoch": 1.6550599159294812, "grad_norm": 0.1094786524772644, "learning_rate": 1.5176525637649708e-06, "loss": 0.0815, "step": 6595 }, { "epoch": 1.6563146997929605, "grad_norm": 0.2907751798629761, "learning_rate": 1.5069050073694813e-06, "loss": 0.0796, "step": 6600 }, { "epoch": 1.6575694836564403, "grad_norm": 0.11227094382047653, "learning_rate": 1.4961925400037102e-06, "loss": 0.0883, "step": 6605 }, { "epoch": 1.6588242675199196, "grad_norm": 0.34500670433044434, "learning_rate": 1.4855152059259737e-06, "loss": 0.0809, "step": 6610 }, { "epoch": 1.6600790513833992, "grad_norm": 0.1734425127506256, "learning_rate": 1.474873049249439e-06, "loss": 0.0819, "step": 6615 }, { "epoch": 1.6613338352468787, "grad_norm": 0.16895422339439392, "learning_rate": 1.4642661139419302e-06, "loss": 0.0874, "step": 6620 }, { "epoch": 1.6625886191103583, "grad_norm": 0.3084584176540375, "learning_rate": 1.453694443825766e-06, "loss": 0.0811, "step": 6625 }, { "epoch": 1.6638434029738378, "grad_norm": 0.09408409893512726, "learning_rate": 1.4431580825775604e-06, "loss": 0.0826, "step": 6630 }, { "epoch": 1.6650981868373171, "grad_norm": 0.06825320422649384, "learning_rate": 1.4326570737280488e-06, "loss": 0.0747, "step": 6635 }, { "epoch": 1.666352970700797, "grad_norm": 0.30554118752479553, "learning_rate": 1.4221914606619135e-06, "loss": 0.0773, "step": 6640 }, { "epoch": 1.6676077545642762, "grad_norm": 0.3845992982387543, "learning_rate": 1.4117612866176022e-06, "loss": 0.0893, "step": 6645 }, { "epoch": 1.6688625384277558, "grad_norm": 0.08592917770147324, "learning_rate": 1.4013665946871347e-06, "loss": 0.0817, "step": 6650 }, { "epoch": 1.6701173222912353, "grad_norm": 0.12979847192764282, "learning_rate": 1.391007427815949e-06, "loss": 0.0754, "step": 6655 }, { "epoch": 1.6713721061547149, "grad_norm": 0.2696765661239624, "learning_rate": 1.3806838288027113e-06, "loss": 0.0782, "step": 6660 }, { "epoch": 1.6726268900181944, "grad_norm": 0.20548641681671143, "learning_rate": 1.3703958402991345e-06, "loss": 0.0815, "step": 6665 }, { "epoch": 1.6738816738816737, "grad_norm": 0.4033859968185425, "learning_rate": 1.36014350480981e-06, "loss": 0.0826, "step": 6670 }, { "epoch": 1.6751364577451535, "grad_norm": 0.29467886686325073, "learning_rate": 1.3499268646920317e-06, "loss": 0.0746, "step": 6675 }, { "epoch": 1.6763912416086328, "grad_norm": 0.17417952418327332, "learning_rate": 1.339745962155613e-06, "loss": 0.0739, "step": 6680 }, { "epoch": 1.6776460254721124, "grad_norm": 0.30542805790901184, "learning_rate": 1.329600839262728e-06, "loss": 0.0873, "step": 6685 }, { "epoch": 1.678900809335592, "grad_norm": 0.48298323154449463, "learning_rate": 1.3194915379277195e-06, "loss": 0.0866, "step": 6690 }, { "epoch": 1.6801555931990715, "grad_norm": 0.2085920125246048, "learning_rate": 1.3094180999169348e-06, "loss": 0.0801, "step": 6695 }, { "epoch": 1.681410377062551, "grad_norm": 0.22730374336242676, "learning_rate": 1.299380566848557e-06, "loss": 0.0725, "step": 6700 }, { "epoch": 1.6826651609260304, "grad_norm": 0.2080390900373459, "learning_rate": 1.2893789801924328e-06, "loss": 0.0912, "step": 6705 }, { "epoch": 1.6839199447895101, "grad_norm": 0.2778363525867462, "learning_rate": 1.2794133812698794e-06, "loss": 0.0804, "step": 6710 }, { "epoch": 1.6851747286529895, "grad_norm": 0.11341708898544312, "learning_rate": 1.269483811253549e-06, "loss": 0.0884, "step": 6715 }, { "epoch": 1.686429512516469, "grad_norm": 0.11615065485239029, "learning_rate": 1.259590311167238e-06, "loss": 0.0817, "step": 6720 }, { "epoch": 1.6876842963799485, "grad_norm": 0.13391433656215668, "learning_rate": 1.2497329218857135e-06, "loss": 0.0831, "step": 6725 }, { "epoch": 1.688939080243428, "grad_norm": 0.36114147305488586, "learning_rate": 1.2399116841345605e-06, "loss": 0.0841, "step": 6730 }, { "epoch": 1.6901938641069076, "grad_norm": 0.13782304525375366, "learning_rate": 1.230126638489998e-06, "loss": 0.0861, "step": 6735 }, { "epoch": 1.691448647970387, "grad_norm": 0.7375853061676025, "learning_rate": 1.2203778253787191e-06, "loss": 0.0926, "step": 6740 }, { "epoch": 1.6927034318338667, "grad_norm": 0.15916559100151062, "learning_rate": 1.2106652850777257e-06, "loss": 0.0725, "step": 6745 }, { "epoch": 1.693958215697346, "grad_norm": 0.38723546266555786, "learning_rate": 1.2009890577141625e-06, "loss": 0.082, "step": 6750 }, { "epoch": 1.6952129995608256, "grad_norm": 0.2894856035709381, "learning_rate": 1.1913491832651359e-06, "loss": 0.0908, "step": 6755 }, { "epoch": 1.6964677834243052, "grad_norm": 0.10918894410133362, "learning_rate": 1.181745701557574e-06, "loss": 0.079, "step": 6760 }, { "epoch": 1.6977225672877847, "grad_norm": 0.27660104632377625, "learning_rate": 1.1721786522680445e-06, "loss": 0.0791, "step": 6765 }, { "epoch": 1.6989773511512642, "grad_norm": 0.13653279840946198, "learning_rate": 1.1626480749225932e-06, "loss": 0.0804, "step": 6770 }, { "epoch": 1.7002321350147436, "grad_norm": 0.29900994896888733, "learning_rate": 1.1531540088965842e-06, "loss": 0.0865, "step": 6775 }, { "epoch": 1.7014869188782233, "grad_norm": 0.10758146643638611, "learning_rate": 1.143696493414539e-06, "loss": 0.0898, "step": 6780 }, { "epoch": 1.7027417027417027, "grad_norm": 0.40997231006622314, "learning_rate": 1.134275567549965e-06, "loss": 0.0932, "step": 6785 }, { "epoch": 1.7039964866051822, "grad_norm": 0.1141999289393425, "learning_rate": 1.124891270225208e-06, "loss": 0.0903, "step": 6790 }, { "epoch": 1.7052512704686618, "grad_norm": 0.6266759037971497, "learning_rate": 1.1155436402112785e-06, "loss": 0.0825, "step": 6795 }, { "epoch": 1.7065060543321413, "grad_norm": 0.16094879806041718, "learning_rate": 1.1062327161276965e-06, "loss": 0.0833, "step": 6800 }, { "epoch": 1.7077608381956209, "grad_norm": 0.2038365751504898, "learning_rate": 1.0969585364423352e-06, "loss": 0.091, "step": 6805 }, { "epoch": 1.7090156220591002, "grad_norm": 0.1327473223209381, "learning_rate": 1.0877211394712617e-06, "loss": 0.0799, "step": 6810 }, { "epoch": 1.71027040592258, "grad_norm": 0.3634199798107147, "learning_rate": 1.0785205633785666e-06, "loss": 0.0812, "step": 6815 }, { "epoch": 1.7115251897860593, "grad_norm": 0.27833735942840576, "learning_rate": 1.0693568461762238e-06, "loss": 0.0825, "step": 6820 }, { "epoch": 1.7127799736495388, "grad_norm": 0.3433382511138916, "learning_rate": 1.0602300257239262e-06, "loss": 0.0731, "step": 6825 }, { "epoch": 1.7140347575130184, "grad_norm": 0.5607882738113403, "learning_rate": 1.0511401397289233e-06, "loss": 0.0769, "step": 6830 }, { "epoch": 1.715289541376498, "grad_norm": 0.3127841353416443, "learning_rate": 1.0420872257458725e-06, "loss": 0.0901, "step": 6835 }, { "epoch": 1.7165443252399775, "grad_norm": 0.10875175893306732, "learning_rate": 1.0330713211766864e-06, "loss": 0.0897, "step": 6840 }, { "epoch": 1.7177991091034568, "grad_norm": 0.5205060839653015, "learning_rate": 1.0240924632703676e-06, "loss": 0.0774, "step": 6845 }, { "epoch": 1.7190538929669366, "grad_norm": 0.15961715579032898, "learning_rate": 1.0151506891228636e-06, "loss": 0.0765, "step": 6850 }, { "epoch": 1.7203086768304159, "grad_norm": 0.6552174687385559, "learning_rate": 1.0062460356769189e-06, "loss": 0.0854, "step": 6855 }, { "epoch": 1.7215634606938954, "grad_norm": 0.10555896908044815, "learning_rate": 9.973785397218982e-07, "loss": 0.0886, "step": 6860 }, { "epoch": 1.722818244557375, "grad_norm": 0.21054576337337494, "learning_rate": 9.88548237893664e-07, "loss": 0.0841, "step": 6865 }, { "epoch": 1.7240730284208545, "grad_norm": 0.10787925869226456, "learning_rate": 9.79755166674411e-07, "loss": 0.076, "step": 6870 }, { "epoch": 1.725327812284334, "grad_norm": 0.24701017141342163, "learning_rate": 9.709993623925118e-07, "loss": 0.0804, "step": 6875 }, { "epoch": 1.7265825961478134, "grad_norm": 0.2187194675207138, "learning_rate": 9.622808612223722e-07, "loss": 0.0794, "step": 6880 }, { "epoch": 1.7278373800112932, "grad_norm": 0.16389504075050354, "learning_rate": 9.535996991842855e-07, "loss": 0.09, "step": 6885 }, { "epoch": 1.7290921638747725, "grad_norm": 0.33097338676452637, "learning_rate": 9.449559121442731e-07, "loss": 0.0903, "step": 6890 }, { "epoch": 1.730346947738252, "grad_norm": 0.1447478085756302, "learning_rate": 9.363495358139485e-07, "loss": 0.0778, "step": 6895 }, { "epoch": 1.7316017316017316, "grad_norm": 0.24518853425979614, "learning_rate": 9.277806057503592e-07, "loss": 0.0972, "step": 6900 }, { "epoch": 1.7328565154652111, "grad_norm": 0.09704507142305374, "learning_rate": 9.192491573558438e-07, "loss": 0.0884, "step": 6905 }, { "epoch": 1.7341112993286907, "grad_norm": 0.1697137951850891, "learning_rate": 9.107552258778907e-07, "loss": 0.0799, "step": 6910 }, { "epoch": 1.73536608319217, "grad_norm": 0.11926918476819992, "learning_rate": 9.022988464089888e-07, "loss": 0.0726, "step": 6915 }, { "epoch": 1.7366208670556498, "grad_norm": 0.16987641155719757, "learning_rate": 8.9388005388647e-07, "loss": 0.0835, "step": 6920 }, { "epoch": 1.737875650919129, "grad_norm": 0.24412445724010468, "learning_rate": 8.854988830923905e-07, "loss": 0.0727, "step": 6925 }, { "epoch": 1.7391304347826086, "grad_norm": 0.33428534865379333, "learning_rate": 8.771553686533684e-07, "loss": 0.0844, "step": 6930 }, { "epoch": 1.7403852186460882, "grad_norm": 0.4698134958744049, "learning_rate": 8.688495450404444e-07, "loss": 0.0815, "step": 6935 }, { "epoch": 1.7416400025095677, "grad_norm": 0.3183957040309906, "learning_rate": 8.605814465689366e-07, "loss": 0.0779, "step": 6940 }, { "epoch": 1.7428947863730473, "grad_norm": 0.1109415665268898, "learning_rate": 8.523511073983127e-07, "loss": 0.0782, "step": 6945 }, { "epoch": 1.7441495702365266, "grad_norm": 0.14376573264598846, "learning_rate": 8.441585615320269e-07, "loss": 0.0865, "step": 6950 }, { "epoch": 1.7454043541000064, "grad_norm": 0.19789855182170868, "learning_rate": 8.360038428174022e-07, "loss": 0.0932, "step": 6955 }, { "epoch": 1.7466591379634857, "grad_norm": 0.2379181683063507, "learning_rate": 8.278869849454718e-07, "loss": 0.0809, "step": 6960 }, { "epoch": 1.7479139218269653, "grad_norm": 0.3969587981700897, "learning_rate": 8.198080214508486e-07, "loss": 0.0843, "step": 6965 }, { "epoch": 1.7491687056904448, "grad_norm": 0.23574315011501312, "learning_rate": 8.117669857115895e-07, "loss": 0.0853, "step": 6970 }, { "epoch": 1.7504234895539243, "grad_norm": 0.3272697925567627, "learning_rate": 8.037639109490524e-07, "loss": 0.0818, "step": 6975 }, { "epoch": 1.751678273417404, "grad_norm": 0.2460939586162567, "learning_rate": 7.957988302277597e-07, "loss": 0.0811, "step": 6980 }, { "epoch": 1.7529330572808832, "grad_norm": 0.3471406102180481, "learning_rate": 7.87871776455259e-07, "loss": 0.0651, "step": 6985 }, { "epoch": 1.754187841144363, "grad_norm": 0.22290903329849243, "learning_rate": 7.799827823819972e-07, "loss": 0.0793, "step": 6990 }, { "epoch": 1.7554426250078423, "grad_norm": 0.1208643913269043, "learning_rate": 7.721318806011713e-07, "loss": 0.0765, "step": 6995 }, { "epoch": 1.7566974088713219, "grad_norm": 0.176291823387146, "learning_rate": 7.643191035486086e-07, "loss": 0.0892, "step": 7000 }, { "epoch": 1.7579521927348014, "grad_norm": 0.518677294254303, "learning_rate": 7.56544483502617e-07, "loss": 0.0762, "step": 7005 }, { "epoch": 1.759206976598281, "grad_norm": 0.24120575189590454, "learning_rate": 7.488080525838636e-07, "loss": 0.0979, "step": 7010 }, { "epoch": 1.7604617604617605, "grad_norm": 0.16398218274116516, "learning_rate": 7.411098427552377e-07, "loss": 0.0815, "step": 7015 }, { "epoch": 1.7617165443252398, "grad_norm": 0.15948547422885895, "learning_rate": 7.334498858217231e-07, "loss": 0.082, "step": 7020 }, { "epoch": 1.7629713281887196, "grad_norm": 0.163728266954422, "learning_rate": 7.258282134302519e-07, "loss": 0.0888, "step": 7025 }, { "epoch": 1.764226112052199, "grad_norm": 0.18428277969360352, "learning_rate": 7.182448570695944e-07, "loss": 0.0796, "step": 7030 }, { "epoch": 1.7654808959156785, "grad_norm": 0.2015131711959839, "learning_rate": 7.106998480702165e-07, "loss": 0.0741, "step": 7035 }, { "epoch": 1.766735679779158, "grad_norm": 0.24829888343811035, "learning_rate": 7.031932176041522e-07, "loss": 0.0785, "step": 7040 }, { "epoch": 1.7679904636426376, "grad_norm": 0.15142977237701416, "learning_rate": 6.957249966848711e-07, "loss": 0.0837, "step": 7045 }, { "epoch": 1.7692452475061171, "grad_norm": 0.25044482946395874, "learning_rate": 6.882952161671652e-07, "loss": 0.083, "step": 7050 }, { "epoch": 1.7705000313695964, "grad_norm": 0.27821311354637146, "learning_rate": 6.809039067469991e-07, "loss": 0.0851, "step": 7055 }, { "epoch": 1.7717548152330762, "grad_norm": 0.21314631402492523, "learning_rate": 6.735510989614047e-07, "loss": 0.0891, "step": 7060 }, { "epoch": 1.7730095990965555, "grad_norm": 0.24855484068393707, "learning_rate": 6.662368231883388e-07, "loss": 0.0828, "step": 7065 }, { "epoch": 1.774264382960035, "grad_norm": 0.09157968312501907, "learning_rate": 6.589611096465642e-07, "loss": 0.0827, "step": 7070 }, { "epoch": 1.7755191668235146, "grad_norm": 0.17919617891311646, "learning_rate": 6.517239883955295e-07, "loss": 0.082, "step": 7075 }, { "epoch": 1.7767739506869942, "grad_norm": 0.32990872859954834, "learning_rate": 6.445254893352381e-07, "loss": 0.0847, "step": 7080 }, { "epoch": 1.7780287345504737, "grad_norm": 0.20937122404575348, "learning_rate": 6.373656422061247e-07, "loss": 0.0729, "step": 7085 }, { "epoch": 1.779283518413953, "grad_norm": 0.34432610869407654, "learning_rate": 6.302444765889337e-07, "loss": 0.0836, "step": 7090 }, { "epoch": 1.7805383022774328, "grad_norm": 0.23893016576766968, "learning_rate": 6.23162021904603e-07, "loss": 0.085, "step": 7095 }, { "epoch": 1.7817930861409121, "grad_norm": 0.26055169105529785, "learning_rate": 6.161183074141319e-07, "loss": 0.0836, "step": 7100 }, { "epoch": 1.7830478700043917, "grad_norm": 0.3493628203868866, "learning_rate": 6.091133622184664e-07, "loss": 0.0879, "step": 7105 }, { "epoch": 1.7843026538678712, "grad_norm": 0.15807025134563446, "learning_rate": 6.021472152583818e-07, "loss": 0.0846, "step": 7110 }, { "epoch": 1.7855574377313508, "grad_norm": 0.19779103994369507, "learning_rate": 5.952198953143539e-07, "loss": 0.0914, "step": 7115 }, { "epoch": 1.7868122215948303, "grad_norm": 0.2763902246952057, "learning_rate": 5.883314310064492e-07, "loss": 0.0752, "step": 7120 }, { "epoch": 1.7880670054583097, "grad_norm": 0.22704683244228363, "learning_rate": 5.814818507942055e-07, "loss": 0.0785, "step": 7125 }, { "epoch": 1.7893217893217894, "grad_norm": 0.3757963478565216, "learning_rate": 5.746711829765017e-07, "loss": 0.0814, "step": 7130 }, { "epoch": 1.7905765731852687, "grad_norm": 0.1545080542564392, "learning_rate": 5.678994556914618e-07, "loss": 0.0743, "step": 7135 }, { "epoch": 1.7918313570487485, "grad_norm": 0.18506070971488953, "learning_rate": 5.611666969163243e-07, "loss": 0.0831, "step": 7140 }, { "epoch": 1.7930861409122278, "grad_norm": 0.17606933414936066, "learning_rate": 5.544729344673294e-07, "loss": 0.0753, "step": 7145 }, { "epoch": 1.7943409247757074, "grad_norm": 0.30049043893814087, "learning_rate": 5.47818195999602e-07, "loss": 0.0893, "step": 7150 }, { "epoch": 1.795595708639187, "grad_norm": 0.07360345870256424, "learning_rate": 5.412025090070483e-07, "loss": 0.0791, "step": 7155 }, { "epoch": 1.7968504925026663, "grad_norm": 0.21283096075057983, "learning_rate": 5.346259008222243e-07, "loss": 0.0813, "step": 7160 }, { "epoch": 1.798105276366146, "grad_norm": 0.38110825419425964, "learning_rate": 5.280883986162433e-07, "loss": 0.0791, "step": 7165 }, { "epoch": 1.7993600602296254, "grad_norm": 0.12914220988750458, "learning_rate": 5.215900293986431e-07, "loss": 0.0758, "step": 7170 }, { "epoch": 1.8006148440931051, "grad_norm": 0.5188528299331665, "learning_rate": 5.151308200172911e-07, "loss": 0.0823, "step": 7175 }, { "epoch": 1.8018696279565845, "grad_norm": 0.23793403804302216, "learning_rate": 5.087107971582628e-07, "loss": 0.0786, "step": 7180 }, { "epoch": 1.803124411820064, "grad_norm": 0.18523241579532623, "learning_rate": 5.02329987345741e-07, "loss": 0.0864, "step": 7185 }, { "epoch": 1.8043791956835435, "grad_norm": 0.18162751197814941, "learning_rate": 4.959884169418949e-07, "loss": 0.079, "step": 7190 }, { "epoch": 1.8056339795470229, "grad_norm": 0.15588027238845825, "learning_rate": 4.896861121467778e-07, "loss": 0.0791, "step": 7195 }, { "epoch": 1.8068887634105026, "grad_norm": 0.16926927864551544, "learning_rate": 4.834230989982214e-07, "loss": 0.0762, "step": 7200 }, { "epoch": 1.808143547273982, "grad_norm": 0.29264163970947266, "learning_rate": 4.77199403371722e-07, "loss": 0.0805, "step": 7205 }, { "epoch": 1.8093983311374617, "grad_norm": 0.15263986587524414, "learning_rate": 4.7101505098033575e-07, "loss": 0.0893, "step": 7210 }, { "epoch": 1.810653115000941, "grad_norm": 0.12115020304918289, "learning_rate": 4.6487006737457765e-07, "loss": 0.0773, "step": 7215 }, { "epoch": 1.8119078988644206, "grad_norm": 0.08451762050390244, "learning_rate": 4.5876447794230504e-07, "loss": 0.0772, "step": 7220 }, { "epoch": 1.8131626827279002, "grad_norm": 0.2606160640716553, "learning_rate": 4.5269830790862444e-07, "loss": 0.0787, "step": 7225 }, { "epoch": 1.8144174665913795, "grad_norm": 0.18426910042762756, "learning_rate": 4.4667158233577925e-07, "loss": 0.0788, "step": 7230 }, { "epoch": 1.8156722504548592, "grad_norm": 0.1966276913881302, "learning_rate": 4.40684326123052e-07, "loss": 0.0955, "step": 7235 }, { "epoch": 1.8169270343183386, "grad_norm": 0.19326919317245483, "learning_rate": 4.3473656400665256e-07, "loss": 0.0984, "step": 7240 }, { "epoch": 1.8181818181818183, "grad_norm": 0.2253500372171402, "learning_rate": 4.2882832055962885e-07, "loss": 0.0747, "step": 7245 }, { "epoch": 1.8194366020452977, "grad_norm": 0.28359681367874146, "learning_rate": 4.22959620191753e-07, "loss": 0.0751, "step": 7250 }, { "epoch": 1.8206913859087772, "grad_norm": 0.12297849357128143, "learning_rate": 4.171304871494264e-07, "loss": 0.0708, "step": 7255 }, { "epoch": 1.8219461697722568, "grad_norm": 0.049460649490356445, "learning_rate": 4.113409455155837e-07, "loss": 0.0758, "step": 7260 }, { "epoch": 1.823200953635736, "grad_norm": 0.1316637098789215, "learning_rate": 4.0559101920958243e-07, "loss": 0.075, "step": 7265 }, { "epoch": 1.8244557374992159, "grad_norm": 0.11034820973873138, "learning_rate": 3.9988073198711564e-07, "loss": 0.0816, "step": 7270 }, { "epoch": 1.8257105213626952, "grad_norm": 0.5154989957809448, "learning_rate": 3.942101074401028e-07, "loss": 0.0787, "step": 7275 }, { "epoch": 1.826965305226175, "grad_norm": 0.4702956974506378, "learning_rate": 3.885791689966023e-07, "loss": 0.0862, "step": 7280 }, { "epoch": 1.8282200890896543, "grad_norm": 0.41101551055908203, "learning_rate": 3.8298793992070814e-07, "loss": 0.0888, "step": 7285 }, { "epoch": 1.8294748729531338, "grad_norm": 0.16913031041622162, "learning_rate": 3.774364433124578e-07, "loss": 0.0761, "step": 7290 }, { "epoch": 1.8307296568166134, "grad_norm": 0.1570875644683838, "learning_rate": 3.7192470210773435e-07, "loss": 0.0713, "step": 7295 }, { "epoch": 1.8319844406800927, "grad_norm": 0.45383045077323914, "learning_rate": 3.6645273907816805e-07, "loss": 0.0774, "step": 7300 }, { "epoch": 1.8332392245435725, "grad_norm": 0.12889161705970764, "learning_rate": 3.6102057683105596e-07, "loss": 0.0824, "step": 7305 }, { "epoch": 1.8344940084070518, "grad_norm": 0.34759390354156494, "learning_rate": 3.5562823780924906e-07, "loss": 0.0912, "step": 7310 }, { "epoch": 1.8357487922705316, "grad_norm": 0.35057494044303894, "learning_rate": 3.5027574429107536e-07, "loss": 0.0732, "step": 7315 }, { "epoch": 1.8370035761340109, "grad_norm": 0.1586838960647583, "learning_rate": 3.4496311839024133e-07, "loss": 0.0896, "step": 7320 }, { "epoch": 1.8382583599974904, "grad_norm": 0.13935531675815582, "learning_rate": 3.396903820557385e-07, "loss": 0.0796, "step": 7325 }, { "epoch": 1.83951314386097, "grad_norm": 0.20520001649856567, "learning_rate": 3.344575570717612e-07, "loss": 0.0839, "step": 7330 }, { "epoch": 1.8407679277244493, "grad_norm": 0.13597147166728973, "learning_rate": 3.292646650576037e-07, "loss": 0.0852, "step": 7335 }, { "epoch": 1.842022711587929, "grad_norm": 0.1146201342344284, "learning_rate": 3.2411172746758424e-07, "loss": 0.0845, "step": 7340 }, { "epoch": 1.8432774954514084, "grad_norm": 0.24954962730407715, "learning_rate": 3.1899876559094657e-07, "loss": 0.0705, "step": 7345 }, { "epoch": 1.8445322793148882, "grad_norm": 0.43199557065963745, "learning_rate": 3.1392580055177867e-07, "loss": 0.0894, "step": 7350 }, { "epoch": 1.8457870631783675, "grad_norm": 0.3313222825527191, "learning_rate": 3.0889285330891973e-07, "loss": 0.0888, "step": 7355 }, { "epoch": 1.847041847041847, "grad_norm": 0.08865600824356079, "learning_rate": 3.038999446558755e-07, "loss": 0.086, "step": 7360 }, { "epoch": 1.8482966309053266, "grad_norm": 0.21259349584579468, "learning_rate": 2.989470952207385e-07, "loss": 0.0911, "step": 7365 }, { "epoch": 1.849551414768806, "grad_norm": 0.28370821475982666, "learning_rate": 2.940343254660905e-07, "loss": 0.0883, "step": 7370 }, { "epoch": 1.8508061986322857, "grad_norm": 0.1723158210515976, "learning_rate": 2.891616556889321e-07, "loss": 0.0745, "step": 7375 }, { "epoch": 1.852060982495765, "grad_norm": 0.2103600651025772, "learning_rate": 2.843291060205855e-07, "loss": 0.0821, "step": 7380 }, { "epoch": 1.8533157663592448, "grad_norm": 0.19018976390361786, "learning_rate": 2.7953669642662107e-07, "loss": 0.0885, "step": 7385 }, { "epoch": 1.854570550222724, "grad_norm": 0.38891276717185974, "learning_rate": 2.747844467067706e-07, "loss": 0.0832, "step": 7390 }, { "epoch": 1.8558253340862036, "grad_norm": 0.17385374009609222, "learning_rate": 2.7007237649484763e-07, "loss": 0.0788, "step": 7395 }, { "epoch": 1.8570801179496832, "grad_norm": 0.453713059425354, "learning_rate": 2.654005052586628e-07, "loss": 0.0777, "step": 7400 }, { "epoch": 1.8583349018131625, "grad_norm": 0.20061570405960083, "learning_rate": 2.607688522999441e-07, "loss": 0.0742, "step": 7405 }, { "epoch": 1.8595896856766423, "grad_norm": 0.1047121062874794, "learning_rate": 2.5617743675426354e-07, "loss": 0.0917, "step": 7410 }, { "epoch": 1.8608444695401216, "grad_norm": 0.24845577776432037, "learning_rate": 2.516262775909506e-07, "loss": 0.0806, "step": 7415 }, { "epoch": 1.8620992534036014, "grad_norm": 0.3548727035522461, "learning_rate": 2.471153936130133e-07, "loss": 0.0913, "step": 7420 }, { "epoch": 1.8633540372670807, "grad_norm": 0.2546403706073761, "learning_rate": 2.4264480345707053e-07, "loss": 0.0855, "step": 7425 }, { "epoch": 1.8646088211305603, "grad_norm": 0.2284100353717804, "learning_rate": 2.3821452559326218e-07, "loss": 0.0849, "step": 7430 }, { "epoch": 1.8658636049940398, "grad_norm": 0.25266894698143005, "learning_rate": 2.3382457832518134e-07, "loss": 0.0778, "step": 7435 }, { "epoch": 1.8671183888575191, "grad_norm": 0.19911006093025208, "learning_rate": 2.294749797897955e-07, "loss": 0.0781, "step": 7440 }, { "epoch": 1.868373172720999, "grad_norm": 0.35023069381713867, "learning_rate": 2.2516574795737323e-07, "loss": 0.0787, "step": 7445 }, { "epoch": 1.8696279565844782, "grad_norm": 0.19626636803150177, "learning_rate": 2.2089690063140766e-07, "loss": 0.0833, "step": 7450 }, { "epoch": 1.870882740447958, "grad_norm": 0.1876339167356491, "learning_rate": 2.1666845544854542e-07, "loss": 0.084, "step": 7455 }, { "epoch": 1.8721375243114373, "grad_norm": 0.09253226220607758, "learning_rate": 2.1248042987851325e-07, "loss": 0.0875, "step": 7460 }, { "epoch": 1.8733923081749169, "grad_norm": 0.31013625860214233, "learning_rate": 2.083328412240404e-07, "loss": 0.083, "step": 7465 }, { "epoch": 1.8746470920383964, "grad_norm": 0.17299607396125793, "learning_rate": 2.0422570662079866e-07, "loss": 0.0887, "step": 7470 }, { "epoch": 1.8759018759018757, "grad_norm": 0.1518164724111557, "learning_rate": 2.0015904303732126e-07, "loss": 0.0711, "step": 7475 }, { "epoch": 1.8771566597653555, "grad_norm": 0.16225028038024902, "learning_rate": 1.961328672749352e-07, "loss": 0.0743, "step": 7480 }, { "epoch": 1.8784114436288348, "grad_norm": 0.39483585953712463, "learning_rate": 1.921471959676957e-07, "loss": 0.0856, "step": 7485 }, { "epoch": 1.8796662274923146, "grad_norm": 0.43932509422302246, "learning_rate": 1.8820204558231415e-07, "loss": 0.0813, "step": 7490 }, { "epoch": 1.880921011355794, "grad_norm": 0.29697397351264954, "learning_rate": 1.8429743241808795e-07, "loss": 0.079, "step": 7495 }, { "epoch": 1.8821757952192735, "grad_norm": 0.1230466440320015, "learning_rate": 1.804333726068408e-07, "loss": 0.0918, "step": 7500 }, { "epoch": 1.883430579082753, "grad_norm": 0.18823686242103577, "learning_rate": 1.766098821128459e-07, "loss": 0.0755, "step": 7505 }, { "epoch": 1.8846853629462323, "grad_norm": 0.3333011865615845, "learning_rate": 1.7282697673276837e-07, "loss": 0.0908, "step": 7510 }, { "epoch": 1.8859401468097121, "grad_norm": 0.15476582944393158, "learning_rate": 1.6908467209559853e-07, "loss": 0.0869, "step": 7515 }, { "epoch": 1.8871949306731914, "grad_norm": 0.20987029373645782, "learning_rate": 1.6538298366257975e-07, "loss": 0.074, "step": 7520 }, { "epoch": 1.8884497145366712, "grad_norm": 0.13596096634864807, "learning_rate": 1.6172192672715525e-07, "loss": 0.0886, "step": 7525 }, { "epoch": 1.8897044984001505, "grad_norm": 0.22909042239189148, "learning_rate": 1.5810151641489912e-07, "loss": 0.0846, "step": 7530 }, { "epoch": 1.89095928226363, "grad_norm": 0.24221235513687134, "learning_rate": 1.545217676834554e-07, "loss": 0.0815, "step": 7535 }, { "epoch": 1.8922140661271096, "grad_norm": 0.14234666526317596, "learning_rate": 1.5098269532247357e-07, "loss": 0.0863, "step": 7540 }, { "epoch": 1.8934688499905892, "grad_norm": 0.24849990010261536, "learning_rate": 1.4748431395355088e-07, "loss": 0.0819, "step": 7545 }, { "epoch": 1.8947236338540687, "grad_norm": 0.2965373992919922, "learning_rate": 1.4402663803017249e-07, "loss": 0.0795, "step": 7550 }, { "epoch": 1.895978417717548, "grad_norm": 0.3867776691913605, "learning_rate": 1.4060968183764678e-07, "loss": 0.0904, "step": 7555 }, { "epoch": 1.8972332015810278, "grad_norm": 0.31307804584503174, "learning_rate": 1.3723345949305245e-07, "loss": 0.0798, "step": 7560 }, { "epoch": 1.8984879854445071, "grad_norm": 0.3106546700000763, "learning_rate": 1.338979849451738e-07, "loss": 0.0864, "step": 7565 }, { "epoch": 1.8997427693079867, "grad_norm": 0.22093643248081207, "learning_rate": 1.3060327197444767e-07, "loss": 0.0776, "step": 7570 }, { "epoch": 1.9009975531714662, "grad_norm": 0.1310010552406311, "learning_rate": 1.2734933419290996e-07, "loss": 0.077, "step": 7575 }, { "epoch": 1.9022523370349458, "grad_norm": 0.23409627377986908, "learning_rate": 1.2413618504412806e-07, "loss": 0.0777, "step": 7580 }, { "epoch": 1.9035071208984253, "grad_norm": 0.2928149402141571, "learning_rate": 1.2096383780315411e-07, "loss": 0.0899, "step": 7585 }, { "epoch": 1.9047619047619047, "grad_norm": 0.26420190930366516, "learning_rate": 1.1783230557647075e-07, "loss": 0.0875, "step": 7590 }, { "epoch": 1.9060166886253844, "grad_norm": 0.17748254537582397, "learning_rate": 1.1474160130193313e-07, "loss": 0.0878, "step": 7595 }, { "epoch": 1.9072714724888638, "grad_norm": 0.41803818941116333, "learning_rate": 1.1169173774871478e-07, "loss": 0.0881, "step": 7600 }, { "epoch": 1.9085262563523433, "grad_norm": 0.22346888482570648, "learning_rate": 1.086827275172575e-07, "loss": 0.0794, "step": 7605 }, { "epoch": 1.9097810402158228, "grad_norm": 0.1962645947933197, "learning_rate": 1.0571458303922033e-07, "loss": 0.0811, "step": 7610 }, { "epoch": 1.9110358240793024, "grad_norm": 0.3406698703765869, "learning_rate": 1.0278731657742292e-07, "loss": 0.0816, "step": 7615 }, { "epoch": 1.912290607942782, "grad_norm": 0.14449894428253174, "learning_rate": 9.990094022580332e-08, "loss": 0.0856, "step": 7620 }, { "epoch": 1.9135453918062613, "grad_norm": 0.1097446084022522, "learning_rate": 9.70554659093581e-08, "loss": 0.0911, "step": 7625 }, { "epoch": 1.914800175669741, "grad_norm": 0.43302279710769653, "learning_rate": 9.425090538409898e-08, "loss": 0.0877, "step": 7630 }, { "epoch": 1.9160549595332204, "grad_norm": 0.26026082038879395, "learning_rate": 9.148727023700731e-08, "loss": 0.0803, "step": 7635 }, { "epoch": 1.9173097433967, "grad_norm": 0.2453165352344513, "learning_rate": 8.876457188597642e-08, "loss": 0.0843, "step": 7640 }, { "epoch": 1.9185645272601795, "grad_norm": 0.4708387851715088, "learning_rate": 8.608282157977488e-08, "loss": 0.0788, "step": 7645 }, { "epoch": 1.919819311123659, "grad_norm": 0.12131650745868683, "learning_rate": 8.344203039799214e-08, "loss": 0.0806, "step": 7650 }, { "epoch": 1.9210740949871385, "grad_norm": 0.21256065368652344, "learning_rate": 8.084220925099751e-08, "loss": 0.087, "step": 7655 }, { "epoch": 1.9223288788506179, "grad_norm": 0.2987827658653259, "learning_rate": 7.82833688798934e-08, "loss": 0.0807, "step": 7660 }, { "epoch": 1.9235836627140976, "grad_norm": 0.12467091530561447, "learning_rate": 7.576551985647107e-08, "loss": 0.0995, "step": 7665 }, { "epoch": 1.924838446577577, "grad_norm": 0.15592101216316223, "learning_rate": 7.328867258316608e-08, "loss": 0.0881, "step": 7670 }, { "epoch": 1.9260932304410565, "grad_norm": 0.1511273980140686, "learning_rate": 7.085283729301728e-08, "loss": 0.0847, "step": 7675 }, { "epoch": 1.927348014304536, "grad_norm": 0.13429687917232513, "learning_rate": 6.845802404962243e-08, "loss": 0.0766, "step": 7680 }, { "epoch": 1.9286027981680156, "grad_norm": 0.10453824698925018, "learning_rate": 6.610424274710037e-08, "loss": 0.077, "step": 7685 }, { "epoch": 1.9298575820314952, "grad_norm": 0.18403221666812897, "learning_rate": 6.379150311004224e-08, "loss": 0.0802, "step": 7690 }, { "epoch": 1.9311123658949745, "grad_norm": 0.09399021416902542, "learning_rate": 6.151981469348034e-08, "loss": 0.0947, "step": 7695 }, { "epoch": 1.9323671497584543, "grad_norm": 0.19107823073863983, "learning_rate": 5.928918688284602e-08, "loss": 0.0799, "step": 7700 }, { "epoch": 1.9336219336219336, "grad_norm": 0.18196965754032135, "learning_rate": 5.709962889392628e-08, "loss": 0.0696, "step": 7705 }, { "epoch": 1.9348767174854131, "grad_norm": 0.32829707860946655, "learning_rate": 5.495114977282945e-08, "loss": 0.0825, "step": 7710 }, { "epoch": 1.9361315013488927, "grad_norm": 0.21754246950149536, "learning_rate": 5.284375839594958e-08, "loss": 0.0936, "step": 7715 }, { "epoch": 1.9373862852123722, "grad_norm": 0.13130666315555573, "learning_rate": 5.0777463469925406e-08, "loss": 0.089, "step": 7720 }, { "epoch": 1.9386410690758518, "grad_norm": 0.1840493381023407, "learning_rate": 4.8752273531609276e-08, "loss": 0.0847, "step": 7725 }, { "epoch": 1.939895852939331, "grad_norm": 0.12967444956302643, "learning_rate": 4.676819694802604e-08, "loss": 0.0782, "step": 7730 }, { "epoch": 1.9411506368028109, "grad_norm": 0.1979934573173523, "learning_rate": 4.4825241916344184e-08, "loss": 0.0797, "step": 7735 }, { "epoch": 1.9424054206662902, "grad_norm": 0.12666776776313782, "learning_rate": 4.292341646383813e-08, "loss": 0.0794, "step": 7740 }, { "epoch": 1.9436602045297697, "grad_norm": 0.11725469678640366, "learning_rate": 4.106272844785486e-08, "loss": 0.0777, "step": 7745 }, { "epoch": 1.9449149883932493, "grad_norm": 0.4441820979118347, "learning_rate": 3.924318555578843e-08, "loss": 0.0829, "step": 7750 }, { "epoch": 1.9461697722567288, "grad_norm": 0.26033830642700195, "learning_rate": 3.7464795305036664e-08, "loss": 0.0806, "step": 7755 }, { "epoch": 1.9474245561202084, "grad_norm": 0.19797271490097046, "learning_rate": 3.572756504297892e-08, "loss": 0.0681, "step": 7760 }, { "epoch": 1.9486793399836877, "grad_norm": 0.15105128288269043, "learning_rate": 3.4031501946942826e-08, "loss": 0.0868, "step": 7765 }, { "epoch": 1.9499341238471675, "grad_norm": 0.2811802625656128, "learning_rate": 3.2376613024175384e-08, "loss": 0.0895, "step": 7770 }, { "epoch": 1.9511889077106468, "grad_norm": 0.3569127917289734, "learning_rate": 3.0762905111811904e-08, "loss": 0.0927, "step": 7775 }, { "epoch": 1.9524436915741263, "grad_norm": 0.5091187953948975, "learning_rate": 2.9190384876849333e-08, "loss": 0.0864, "step": 7780 }, { "epoch": 1.9536984754376059, "grad_norm": 0.228666290640831, "learning_rate": 2.7659058816121855e-08, "loss": 0.0897, "step": 7785 }, { "epoch": 1.9549532593010854, "grad_norm": 0.07750795036554337, "learning_rate": 2.616893325626646e-08, "loss": 0.0733, "step": 7790 }, { "epoch": 1.956208043164565, "grad_norm": 0.2381664514541626, "learning_rate": 2.472001435370297e-08, "loss": 0.0729, "step": 7795 }, { "epoch": 1.9574628270280443, "grad_norm": 0.27068066596984863, "learning_rate": 2.3312308094607382e-08, "loss": 0.0957, "step": 7800 }, { "epoch": 1.958717610891524, "grad_norm": 0.37700343132019043, "learning_rate": 2.1945820294888564e-08, "loss": 0.0766, "step": 7805 }, { "epoch": 1.9599723947550034, "grad_norm": 0.32152286171913147, "learning_rate": 2.062055660015716e-08, "loss": 0.0764, "step": 7810 }, { "epoch": 1.961227178618483, "grad_norm": 0.164317324757576, "learning_rate": 1.9336522485710053e-08, "loss": 0.0825, "step": 7815 }, { "epoch": 1.9624819624819625, "grad_norm": 0.1482664942741394, "learning_rate": 1.8093723256507044e-08, "loss": 0.0761, "step": 7820 }, { "epoch": 1.963736746345442, "grad_norm": 0.15529005229473114, "learning_rate": 1.689216404714311e-08, "loss": 0.0879, "step": 7825 }, { "epoch": 1.9649915302089216, "grad_norm": 0.2088499665260315, "learning_rate": 1.5731849821833955e-08, "loss": 0.0852, "step": 7830 }, { "epoch": 1.966246314072401, "grad_norm": 0.2643318176269531, "learning_rate": 1.4612785374392701e-08, "loss": 0.0798, "step": 7835 }, { "epoch": 1.9675010979358807, "grad_norm": 0.16332080960273743, "learning_rate": 1.3534975328205468e-08, "loss": 0.0802, "step": 7840 }, { "epoch": 1.96875588179936, "grad_norm": 0.1658352017402649, "learning_rate": 1.2498424136223597e-08, "loss": 0.0821, "step": 7845 }, { "epoch": 1.9700106656628396, "grad_norm": 0.35251694917678833, "learning_rate": 1.1503136080932565e-08, "loss": 0.0822, "step": 7850 }, { "epoch": 1.971265449526319, "grad_norm": 0.1039208248257637, "learning_rate": 1.0549115274344213e-08, "loss": 0.0847, "step": 7855 }, { "epoch": 1.9725202333897986, "grad_norm": 0.3832782208919525, "learning_rate": 9.636365657971215e-09, "loss": 0.0732, "step": 7860 }, { "epoch": 1.9737750172532782, "grad_norm": 0.35436856746673584, "learning_rate": 8.764891002821519e-09, "loss": 0.0796, "step": 7865 }, { "epoch": 1.9750298011167575, "grad_norm": 0.3917001187801361, "learning_rate": 7.93469490936949e-09, "loss": 0.0747, "step": 7870 }, { "epoch": 1.9762845849802373, "grad_norm": 0.20635858178138733, "learning_rate": 7.145780807553681e-09, "loss": 0.0804, "step": 7875 }, { "epoch": 1.9775393688437166, "grad_norm": 0.12219113856554031, "learning_rate": 6.398151956754639e-09, "loss": 0.0854, "step": 7880 }, { "epoch": 1.9787941527071962, "grad_norm": 0.08329229801893234, "learning_rate": 5.69181144578268e-09, "loss": 0.0863, "step": 7885 }, { "epoch": 1.9800489365706757, "grad_norm": 0.2005588710308075, "learning_rate": 5.026762192870127e-09, "loss": 0.077, "step": 7890 }, { "epoch": 1.9813037204341553, "grad_norm": 0.1955815702676773, "learning_rate": 4.403006945650212e-09, "loss": 0.0843, "step": 7895 }, { "epoch": 1.9825585042976348, "grad_norm": 0.17901791632175446, "learning_rate": 3.820548281154857e-09, "loss": 0.0731, "step": 7900 }, { "epoch": 1.9838132881611141, "grad_norm": 0.1416298747062683, "learning_rate": 3.2793886057991277e-09, "loss": 0.0729, "step": 7905 }, { "epoch": 1.985068072024594, "grad_norm": 0.3370745778083801, "learning_rate": 2.7795301553712463e-09, "loss": 0.0794, "step": 7910 }, { "epoch": 1.9863228558880732, "grad_norm": 0.16576644778251648, "learning_rate": 2.3209749950259264e-09, "loss": 0.075, "step": 7915 }, { "epoch": 1.9875776397515528, "grad_norm": 0.2831018567085266, "learning_rate": 1.9037250192732728e-09, "loss": 0.0754, "step": 7920 }, { "epoch": 1.9888324236150323, "grad_norm": 0.1539755016565323, "learning_rate": 1.527781951971008e-09, "loss": 0.0859, "step": 7925 }, { "epoch": 1.9900872074785119, "grad_norm": 0.18794752657413483, "learning_rate": 1.1931473463200339e-09, "loss": 0.0924, "step": 7930 }, { "epoch": 1.9913419913419914, "grad_norm": 0.1152065321803093, "learning_rate": 8.998225848566577e-10, "loss": 0.0793, "step": 7935 }, { "epoch": 1.9925967752054707, "grad_norm": 0.36054694652557373, "learning_rate": 6.478088794448223e-10, "loss": 0.0732, "step": 7940 }, { "epoch": 1.9938515590689505, "grad_norm": 0.05416973680257797, "learning_rate": 4.3710727127277417e-10, "loss": 0.0812, "step": 7945 }, { "epoch": 1.9951063429324298, "grad_norm": 0.19745150208473206, "learning_rate": 2.677186308497337e-10, "loss": 0.0805, "step": 7950 }, { "epoch": 1.9963611267959094, "grad_norm": 0.11429372429847717, "learning_rate": 1.3964365800145374e-10, "loss": 0.0796, "step": 7955 }, { "epoch": 1.997615910659389, "grad_norm": 0.16175100207328796, "learning_rate": 5.288288186688917e-11, "loss": 0.0791, "step": 7960 }, { "epoch": 1.9988706945228685, "grad_norm": 0.3085331618785858, "learning_rate": 7.436660894866165e-12, "loss": 0.0807, "step": 7965 }, { "epoch": 1.9996235648409562, "step": 7968, "total_flos": 0.0, "train_loss": 0.0878100987938962, "train_runtime": 323838.4314, "train_samples_per_second": 1.575, "train_steps_per_second": 0.025 } ], "logging_steps": 5, "max_steps": 7968, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }