diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9996235648409562, + "eval_steps": 500, + "global_step": 7968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012547838634795157, + "grad_norm": 25.442365646362305, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.1369, + "step": 5 + }, + { + "epoch": 0.0025095677269590315, + "grad_norm": 2707.152587890625, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1494, + "step": 10 + }, + { + "epoch": 0.003764351590438547, + "grad_norm": 1.4790934324264526, + "learning_rate": 1.25e-06, + "loss": 0.1436, + "step": 15 + }, + { + "epoch": 0.005019135453918063, + "grad_norm": 6611.14990234375, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1239, + "step": 20 + }, + { + "epoch": 0.006273919317397578, + "grad_norm": 3.4831368923187256, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.1313, + "step": 25 + }, + { + "epoch": 0.007528703180877094, + "grad_norm": 0.8000121712684631, + "learning_rate": 2.5e-06, + "loss": 0.1203, + "step": 30 + }, + { + "epoch": 0.00878348704435661, + "grad_norm": 0.5967940092086792, + "learning_rate": 2.916666666666667e-06, + "loss": 0.1185, + "step": 35 + }, + { + "epoch": 0.010038270907836126, + "grad_norm": 3509.61083984375, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1243, + "step": 40 + }, + { + "epoch": 0.01129305477131564, + "grad_norm": 4.068423271179199, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.123, + "step": 45 + }, + { + "epoch": 0.012547838634795156, + "grad_norm": 981.9570922851562, + "learning_rate": 4.166666666666667e-06, + "loss": 0.1358, + "step": 50 + }, + { + "epoch": 0.013802622498274672, + "grad_norm": 565.902099609375, + "learning_rate": 4.583333333333333e-06, + "loss": 0.1181, + "step": 55 + }, + { + "epoch": 0.015057406361754188, + "grad_norm": 1.1762653589248657, + "learning_rate": 5e-06, + "loss": 0.1138, + "step": 60 + }, + { + "epoch": 0.016312190225233704, + "grad_norm": 13.798249244689941, + "learning_rate": 5.416666666666667e-06, + "loss": 0.1303, + "step": 65 + }, + { + "epoch": 0.01756697408871322, + "grad_norm": 1.9088438749313354, + "learning_rate": 5.833333333333334e-06, + "loss": 0.128, + "step": 70 + }, + { + "epoch": 0.018821757952192736, + "grad_norm": 379.896484375, + "learning_rate": 6.25e-06, + "loss": 0.1115, + "step": 75 + }, + { + "epoch": 0.020076541815672252, + "grad_norm": 53.034820556640625, + "learning_rate": 6.666666666666667e-06, + "loss": 0.1106, + "step": 80 + }, + { + "epoch": 0.021331325679151768, + "grad_norm": 0.5621482729911804, + "learning_rate": 7.083333333333335e-06, + "loss": 0.1339, + "step": 85 + }, + { + "epoch": 0.02258610954263128, + "grad_norm": 1503.7694091796875, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1091, + "step": 90 + }, + { + "epoch": 0.023840893406110796, + "grad_norm": 21.09101676940918, + "learning_rate": 7.916666666666667e-06, + "loss": 0.1027, + "step": 95 + }, + { + "epoch": 0.025095677269590312, + "grad_norm": 153.1422882080078, + "learning_rate": 8.333333333333334e-06, + "loss": 0.1158, + "step": 100 + }, + { + "epoch": 0.026350461133069828, + "grad_norm": 686.55615234375, + "learning_rate": 8.750000000000001e-06, + "loss": 0.1239, + "step": 105 + }, + { + "epoch": 0.027605244996549344, + "grad_norm": 456.04541015625, + "learning_rate": 9.166666666666666e-06, + "loss": 0.1176, + "step": 110 + }, + { + "epoch": 0.02886002886002886, + "grad_norm": 244.83534240722656, + "learning_rate": 9.583333333333335e-06, + "loss": 0.0952, + "step": 115 + }, + { + "epoch": 0.030114812723508376, + "grad_norm": 720.8134155273438, + "learning_rate": 1e-05, + "loss": 0.1181, + "step": 120 + }, + { + "epoch": 0.03136959658698789, + "grad_norm": 1.439946174621582, + "learning_rate": 1.0416666666666668e-05, + "loss": 0.1147, + "step": 125 + }, + { + "epoch": 0.03262438045046741, + "grad_norm": 24.099985122680664, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.1133, + "step": 130 + }, + { + "epoch": 0.03387916431394692, + "grad_norm": 10.10527515411377, + "learning_rate": 1.125e-05, + "loss": 0.1237, + "step": 135 + }, + { + "epoch": 0.03513394817742644, + "grad_norm": 171.32110595703125, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.3536, + "step": 140 + }, + { + "epoch": 0.03638873204090595, + "grad_norm": 129.24395751953125, + "learning_rate": 1.2083333333333333e-05, + "loss": 0.1079, + "step": 145 + }, + { + "epoch": 0.03764351590438547, + "grad_norm": 3.056920289993286, + "learning_rate": 1.25e-05, + "loss": 0.1223, + "step": 150 + }, + { + "epoch": 0.038898299767864984, + "grad_norm": 58.41600036621094, + "learning_rate": 1.2916666666666668e-05, + "loss": 0.094, + "step": 155 + }, + { + "epoch": 0.040153083631344504, + "grad_norm": 67.04778289794922, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.1181, + "step": 160 + }, + { + "epoch": 0.041407867494824016, + "grad_norm": 152.23841857910156, + "learning_rate": 1.375e-05, + "loss": 0.1122, + "step": 165 + }, + { + "epoch": 0.042662651358303535, + "grad_norm": 35.90134048461914, + "learning_rate": 1.416666666666667e-05, + "loss": 0.1152, + "step": 170 + }, + { + "epoch": 0.04391743522178305, + "grad_norm": 1.0190317630767822, + "learning_rate": 1.4583333333333333e-05, + "loss": 0.1138, + "step": 175 + }, + { + "epoch": 0.04517221908526256, + "grad_norm": 0.9965628385543823, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1127, + "step": 180 + }, + { + "epoch": 0.04642700294874208, + "grad_norm": 1.6608027219772339, + "learning_rate": 1.5416666666666668e-05, + "loss": 0.1114, + "step": 185 + }, + { + "epoch": 0.04768178681222159, + "grad_norm": 1.2562459707260132, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.1051, + "step": 190 + }, + { + "epoch": 0.04893657067570111, + "grad_norm": 0.13770118355751038, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.1049, + "step": 195 + }, + { + "epoch": 0.050191354539180624, + "grad_norm": 0.3370998799800873, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.1044, + "step": 200 + }, + { + "epoch": 0.051446138402660144, + "grad_norm": 0.5696057081222534, + "learning_rate": 1.7083333333333333e-05, + "loss": 0.1049, + "step": 205 + }, + { + "epoch": 0.052700922266139656, + "grad_norm": 0.24888598918914795, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.1065, + "step": 210 + }, + { + "epoch": 0.053955706129619176, + "grad_norm": 0.5320826172828674, + "learning_rate": 1.7916666666666667e-05, + "loss": 0.1044, + "step": 215 + }, + { + "epoch": 0.05521048999309869, + "grad_norm": 0.28355732560157776, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.1189, + "step": 220 + }, + { + "epoch": 0.05646527385657821, + "grad_norm": 0.20372584462165833, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.1038, + "step": 225 + }, + { + "epoch": 0.05772005772005772, + "grad_norm": 0.5983103513717651, + "learning_rate": 1.916666666666667e-05, + "loss": 0.0955, + "step": 230 + }, + { + "epoch": 0.05897484158353723, + "grad_norm": 0.36013373732566833, + "learning_rate": 1.9583333333333333e-05, + "loss": 0.1012, + "step": 235 + }, + { + "epoch": 0.06022962544701675, + "grad_norm": 0.20898281037807465, + "learning_rate": 2e-05, + "loss": 0.0993, + "step": 240 + }, + { + "epoch": 0.061484409310496264, + "grad_norm": 0.29076099395751953, + "learning_rate": 1.999997934261318e-05, + "loss": 0.106, + "step": 245 + }, + { + "epoch": 0.06273919317397578, + "grad_norm": 0.8422909379005432, + "learning_rate": 1.999991737053805e-05, + "loss": 0.0944, + "step": 250 + }, + { + "epoch": 0.0639939770374553, + "grad_norm": 0.165451779961586, + "learning_rate": 1.999981408403066e-05, + "loss": 0.1004, + "step": 255 + }, + { + "epoch": 0.06524876090093482, + "grad_norm": 0.14131979644298553, + "learning_rate": 1.9999669483517726e-05, + "loss": 0.0877, + "step": 260 + }, + { + "epoch": 0.06650354476441434, + "grad_norm": 0.2859463095664978, + "learning_rate": 1.9999483569596664e-05, + "loss": 0.0939, + "step": 265 + }, + { + "epoch": 0.06775832862789384, + "grad_norm": 0.26985302567481995, + "learning_rate": 1.9999256343035577e-05, + "loss": 0.1005, + "step": 270 + }, + { + "epoch": 0.06901311249137336, + "grad_norm": 0.16001251339912415, + "learning_rate": 1.9998987804773244e-05, + "loss": 0.0991, + "step": 275 + }, + { + "epoch": 0.07026789635485288, + "grad_norm": 0.28724098205566406, + "learning_rate": 1.9998677955919127e-05, + "loss": 0.1039, + "step": 280 + }, + { + "epoch": 0.0715226802183324, + "grad_norm": 0.10330358892679214, + "learning_rate": 1.9998326797753352e-05, + "loss": 0.0992, + "step": 285 + }, + { + "epoch": 0.0727774640818119, + "grad_norm": 0.630402147769928, + "learning_rate": 1.999793433172673e-05, + "loss": 0.1055, + "step": 290 + }, + { + "epoch": 0.07403224794529142, + "grad_norm": 0.3052836060523987, + "learning_rate": 1.9997500559460718e-05, + "loss": 0.0966, + "step": 295 + }, + { + "epoch": 0.07528703180877094, + "grad_norm": 0.3691750466823578, + "learning_rate": 1.999702548274744e-05, + "loss": 0.1047, + "step": 300 + }, + { + "epoch": 0.07654181567225045, + "grad_norm": 0.47557133436203003, + "learning_rate": 1.999650910354967e-05, + "loss": 0.1022, + "step": 305 + }, + { + "epoch": 0.07779659953572997, + "grad_norm": 0.21415193378925323, + "learning_rate": 1.999595142400081e-05, + "loss": 0.1075, + "step": 310 + }, + { + "epoch": 0.07905138339920949, + "grad_norm": 0.3545961081981659, + "learning_rate": 1.99953524464049e-05, + "loss": 0.096, + "step": 315 + }, + { + "epoch": 0.08030616726268901, + "grad_norm": 0.3949846029281616, + "learning_rate": 1.9994712173236604e-05, + "loss": 0.0974, + "step": 320 + }, + { + "epoch": 0.08156095112616851, + "grad_norm": 1.0949060916900635, + "learning_rate": 1.9994030607141196e-05, + "loss": 0.0973, + "step": 325 + }, + { + "epoch": 0.08281573498964803, + "grad_norm": 0.35101133584976196, + "learning_rate": 1.9993307750934555e-05, + "loss": 0.0965, + "step": 330 + }, + { + "epoch": 0.08407051885312755, + "grad_norm": 0.29526200890541077, + "learning_rate": 1.999254360760314e-05, + "loss": 0.0898, + "step": 335 + }, + { + "epoch": 0.08532530271660707, + "grad_norm": 0.09260907024145126, + "learning_rate": 1.999173818030399e-05, + "loss": 0.0948, + "step": 340 + }, + { + "epoch": 0.08658008658008658, + "grad_norm": 0.2705346345901489, + "learning_rate": 1.999089147236472e-05, + "loss": 0.0946, + "step": 345 + }, + { + "epoch": 0.0878348704435661, + "grad_norm": 0.3629266917705536, + "learning_rate": 1.999000348728347e-05, + "loss": 0.0999, + "step": 350 + }, + { + "epoch": 0.08908965430704562, + "grad_norm": 0.11858992278575897, + "learning_rate": 1.9989074228728942e-05, + "loss": 0.0931, + "step": 355 + }, + { + "epoch": 0.09034443817052512, + "grad_norm": 0.41660797595977783, + "learning_rate": 1.9988103700540345e-05, + "loss": 0.1047, + "step": 360 + }, + { + "epoch": 0.09159922203400464, + "grad_norm": 0.7686080932617188, + "learning_rate": 1.9987091906727387e-05, + "loss": 0.0968, + "step": 365 + }, + { + "epoch": 0.09285400589748416, + "grad_norm": 0.3915267884731293, + "learning_rate": 1.998603885147028e-05, + "loss": 0.0947, + "step": 370 + }, + { + "epoch": 0.09410878976096368, + "grad_norm": 0.23744699358940125, + "learning_rate": 1.998494453911969e-05, + "loss": 0.0915, + "step": 375 + }, + { + "epoch": 0.09536357362444318, + "grad_norm": 0.11716315150260925, + "learning_rate": 1.9983808974196752e-05, + "loss": 0.1005, + "step": 380 + }, + { + "epoch": 0.0966183574879227, + "grad_norm": 0.355939656496048, + "learning_rate": 1.9982632161393022e-05, + "loss": 0.0992, + "step": 385 + }, + { + "epoch": 0.09787314135140222, + "grad_norm": 0.23542310297489166, + "learning_rate": 1.9981414105570473e-05, + "loss": 0.0864, + "step": 390 + }, + { + "epoch": 0.09912792521488174, + "grad_norm": 0.5069738626480103, + "learning_rate": 1.9980154811761482e-05, + "loss": 0.0929, + "step": 395 + }, + { + "epoch": 0.10038270907836125, + "grad_norm": 0.5965291857719421, + "learning_rate": 1.9978854285168784e-05, + "loss": 0.0979, + "step": 400 + }, + { + "epoch": 0.10163749294184077, + "grad_norm": 0.557249128818512, + "learning_rate": 1.9977512531165484e-05, + "loss": 0.0994, + "step": 405 + }, + { + "epoch": 0.10289227680532029, + "grad_norm": 0.3753920793533325, + "learning_rate": 1.9976129555295003e-05, + "loss": 0.1009, + "step": 410 + }, + { + "epoch": 0.10414706066879979, + "grad_norm": 0.18703804910182953, + "learning_rate": 1.9974705363271076e-05, + "loss": 0.0938, + "step": 415 + }, + { + "epoch": 0.10540184453227931, + "grad_norm": 0.49501076340675354, + "learning_rate": 1.997323996097772e-05, + "loss": 0.083, + "step": 420 + }, + { + "epoch": 0.10665662839575883, + "grad_norm": 0.13180793821811676, + "learning_rate": 1.9971733354469215e-05, + "loss": 0.1087, + "step": 425 + }, + { + "epoch": 0.10791141225923835, + "grad_norm": 0.4710563123226166, + "learning_rate": 1.9970185549970066e-05, + "loss": 0.0977, + "step": 430 + }, + { + "epoch": 0.10916619612271786, + "grad_norm": 0.5056783556938171, + "learning_rate": 1.9968596553874993e-05, + "loss": 0.0947, + "step": 435 + }, + { + "epoch": 0.11042097998619738, + "grad_norm": 0.2544513940811157, + "learning_rate": 1.99669663727489e-05, + "loss": 0.0925, + "step": 440 + }, + { + "epoch": 0.1116757638496769, + "grad_norm": 0.512052059173584, + "learning_rate": 1.9965295013326843e-05, + "loss": 0.0984, + "step": 445 + }, + { + "epoch": 0.11293054771315642, + "grad_norm": 0.31233909726142883, + "learning_rate": 1.9963582482514003e-05, + "loss": 0.0903, + "step": 450 + }, + { + "epoch": 0.11418533157663592, + "grad_norm": 0.3753993511199951, + "learning_rate": 1.9961828787385662e-05, + "loss": 0.0985, + "step": 455 + }, + { + "epoch": 0.11544011544011544, + "grad_norm": 0.14997100830078125, + "learning_rate": 1.996003393518718e-05, + "loss": 0.0991, + "step": 460 + }, + { + "epoch": 0.11669489930359496, + "grad_norm": 0.1637381911277771, + "learning_rate": 1.995819793333394e-05, + "loss": 0.0833, + "step": 465 + }, + { + "epoch": 0.11794968316707446, + "grad_norm": 0.35544639825820923, + "learning_rate": 1.9956320789411338e-05, + "loss": 0.0901, + "step": 470 + }, + { + "epoch": 0.11920446703055398, + "grad_norm": 0.29124122858047485, + "learning_rate": 1.9954402511174763e-05, + "loss": 0.1043, + "step": 475 + }, + { + "epoch": 0.1204592508940335, + "grad_norm": 0.1517423689365387, + "learning_rate": 1.9952443106549535e-05, + "loss": 0.0937, + "step": 480 + }, + { + "epoch": 0.12171403475751302, + "grad_norm": 0.22879864275455475, + "learning_rate": 1.9950442583630884e-05, + "loss": 0.0983, + "step": 485 + }, + { + "epoch": 0.12296881862099253, + "grad_norm": 0.12301073968410492, + "learning_rate": 1.9948400950683932e-05, + "loss": 0.0937, + "step": 490 + }, + { + "epoch": 0.12422360248447205, + "grad_norm": 0.46629127860069275, + "learning_rate": 1.9946318216143633e-05, + "loss": 0.109, + "step": 495 + }, + { + "epoch": 0.12547838634795155, + "grad_norm": 0.34613123536109924, + "learning_rate": 1.9944194388614764e-05, + "loss": 0.0979, + "step": 500 + }, + { + "epoch": 0.12673317021143107, + "grad_norm": 0.3203139901161194, + "learning_rate": 1.9942029476871868e-05, + "loss": 0.105, + "step": 505 + }, + { + "epoch": 0.1279879540749106, + "grad_norm": 0.23387163877487183, + "learning_rate": 1.9939823489859226e-05, + "loss": 0.0918, + "step": 510 + }, + { + "epoch": 0.1292427379383901, + "grad_norm": 0.5645913481712341, + "learning_rate": 1.9937576436690822e-05, + "loss": 0.1008, + "step": 515 + }, + { + "epoch": 0.13049752180186963, + "grad_norm": 0.758552074432373, + "learning_rate": 1.9935288326650314e-05, + "loss": 0.1041, + "step": 520 + }, + { + "epoch": 0.13175230566534915, + "grad_norm": 0.40192508697509766, + "learning_rate": 1.993295916919097e-05, + "loss": 0.0961, + "step": 525 + }, + { + "epoch": 0.13300708952882867, + "grad_norm": 0.3283374309539795, + "learning_rate": 1.9930588973935653e-05, + "loss": 0.1099, + "step": 530 + }, + { + "epoch": 0.13426187339230816, + "grad_norm": 0.13031624257564545, + "learning_rate": 1.992817775067677e-05, + "loss": 0.0964, + "step": 535 + }, + { + "epoch": 0.13551665725578768, + "grad_norm": 0.40052691102027893, + "learning_rate": 1.9925725509376236e-05, + "loss": 0.1003, + "step": 540 + }, + { + "epoch": 0.1367714411192672, + "grad_norm": 0.20770736038684845, + "learning_rate": 1.992323226016543e-05, + "loss": 0.0977, + "step": 545 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 0.5854383111000061, + "learning_rate": 1.9920698013345162e-05, + "loss": 0.096, + "step": 550 + }, + { + "epoch": 0.13928100884622624, + "grad_norm": 0.33884522318840027, + "learning_rate": 1.99181227793856e-05, + "loss": 0.0981, + "step": 555 + }, + { + "epoch": 0.14053579270970576, + "grad_norm": 0.9304419159889221, + "learning_rate": 1.9915506568926283e-05, + "loss": 0.1069, + "step": 560 + }, + { + "epoch": 0.14179057657318528, + "grad_norm": 0.5028262138366699, + "learning_rate": 1.991284939277601e-05, + "loss": 0.1064, + "step": 565 + }, + { + "epoch": 0.1430453604366648, + "grad_norm": 0.30051878094673157, + "learning_rate": 1.991015126191285e-05, + "loss": 0.0977, + "step": 570 + }, + { + "epoch": 0.1443001443001443, + "grad_norm": 0.5658770799636841, + "learning_rate": 1.990741218748407e-05, + "loss": 0.1132, + "step": 575 + }, + { + "epoch": 0.1455549281636238, + "grad_norm": 0.1313030868768692, + "learning_rate": 1.9904632180806094e-05, + "loss": 0.0976, + "step": 580 + }, + { + "epoch": 0.14680971202710333, + "grad_norm": 0.3436416685581207, + "learning_rate": 1.9901811253364458e-05, + "loss": 0.103, + "step": 585 + }, + { + "epoch": 0.14806449589058285, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.9898949416813757e-05, + "loss": 0.0994, + "step": 590 + }, + { + "epoch": 0.14931927975406237, + "grad_norm": 0.35225412249565125, + "learning_rate": 1.9896046682977603e-05, + "loss": 0.0989, + "step": 595 + }, + { + "epoch": 0.1505740636175419, + "grad_norm": 0.19844387471675873, + "learning_rate": 1.989310306384858e-05, + "loss": 0.1032, + "step": 600 + }, + { + "epoch": 0.1518288474810214, + "grad_norm": 0.21333719789981842, + "learning_rate": 1.989011857158818e-05, + "loss": 0.1, + "step": 605 + }, + { + "epoch": 0.1530836313445009, + "grad_norm": 0.3265141546726227, + "learning_rate": 1.9887093218526768e-05, + "loss": 0.0935, + "step": 610 + }, + { + "epoch": 0.15433841520798042, + "grad_norm": 0.24291647970676422, + "learning_rate": 1.9884027017163515e-05, + "loss": 0.0923, + "step": 615 + }, + { + "epoch": 0.15559319907145994, + "grad_norm": 0.4325827658176422, + "learning_rate": 1.9880919980166374e-05, + "loss": 0.089, + "step": 620 + }, + { + "epoch": 0.15684798293493946, + "grad_norm": 0.1882610321044922, + "learning_rate": 1.9877772120371986e-05, + "loss": 0.0893, + "step": 625 + }, + { + "epoch": 0.15810276679841898, + "grad_norm": 0.4826973080635071, + "learning_rate": 1.987458345078567e-05, + "loss": 0.0983, + "step": 630 + }, + { + "epoch": 0.1593575506618985, + "grad_norm": 0.5353069305419922, + "learning_rate": 1.9871353984581342e-05, + "loss": 0.0937, + "step": 635 + }, + { + "epoch": 0.16061233452537801, + "grad_norm": 0.12057194858789444, + "learning_rate": 1.9868083735101464e-05, + "loss": 0.0999, + "step": 640 + }, + { + "epoch": 0.1618671183888575, + "grad_norm": 0.4315410256385803, + "learning_rate": 1.9864772715857e-05, + "loss": 0.0982, + "step": 645 + }, + { + "epoch": 0.16312190225233703, + "grad_norm": 0.1834162324666977, + "learning_rate": 1.9861420940527357e-05, + "loss": 0.0876, + "step": 650 + }, + { + "epoch": 0.16437668611581654, + "grad_norm": 0.1956656128168106, + "learning_rate": 1.985802842296031e-05, + "loss": 0.0833, + "step": 655 + }, + { + "epoch": 0.16563146997929606, + "grad_norm": 0.08431841433048248, + "learning_rate": 1.9854595177171968e-05, + "loss": 0.0933, + "step": 660 + }, + { + "epoch": 0.16688625384277558, + "grad_norm": 0.40520185232162476, + "learning_rate": 1.9851121217346717e-05, + "loss": 0.0887, + "step": 665 + }, + { + "epoch": 0.1681410377062551, + "grad_norm": 0.661037266254425, + "learning_rate": 1.9847606557837138e-05, + "loss": 0.0971, + "step": 670 + }, + { + "epoch": 0.16939582156973462, + "grad_norm": 0.35876908898353577, + "learning_rate": 1.9844051213163967e-05, + "loss": 0.0891, + "step": 675 + }, + { + "epoch": 0.17065060543321414, + "grad_norm": 0.73444002866745, + "learning_rate": 1.9840455198016033e-05, + "loss": 0.1015, + "step": 680 + }, + { + "epoch": 0.17190538929669363, + "grad_norm": 0.5403853058815002, + "learning_rate": 1.9836818527250185e-05, + "loss": 0.094, + "step": 685 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 0.26550817489624023, + "learning_rate": 1.9833141215891253e-05, + "loss": 0.0916, + "step": 690 + }, + { + "epoch": 0.17441495702365267, + "grad_norm": 0.20519235730171204, + "learning_rate": 1.9829423279131962e-05, + "loss": 0.0938, + "step": 695 + }, + { + "epoch": 0.1756697408871322, + "grad_norm": 0.2511141896247864, + "learning_rate": 1.9825664732332886e-05, + "loss": 0.095, + "step": 700 + }, + { + "epoch": 0.1769245247506117, + "grad_norm": 0.42727699875831604, + "learning_rate": 1.982186559102237e-05, + "loss": 0.1039, + "step": 705 + }, + { + "epoch": 0.17817930861409123, + "grad_norm": 0.361933171749115, + "learning_rate": 1.9818025870896485e-05, + "loss": 0.0933, + "step": 710 + }, + { + "epoch": 0.17943409247757075, + "grad_norm": 0.081818588078022, + "learning_rate": 1.981414558781895e-05, + "loss": 0.0972, + "step": 715 + }, + { + "epoch": 0.18068887634105024, + "grad_norm": 0.24198131263256073, + "learning_rate": 1.9810224757821063e-05, + "loss": 0.08, + "step": 720 + }, + { + "epoch": 0.18194366020452976, + "grad_norm": 0.22726424038410187, + "learning_rate": 1.9806263397101645e-05, + "loss": 0.0889, + "step": 725 + }, + { + "epoch": 0.18319844406800928, + "grad_norm": 0.26447054743766785, + "learning_rate": 1.980226152202697e-05, + "loss": 0.0988, + "step": 730 + }, + { + "epoch": 0.1844532279314888, + "grad_norm": 0.20784974098205566, + "learning_rate": 1.9798219149130692e-05, + "loss": 0.1134, + "step": 735 + }, + { + "epoch": 0.18570801179496832, + "grad_norm": 0.6903693079948425, + "learning_rate": 1.9794136295113783e-05, + "loss": 0.1002, + "step": 740 + }, + { + "epoch": 0.18696279565844784, + "grad_norm": 0.9974376559257507, + "learning_rate": 1.9790012976844465e-05, + "loss": 0.0922, + "step": 745 + }, + { + "epoch": 0.18821757952192736, + "grad_norm": 0.3469702899456024, + "learning_rate": 1.9785849211358133e-05, + "loss": 0.0843, + "step": 750 + }, + { + "epoch": 0.18947236338540685, + "grad_norm": 0.25853797793388367, + "learning_rate": 1.9781645015857287e-05, + "loss": 0.0832, + "step": 755 + }, + { + "epoch": 0.19072714724888637, + "grad_norm": 0.4150042235851288, + "learning_rate": 1.9777400407711467e-05, + "loss": 0.0951, + "step": 760 + }, + { + "epoch": 0.1919819311123659, + "grad_norm": 0.8367013335227966, + "learning_rate": 1.9773115404457175e-05, + "loss": 0.1043, + "step": 765 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.09506336599588394, + "learning_rate": 1.976879002379781e-05, + "loss": 0.1006, + "step": 770 + }, + { + "epoch": 0.19449149883932493, + "grad_norm": 0.47608646750450134, + "learning_rate": 1.9764424283603577e-05, + "loss": 0.0969, + "step": 775 + }, + { + "epoch": 0.19574628270280445, + "grad_norm": 0.3089286684989929, + "learning_rate": 1.976001820191143e-05, + "loss": 0.081, + "step": 780 + }, + { + "epoch": 0.19700106656628397, + "grad_norm": 0.7951834201812744, + "learning_rate": 1.9755571796925014e-05, + "loss": 0.0987, + "step": 785 + }, + { + "epoch": 0.19825585042976349, + "grad_norm": 0.29604893922805786, + "learning_rate": 1.9751085087014533e-05, + "loss": 0.0905, + "step": 790 + }, + { + "epoch": 0.19951063429324298, + "grad_norm": 0.6843680143356323, + "learning_rate": 1.974655809071673e-05, + "loss": 0.1044, + "step": 795 + }, + { + "epoch": 0.2007654181567225, + "grad_norm": 0.3378108739852905, + "learning_rate": 1.9741990826734793e-05, + "loss": 0.0791, + "step": 800 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.2935211658477783, + "learning_rate": 1.9737383313938266e-05, + "loss": 0.0996, + "step": 805 + }, + { + "epoch": 0.20327498588368154, + "grad_norm": 0.2529759705066681, + "learning_rate": 1.9732735571362985e-05, + "loss": 0.0992, + "step": 810 + }, + { + "epoch": 0.20452976974716106, + "grad_norm": 0.2465377002954483, + "learning_rate": 1.9728047618210995e-05, + "loss": 0.0976, + "step": 815 + }, + { + "epoch": 0.20578455361064057, + "grad_norm": 0.4152264893054962, + "learning_rate": 1.9723319473850465e-05, + "loss": 0.1063, + "step": 820 + }, + { + "epoch": 0.2070393374741201, + "grad_norm": 0.3043062686920166, + "learning_rate": 1.971855115781562e-05, + "loss": 0.1005, + "step": 825 + }, + { + "epoch": 0.20829412133759959, + "grad_norm": 0.15693755447864532, + "learning_rate": 1.9713742689806646e-05, + "loss": 0.0905, + "step": 830 + }, + { + "epoch": 0.2095489052010791, + "grad_norm": 0.21518734097480774, + "learning_rate": 1.9708894089689622e-05, + "loss": 0.0952, + "step": 835 + }, + { + "epoch": 0.21080368906455862, + "grad_norm": 0.5088281631469727, + "learning_rate": 1.9704005377496428e-05, + "loss": 0.0844, + "step": 840 + }, + { + "epoch": 0.21205847292803814, + "grad_norm": 0.19282662868499756, + "learning_rate": 1.969907657342467e-05, + "loss": 0.0892, + "step": 845 + }, + { + "epoch": 0.21331325679151766, + "grad_norm": 0.3823853135108948, + "learning_rate": 1.969410769783759e-05, + "loss": 0.083, + "step": 850 + }, + { + "epoch": 0.21456804065499718, + "grad_norm": 0.31330108642578125, + "learning_rate": 1.9689098771263982e-05, + "loss": 0.0958, + "step": 855 + }, + { + "epoch": 0.2158228245184767, + "grad_norm": 0.2921331822872162, + "learning_rate": 1.968404981439812e-05, + "loss": 0.0915, + "step": 860 + }, + { + "epoch": 0.21707760838195622, + "grad_norm": 0.23992085456848145, + "learning_rate": 1.9678960848099646e-05, + "loss": 0.0945, + "step": 865 + }, + { + "epoch": 0.2183323922454357, + "grad_norm": 0.22444355487823486, + "learning_rate": 1.967383189339352e-05, + "loss": 0.0816, + "step": 870 + }, + { + "epoch": 0.21958717610891523, + "grad_norm": 0.27808722853660583, + "learning_rate": 1.9668662971469886e-05, + "loss": 0.0872, + "step": 875 + }, + { + "epoch": 0.22084195997239475, + "grad_norm": 0.4084593653678894, + "learning_rate": 1.9663454103684043e-05, + "loss": 0.1007, + "step": 880 + }, + { + "epoch": 0.22209674383587427, + "grad_norm": 0.2565450966358185, + "learning_rate": 1.9658205311556304e-05, + "loss": 0.0973, + "step": 885 + }, + { + "epoch": 0.2233515276993538, + "grad_norm": 0.19569608569145203, + "learning_rate": 1.9652916616771933e-05, + "loss": 0.0884, + "step": 890 + }, + { + "epoch": 0.2246063115628333, + "grad_norm": 0.17645685374736786, + "learning_rate": 1.9647588041181057e-05, + "loss": 0.0918, + "step": 895 + }, + { + "epoch": 0.22586109542631283, + "grad_norm": 0.5224778056144714, + "learning_rate": 1.9642219606798566e-05, + "loss": 0.0899, + "step": 900 + }, + { + "epoch": 0.22711587928979232, + "grad_norm": 0.29252490401268005, + "learning_rate": 1.963681133580402e-05, + "loss": 0.0978, + "step": 905 + }, + { + "epoch": 0.22837066315327184, + "grad_norm": 0.2631376385688782, + "learning_rate": 1.9631363250541577e-05, + "loss": 0.0961, + "step": 910 + }, + { + "epoch": 0.22962544701675136, + "grad_norm": 0.4589993953704834, + "learning_rate": 1.9625875373519866e-05, + "loss": 0.0942, + "step": 915 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 0.4137325882911682, + "learning_rate": 1.9620347727411933e-05, + "loss": 0.0917, + "step": 920 + }, + { + "epoch": 0.2321350147437104, + "grad_norm": 0.21914274990558624, + "learning_rate": 1.9614780335055127e-05, + "loss": 0.0851, + "step": 925 + }, + { + "epoch": 0.23338979860718992, + "grad_norm": 0.3876037001609802, + "learning_rate": 1.9609173219450998e-05, + "loss": 0.0878, + "step": 930 + }, + { + "epoch": 0.23464458247066944, + "grad_norm": 0.376610666513443, + "learning_rate": 1.9603526403765218e-05, + "loss": 0.0949, + "step": 935 + }, + { + "epoch": 0.23589936633414893, + "grad_norm": 0.2858617603778839, + "learning_rate": 1.9597839911327475e-05, + "loss": 0.091, + "step": 940 + }, + { + "epoch": 0.23715415019762845, + "grad_norm": 0.6040699481964111, + "learning_rate": 1.959211376563139e-05, + "loss": 0.0866, + "step": 945 + }, + { + "epoch": 0.23840893406110797, + "grad_norm": 0.15516065061092377, + "learning_rate": 1.9586347990334406e-05, + "loss": 0.093, + "step": 950 + }, + { + "epoch": 0.2396637179245875, + "grad_norm": 0.38089755177497864, + "learning_rate": 1.958054260925768e-05, + "loss": 0.091, + "step": 955 + }, + { + "epoch": 0.240918501788067, + "grad_norm": 0.15390656888484955, + "learning_rate": 1.9574697646386027e-05, + "loss": 0.0883, + "step": 960 + }, + { + "epoch": 0.24217328565154653, + "grad_norm": 0.5063360929489136, + "learning_rate": 1.956881312586777e-05, + "loss": 0.094, + "step": 965 + }, + { + "epoch": 0.24342806951502605, + "grad_norm": 0.15385521948337555, + "learning_rate": 1.9562889072014682e-05, + "loss": 0.0942, + "step": 970 + }, + { + "epoch": 0.24468285337850557, + "grad_norm": 0.4311351180076599, + "learning_rate": 1.9556925509301844e-05, + "loss": 0.0896, + "step": 975 + }, + { + "epoch": 0.24593763724198506, + "grad_norm": 0.1626339703798294, + "learning_rate": 1.955092246236759e-05, + "loss": 0.0901, + "step": 980 + }, + { + "epoch": 0.24719242110546458, + "grad_norm": 0.1978272795677185, + "learning_rate": 1.954487995601337e-05, + "loss": 0.086, + "step": 985 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.5115323066711426, + "learning_rate": 1.953879801520366e-05, + "loss": 0.0966, + "step": 990 + }, + { + "epoch": 0.24970198883242362, + "grad_norm": 0.556075930595398, + "learning_rate": 1.9532676665065863e-05, + "loss": 0.0947, + "step": 995 + }, + { + "epoch": 0.2509567726959031, + "grad_norm": 0.4283283054828644, + "learning_rate": 1.9526515930890203e-05, + "loss": 0.0995, + "step": 1000 + }, + { + "epoch": 0.25221155655938265, + "grad_norm": 0.20551514625549316, + "learning_rate": 1.9520315838129602e-05, + "loss": 0.0822, + "step": 1005 + }, + { + "epoch": 0.25346634042286215, + "grad_norm": 0.2339772880077362, + "learning_rate": 1.9514076412399615e-05, + "loss": 0.0916, + "step": 1010 + }, + { + "epoch": 0.2547211242863417, + "grad_norm": 0.33765801787376404, + "learning_rate": 1.9507797679478282e-05, + "loss": 0.0834, + "step": 1015 + }, + { + "epoch": 0.2559759081498212, + "grad_norm": 0.15917867422103882, + "learning_rate": 1.9501479665306046e-05, + "loss": 0.1036, + "step": 1020 + }, + { + "epoch": 0.25723069201330073, + "grad_norm": 0.15824545919895172, + "learning_rate": 1.9495122395985642e-05, + "loss": 0.0926, + "step": 1025 + }, + { + "epoch": 0.2584854758767802, + "grad_norm": 0.22190357744693756, + "learning_rate": 1.948872589778198e-05, + "loss": 0.0915, + "step": 1030 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 0.39479732513427734, + "learning_rate": 1.9482290197122054e-05, + "loss": 0.0954, + "step": 1035 + }, + { + "epoch": 0.26099504360373926, + "grad_norm": 0.44342002272605896, + "learning_rate": 1.947581532059481e-05, + "loss": 0.0968, + "step": 1040 + }, + { + "epoch": 0.26224982746721875, + "grad_norm": 0.18394285440444946, + "learning_rate": 1.946930129495106e-05, + "loss": 0.0968, + "step": 1045 + }, + { + "epoch": 0.2635046113306983, + "grad_norm": 0.40263161063194275, + "learning_rate": 1.9462748147103342e-05, + "loss": 0.0954, + "step": 1050 + }, + { + "epoch": 0.2647593951941778, + "grad_norm": 0.3291696608066559, + "learning_rate": 1.9456155904125853e-05, + "loss": 0.0974, + "step": 1055 + }, + { + "epoch": 0.26601417905765734, + "grad_norm": 0.17444024980068207, + "learning_rate": 1.9449524593254283e-05, + "loss": 0.0892, + "step": 1060 + }, + { + "epoch": 0.26726896292113683, + "grad_norm": 0.5202435851097107, + "learning_rate": 1.944285424188575e-05, + "loss": 0.1026, + "step": 1065 + }, + { + "epoch": 0.2685237467846163, + "grad_norm": 0.6516605615615845, + "learning_rate": 1.943614487757866e-05, + "loss": 0.0899, + "step": 1070 + }, + { + "epoch": 0.26977853064809587, + "grad_norm": 0.7146198153495789, + "learning_rate": 1.9429396528052594e-05, + "loss": 0.0834, + "step": 1075 + }, + { + "epoch": 0.27103331451157536, + "grad_norm": 0.46206018328666687, + "learning_rate": 1.9422609221188208e-05, + "loss": 0.0961, + "step": 1080 + }, + { + "epoch": 0.2722880983750549, + "grad_norm": 0.258039265871048, + "learning_rate": 1.9415782985027105e-05, + "loss": 0.0861, + "step": 1085 + }, + { + "epoch": 0.2735428822385344, + "grad_norm": 0.2528323829174042, + "learning_rate": 1.9408917847771732e-05, + "loss": 0.0903, + "step": 1090 + }, + { + "epoch": 0.27479766610201395, + "grad_norm": 0.35763663053512573, + "learning_rate": 1.9402013837785242e-05, + "loss": 0.0961, + "step": 1095 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 0.33493563532829285, + "learning_rate": 1.93950709835914e-05, + "loss": 0.086, + "step": 1100 + }, + { + "epoch": 0.27730723382897293, + "grad_norm": 0.6536179780960083, + "learning_rate": 1.9388089313874447e-05, + "loss": 0.083, + "step": 1105 + }, + { + "epoch": 0.2785620176924525, + "grad_norm": 0.6487680077552795, + "learning_rate": 1.9381068857478994e-05, + "loss": 0.0968, + "step": 1110 + }, + { + "epoch": 0.27981680155593197, + "grad_norm": 0.25090888142585754, + "learning_rate": 1.9374009643409895e-05, + "loss": 0.0785, + "step": 1115 + }, + { + "epoch": 0.2810715854194115, + "grad_norm": 0.1529439091682434, + "learning_rate": 1.9366911700832146e-05, + "loss": 0.0946, + "step": 1120 + }, + { + "epoch": 0.282326369282891, + "grad_norm": 0.2744344472885132, + "learning_rate": 1.935977505907072e-05, + "loss": 0.0827, + "step": 1125 + }, + { + "epoch": 0.28358115314637056, + "grad_norm": 0.4758091866970062, + "learning_rate": 1.93525997476105e-05, + "loss": 0.0895, + "step": 1130 + }, + { + "epoch": 0.28483593700985005, + "grad_norm": 0.5062987804412842, + "learning_rate": 1.9345385796096118e-05, + "loss": 0.1007, + "step": 1135 + }, + { + "epoch": 0.2860907208733296, + "grad_norm": 0.44954121112823486, + "learning_rate": 1.933813323433186e-05, + "loss": 0.0915, + "step": 1140 + }, + { + "epoch": 0.2873455047368091, + "grad_norm": 0.2048788219690323, + "learning_rate": 1.9330842092281508e-05, + "loss": 0.0866, + "step": 1145 + }, + { + "epoch": 0.2886002886002886, + "grad_norm": 0.3660197854042053, + "learning_rate": 1.9323512400068262e-05, + "loss": 0.0916, + "step": 1150 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.5215709209442139, + "learning_rate": 1.931614418797457e-05, + "loss": 0.096, + "step": 1155 + }, + { + "epoch": 0.2911098563272476, + "grad_norm": 0.46644458174705505, + "learning_rate": 1.9308737486442045e-05, + "loss": 0.1107, + "step": 1160 + }, + { + "epoch": 0.29236464019072717, + "grad_norm": 0.24164815247058868, + "learning_rate": 1.9301292326071295e-05, + "loss": 0.0923, + "step": 1165 + }, + { + "epoch": 0.29361942405420666, + "grad_norm": 0.42168012261390686, + "learning_rate": 1.9293808737621837e-05, + "loss": 0.0998, + "step": 1170 + }, + { + "epoch": 0.2948742079176862, + "grad_norm": 0.37700730562210083, + "learning_rate": 1.9286286752011948e-05, + "loss": 0.1036, + "step": 1175 + }, + { + "epoch": 0.2961289917811657, + "grad_norm": 0.6957244277000427, + "learning_rate": 1.927872640031854e-05, + "loss": 0.0903, + "step": 1180 + }, + { + "epoch": 0.2973837756446452, + "grad_norm": 0.30158841609954834, + "learning_rate": 1.9271127713777033e-05, + "loss": 0.0861, + "step": 1185 + }, + { + "epoch": 0.29863855950812473, + "grad_norm": 0.41035765409469604, + "learning_rate": 1.9263490723781233e-05, + "loss": 0.0909, + "step": 1190 + }, + { + "epoch": 0.2998933433716042, + "grad_norm": 0.1835126429796219, + "learning_rate": 1.9255815461883184e-05, + "loss": 0.0968, + "step": 1195 + }, + { + "epoch": 0.3011481272350838, + "grad_norm": 0.18504665791988373, + "learning_rate": 1.9248101959793066e-05, + "loss": 0.0995, + "step": 1200 + }, + { + "epoch": 0.30240291109856327, + "grad_norm": 0.32556024193763733, + "learning_rate": 1.9240350249379035e-05, + "loss": 0.0972, + "step": 1205 + }, + { + "epoch": 0.3036576949620428, + "grad_norm": 0.2088983952999115, + "learning_rate": 1.92325603626671e-05, + "loss": 0.0935, + "step": 1210 + }, + { + "epoch": 0.3049124788255223, + "grad_norm": 0.5765480995178223, + "learning_rate": 1.922473233184101e-05, + "loss": 0.0933, + "step": 1215 + }, + { + "epoch": 0.3061672626890018, + "grad_norm": 0.7290382981300354, + "learning_rate": 1.9216866189242095e-05, + "loss": 0.0954, + "step": 1220 + }, + { + "epoch": 0.30742204655248134, + "grad_norm": 0.2641292214393616, + "learning_rate": 1.9208961967369148e-05, + "loss": 0.0796, + "step": 1225 + }, + { + "epoch": 0.30867683041596083, + "grad_norm": 0.6574556827545166, + "learning_rate": 1.9201019698878272e-05, + "loss": 0.1038, + "step": 1230 + }, + { + "epoch": 0.3099316142794404, + "grad_norm": 0.19239722192287445, + "learning_rate": 1.9193039416582785e-05, + "loss": 0.094, + "step": 1235 + }, + { + "epoch": 0.3111863981429199, + "grad_norm": 0.14215205609798431, + "learning_rate": 1.918502115345303e-05, + "loss": 0.0965, + "step": 1240 + }, + { + "epoch": 0.3124411820063994, + "grad_norm": 0.3955984115600586, + "learning_rate": 1.9176964942616286e-05, + "loss": 0.0961, + "step": 1245 + }, + { + "epoch": 0.3136959658698789, + "grad_norm": 0.5442742109298706, + "learning_rate": 1.9168870817356602e-05, + "loss": 0.0844, + "step": 1250 + }, + { + "epoch": 0.3149507497333584, + "grad_norm": 0.2363368719816208, + "learning_rate": 1.916073881111468e-05, + "loss": 0.0882, + "step": 1255 + }, + { + "epoch": 0.31620553359683795, + "grad_norm": 0.4277198016643524, + "learning_rate": 1.915256895748771e-05, + "loss": 0.0813, + "step": 1260 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.15375563502311707, + "learning_rate": 1.9144361290229266e-05, + "loss": 0.1007, + "step": 1265 + }, + { + "epoch": 0.318715101323797, + "grad_norm": 0.2776433527469635, + "learning_rate": 1.913611584324913e-05, + "loss": 0.0898, + "step": 1270 + }, + { + "epoch": 0.3199698851872765, + "grad_norm": 0.20093151926994324, + "learning_rate": 1.912783265061319e-05, + "loss": 0.0897, + "step": 1275 + }, + { + "epoch": 0.32122466905075603, + "grad_norm": 0.13896812498569489, + "learning_rate": 1.9119511746543265e-05, + "loss": 0.0945, + "step": 1280 + }, + { + "epoch": 0.3224794529142355, + "grad_norm": 0.1117207333445549, + "learning_rate": 1.911115316541698e-05, + "loss": 0.0887, + "step": 1285 + }, + { + "epoch": 0.323734236777715, + "grad_norm": 0.19571149349212646, + "learning_rate": 1.9102756941767625e-05, + "loss": 0.086, + "step": 1290 + }, + { + "epoch": 0.32498902064119456, + "grad_norm": 0.12047363817691803, + "learning_rate": 1.9094323110284006e-05, + "loss": 0.1008, + "step": 1295 + }, + { + "epoch": 0.32624380450467405, + "grad_norm": 0.2128770500421524, + "learning_rate": 1.9085851705810307e-05, + "loss": 0.1054, + "step": 1300 + }, + { + "epoch": 0.3274985883681536, + "grad_norm": 0.18241125345230103, + "learning_rate": 1.907734276334595e-05, + "loss": 0.0981, + "step": 1305 + }, + { + "epoch": 0.3287533722316331, + "grad_norm": 0.32163918018341064, + "learning_rate": 1.9068796318045434e-05, + "loss": 0.0871, + "step": 1310 + }, + { + "epoch": 0.33000815609511264, + "grad_norm": 0.6417638063430786, + "learning_rate": 1.90602124052182e-05, + "loss": 0.0959, + "step": 1315 + }, + { + "epoch": 0.33126293995859213, + "grad_norm": 0.2659170627593994, + "learning_rate": 1.9051591060328496e-05, + "loss": 0.0938, + "step": 1320 + }, + { + "epoch": 0.3325177238220717, + "grad_norm": 0.34006327390670776, + "learning_rate": 1.904293231899521e-05, + "loss": 0.0835, + "step": 1325 + }, + { + "epoch": 0.33377250768555117, + "grad_norm": 0.44254758954048157, + "learning_rate": 1.9034236216991738e-05, + "loss": 0.0894, + "step": 1330 + }, + { + "epoch": 0.33502729154903066, + "grad_norm": 0.2562454342842102, + "learning_rate": 1.9025502790245824e-05, + "loss": 0.0953, + "step": 1335 + }, + { + "epoch": 0.3362820754125102, + "grad_norm": 0.33543840050697327, + "learning_rate": 1.901673207483943e-05, + "loss": 0.0911, + "step": 1340 + }, + { + "epoch": 0.3375368592759897, + "grad_norm": 0.3265765905380249, + "learning_rate": 1.9007924107008563e-05, + "loss": 0.0885, + "step": 1345 + }, + { + "epoch": 0.33879164313946925, + "grad_norm": 0.08985516428947449, + "learning_rate": 1.8999078923143142e-05, + "loss": 0.085, + "step": 1350 + }, + { + "epoch": 0.34004642700294874, + "grad_norm": 0.5862855315208435, + "learning_rate": 1.899019655978685e-05, + "loss": 0.0956, + "step": 1355 + }, + { + "epoch": 0.3413012108664283, + "grad_norm": 0.32543689012527466, + "learning_rate": 1.8981277053636963e-05, + "loss": 0.096, + "step": 1360 + }, + { + "epoch": 0.3425559947299078, + "grad_norm": 0.6047563552856445, + "learning_rate": 1.8972320441544224e-05, + "loss": 0.0843, + "step": 1365 + }, + { + "epoch": 0.34381077859338727, + "grad_norm": 0.28414762020111084, + "learning_rate": 1.8963326760512668e-05, + "loss": 0.0892, + "step": 1370 + }, + { + "epoch": 0.3450655624568668, + "grad_norm": 0.337194561958313, + "learning_rate": 1.895429604769949e-05, + "loss": 0.099, + "step": 1375 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 0.4529132843017578, + "learning_rate": 1.894522834041487e-05, + "loss": 0.0941, + "step": 1380 + }, + { + "epoch": 0.34757513018382585, + "grad_norm": 0.12156742066144943, + "learning_rate": 1.8936123676121844e-05, + "loss": 0.0896, + "step": 1385 + }, + { + "epoch": 0.34882991404730535, + "grad_norm": 0.25660645961761475, + "learning_rate": 1.8926982092436117e-05, + "loss": 0.1036, + "step": 1390 + }, + { + "epoch": 0.3500846979107849, + "grad_norm": 0.31460127234458923, + "learning_rate": 1.891780362712594e-05, + "loss": 0.1055, + "step": 1395 + }, + { + "epoch": 0.3513394817742644, + "grad_norm": 0.31471750140190125, + "learning_rate": 1.8908588318111932e-05, + "loss": 0.0958, + "step": 1400 + }, + { + "epoch": 0.3525942656377439, + "grad_norm": 0.13055211305618286, + "learning_rate": 1.889933620346694e-05, + "loss": 0.1019, + "step": 1405 + }, + { + "epoch": 0.3538490495012234, + "grad_norm": 0.13505928218364716, + "learning_rate": 1.8890047321415856e-05, + "loss": 0.0816, + "step": 1410 + }, + { + "epoch": 0.3551038333647029, + "grad_norm": 0.2555672526359558, + "learning_rate": 1.8880721710335495e-05, + "loss": 0.0998, + "step": 1415 + }, + { + "epoch": 0.35635861722818246, + "grad_norm": 0.3331303298473358, + "learning_rate": 1.8871359408754405e-05, + "loss": 0.094, + "step": 1420 + }, + { + "epoch": 0.35761340109166195, + "grad_norm": 0.17009656131267548, + "learning_rate": 1.8861960455352723e-05, + "loss": 0.0946, + "step": 1425 + }, + { + "epoch": 0.3588681849551415, + "grad_norm": 0.2334425002336502, + "learning_rate": 1.885252488896201e-05, + "loss": 0.0906, + "step": 1430 + }, + { + "epoch": 0.360122968818621, + "grad_norm": 0.21691180765628815, + "learning_rate": 1.8843052748565097e-05, + "loss": 0.091, + "step": 1435 + }, + { + "epoch": 0.3613777526821005, + "grad_norm": 0.40637996792793274, + "learning_rate": 1.8833544073295918e-05, + "loss": 0.0856, + "step": 1440 + }, + { + "epoch": 0.36263253654558003, + "grad_norm": 0.4045301079750061, + "learning_rate": 1.882399890243935e-05, + "loss": 0.0821, + "step": 1445 + }, + { + "epoch": 0.3638873204090595, + "grad_norm": 0.25760528445243835, + "learning_rate": 1.8814417275431046e-05, + "loss": 0.0895, + "step": 1450 + }, + { + "epoch": 0.36514210427253907, + "grad_norm": 0.3337756097316742, + "learning_rate": 1.8804799231857292e-05, + "loss": 0.0939, + "step": 1455 + }, + { + "epoch": 0.36639688813601856, + "grad_norm": 0.325998991727829, + "learning_rate": 1.8795144811454805e-05, + "loss": 0.0978, + "step": 1460 + }, + { + "epoch": 0.3676516719994981, + "grad_norm": 0.12589257955551147, + "learning_rate": 1.878545405411061e-05, + "loss": 0.0826, + "step": 1465 + }, + { + "epoch": 0.3689064558629776, + "grad_norm": 0.10235779732465744, + "learning_rate": 1.877572699986185e-05, + "loss": 0.0841, + "step": 1470 + }, + { + "epoch": 0.3701612397264571, + "grad_norm": 0.20532777905464172, + "learning_rate": 1.876596368889563e-05, + "loss": 0.0897, + "step": 1475 + }, + { + "epoch": 0.37141602358993664, + "grad_norm": 0.30277219414711, + "learning_rate": 1.8756164161548848e-05, + "loss": 0.0975, + "step": 1480 + }, + { + "epoch": 0.37267080745341613, + "grad_norm": 0.15634752810001373, + "learning_rate": 1.8746328458308034e-05, + "loss": 0.0879, + "step": 1485 + }, + { + "epoch": 0.3739255913168957, + "grad_norm": 0.2814009487628937, + "learning_rate": 1.873645661980917e-05, + "loss": 0.0885, + "step": 1490 + }, + { + "epoch": 0.37518037518037517, + "grad_norm": 0.7077462673187256, + "learning_rate": 1.872654868683753e-05, + "loss": 0.1143, + "step": 1495 + }, + { + "epoch": 0.3764351590438547, + "grad_norm": 0.14586646854877472, + "learning_rate": 1.8716604700327516e-05, + "loss": 0.1006, + "step": 1500 + }, + { + "epoch": 0.3776899429073342, + "grad_norm": 0.23802891373634338, + "learning_rate": 1.8706624701362485e-05, + "loss": 0.0926, + "step": 1505 + }, + { + "epoch": 0.3789447267708137, + "grad_norm": 0.41399145126342773, + "learning_rate": 1.8696608731174576e-05, + "loss": 0.0897, + "step": 1510 + }, + { + "epoch": 0.38019951063429325, + "grad_norm": 0.18691197037696838, + "learning_rate": 1.8686556831144545e-05, + "loss": 0.0843, + "step": 1515 + }, + { + "epoch": 0.38145429449777274, + "grad_norm": 0.3470015823841095, + "learning_rate": 1.867646904280159e-05, + "loss": 0.1038, + "step": 1520 + }, + { + "epoch": 0.3827090783612523, + "grad_norm": 0.3475337624549866, + "learning_rate": 1.8666345407823177e-05, + "loss": 0.0992, + "step": 1525 + }, + { + "epoch": 0.3839638622247318, + "grad_norm": 0.21168570220470428, + "learning_rate": 1.865618596803487e-05, + "loss": 0.0877, + "step": 1530 + }, + { + "epoch": 0.3852186460882113, + "grad_norm": 0.2576999366283417, + "learning_rate": 1.864599076541018e-05, + "loss": 0.099, + "step": 1535 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.42329198122024536, + "learning_rate": 1.8635759842070344e-05, + "loss": 0.0886, + "step": 1540 + }, + { + "epoch": 0.38772821381517036, + "grad_norm": 0.2144610732793808, + "learning_rate": 1.862549324028419e-05, + "loss": 0.0976, + "step": 1545 + }, + { + "epoch": 0.38898299767864986, + "grad_norm": 0.36767107248306274, + "learning_rate": 1.8615191002467955e-05, + "loss": 0.1067, + "step": 1550 + }, + { + "epoch": 0.39023778154212935, + "grad_norm": 0.32758021354675293, + "learning_rate": 1.8604853171185098e-05, + "loss": 0.0925, + "step": 1555 + }, + { + "epoch": 0.3914925654056089, + "grad_norm": 0.11192046850919724, + "learning_rate": 1.859447978914614e-05, + "loss": 0.0815, + "step": 1560 + }, + { + "epoch": 0.3927473492690884, + "grad_norm": 0.17995843291282654, + "learning_rate": 1.8584070899208468e-05, + "loss": 0.0823, + "step": 1565 + }, + { + "epoch": 0.39400213313256793, + "grad_norm": 0.3672018349170685, + "learning_rate": 1.857362654437618e-05, + "loss": 0.0955, + "step": 1570 + }, + { + "epoch": 0.3952569169960474, + "grad_norm": 0.11518882215023041, + "learning_rate": 1.8563146767799884e-05, + "loss": 0.0955, + "step": 1575 + }, + { + "epoch": 0.39651170085952697, + "grad_norm": 0.30487769842147827, + "learning_rate": 1.8552631612776554e-05, + "loss": 0.0842, + "step": 1580 + }, + { + "epoch": 0.39776648472300646, + "grad_norm": 0.6982203722000122, + "learning_rate": 1.85420811227493e-05, + "loss": 0.0801, + "step": 1585 + }, + { + "epoch": 0.39902126858648596, + "grad_norm": 0.22043141722679138, + "learning_rate": 1.853149534130724e-05, + "loss": 0.1039, + "step": 1590 + }, + { + "epoch": 0.4002760524499655, + "grad_norm": 0.3814282715320587, + "learning_rate": 1.8520874312185292e-05, + "loss": 0.0861, + "step": 1595 + }, + { + "epoch": 0.401530836313445, + "grad_norm": 0.14649051427841187, + "learning_rate": 1.8510218079263995e-05, + "loss": 0.09, + "step": 1600 + }, + { + "epoch": 0.40278562017692454, + "grad_norm": 0.16551180183887482, + "learning_rate": 1.849952668656933e-05, + "loss": 0.079, + "step": 1605 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.17042899131774902, + "learning_rate": 1.8488800178272553e-05, + "loss": 0.1073, + "step": 1610 + }, + { + "epoch": 0.4052951879038836, + "grad_norm": 0.4069715440273285, + "learning_rate": 1.847803859868998e-05, + "loss": 0.0948, + "step": 1615 + }, + { + "epoch": 0.40654997176736307, + "grad_norm": 0.18239188194274902, + "learning_rate": 1.8467241992282842e-05, + "loss": 0.0934, + "step": 1620 + }, + { + "epoch": 0.40780475563084256, + "grad_norm": 0.4659058749675751, + "learning_rate": 1.845641040365707e-05, + "loss": 0.0953, + "step": 1625 + }, + { + "epoch": 0.4090595394943221, + "grad_norm": 0.1839870810508728, + "learning_rate": 1.844554387756313e-05, + "loss": 0.0874, + "step": 1630 + }, + { + "epoch": 0.4103143233578016, + "grad_norm": 0.8208329081535339, + "learning_rate": 1.8434642458895823e-05, + "loss": 0.0993, + "step": 1635 + }, + { + "epoch": 0.41156910722128115, + "grad_norm": 0.3736049234867096, + "learning_rate": 1.8423706192694118e-05, + "loss": 0.0892, + "step": 1640 + }, + { + "epoch": 0.41282389108476064, + "grad_norm": 0.1566988229751587, + "learning_rate": 1.841273512414095e-05, + "loss": 0.0893, + "step": 1645 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 0.5364568829536438, + "learning_rate": 1.840172929856304e-05, + "loss": 0.1002, + "step": 1650 + }, + { + "epoch": 0.4153334588117197, + "grad_norm": 0.41213005781173706, + "learning_rate": 1.8390688761430707e-05, + "loss": 0.0942, + "step": 1655 + }, + { + "epoch": 0.41658824267519917, + "grad_norm": 0.417540580034256, + "learning_rate": 1.8379613558357686e-05, + "loss": 0.09, + "step": 1660 + }, + { + "epoch": 0.4178430265386787, + "grad_norm": 0.30479973554611206, + "learning_rate": 1.836850373510092e-05, + "loss": 0.0819, + "step": 1665 + }, + { + "epoch": 0.4190978104021582, + "grad_norm": 0.5476796627044678, + "learning_rate": 1.8357359337560393e-05, + "loss": 0.0835, + "step": 1670 + }, + { + "epoch": 0.42035259426563776, + "grad_norm": 0.14834348857402802, + "learning_rate": 1.8346180411778934e-05, + "loss": 0.081, + "step": 1675 + }, + { + "epoch": 0.42160737812911725, + "grad_norm": 0.49090176820755005, + "learning_rate": 1.833496700394202e-05, + "loss": 0.0864, + "step": 1680 + }, + { + "epoch": 0.4228621619925968, + "grad_norm": 0.17847497761249542, + "learning_rate": 1.83237191603776e-05, + "loss": 0.0705, + "step": 1685 + }, + { + "epoch": 0.4241169458560763, + "grad_norm": 0.22627241909503937, + "learning_rate": 1.831243692755587e-05, + "loss": 0.0757, + "step": 1690 + }, + { + "epoch": 0.4253717297195558, + "grad_norm": 0.6790867447853088, + "learning_rate": 1.830112035208913e-05, + "loss": 0.086, + "step": 1695 + }, + { + "epoch": 0.4266265135830353, + "grad_norm": 0.33606070280075073, + "learning_rate": 1.828976948073155e-05, + "loss": 0.0954, + "step": 1700 + }, + { + "epoch": 0.4278812974465148, + "grad_norm": 0.15440745651721954, + "learning_rate": 1.8278384360379008e-05, + "loss": 0.0877, + "step": 1705 + }, + { + "epoch": 0.42913608130999437, + "grad_norm": 0.3315647542476654, + "learning_rate": 1.8266965038068856e-05, + "loss": 0.0856, + "step": 1710 + }, + { + "epoch": 0.43039086517347386, + "grad_norm": 0.2767457365989685, + "learning_rate": 1.8255511560979782e-05, + "loss": 0.0946, + "step": 1715 + }, + { + "epoch": 0.4316456490369534, + "grad_norm": 0.24647273123264313, + "learning_rate": 1.824402397643155e-05, + "loss": 0.1019, + "step": 1720 + }, + { + "epoch": 0.4329004329004329, + "grad_norm": 0.5085861086845398, + "learning_rate": 1.823250233188487e-05, + "loss": 0.0915, + "step": 1725 + }, + { + "epoch": 0.43415521676391244, + "grad_norm": 0.2605689465999603, + "learning_rate": 1.822094667494115e-05, + "loss": 0.0787, + "step": 1730 + }, + { + "epoch": 0.43541000062739194, + "grad_norm": 0.2040780782699585, + "learning_rate": 1.8209357053342325e-05, + "loss": 0.0897, + "step": 1735 + }, + { + "epoch": 0.4366647844908714, + "grad_norm": 0.12423911690711975, + "learning_rate": 1.8197733514970655e-05, + "loss": 0.1005, + "step": 1740 + }, + { + "epoch": 0.437919568354351, + "grad_norm": 0.08479491621255875, + "learning_rate": 1.8186076107848524e-05, + "loss": 0.0871, + "step": 1745 + }, + { + "epoch": 0.43917435221783047, + "grad_norm": 0.2682114541530609, + "learning_rate": 1.8174384880138247e-05, + "loss": 0.0907, + "step": 1750 + }, + { + "epoch": 0.44042913608131, + "grad_norm": 0.32582300901412964, + "learning_rate": 1.8162659880141865e-05, + "loss": 0.0893, + "step": 1755 + }, + { + "epoch": 0.4416839199447895, + "grad_norm": 0.4764627516269684, + "learning_rate": 1.8150901156300956e-05, + "loss": 0.0892, + "step": 1760 + }, + { + "epoch": 0.44293870380826905, + "grad_norm": 0.2317512184381485, + "learning_rate": 1.8139108757196412e-05, + "loss": 0.0972, + "step": 1765 + }, + { + "epoch": 0.44419348767174854, + "grad_norm": 0.1146375834941864, + "learning_rate": 1.812728273154827e-05, + "loss": 0.1024, + "step": 1770 + }, + { + "epoch": 0.44544827153522804, + "grad_norm": 0.07953114807605743, + "learning_rate": 1.8115423128215485e-05, + "loss": 0.0892, + "step": 1775 + }, + { + "epoch": 0.4467030553987076, + "grad_norm": 0.5142281651496887, + "learning_rate": 1.810352999619574e-05, + "loss": 0.1007, + "step": 1780 + }, + { + "epoch": 0.4479578392621871, + "grad_norm": 0.35662662982940674, + "learning_rate": 1.8091603384625243e-05, + "loss": 0.0937, + "step": 1785 + }, + { + "epoch": 0.4492126231256666, + "grad_norm": 0.8468720316886902, + "learning_rate": 1.8079643342778516e-05, + "loss": 0.0987, + "step": 1790 + }, + { + "epoch": 0.4504674069891461, + "grad_norm": 0.34367749094963074, + "learning_rate": 1.80676499200682e-05, + "loss": 0.087, + "step": 1795 + }, + { + "epoch": 0.45172219085262566, + "grad_norm": 0.21747180819511414, + "learning_rate": 1.8055623166044855e-05, + "loss": 0.0883, + "step": 1800 + }, + { + "epoch": 0.45297697471610515, + "grad_norm": 0.4869808554649353, + "learning_rate": 1.8043563130396738e-05, + "loss": 0.0989, + "step": 1805 + }, + { + "epoch": 0.45423175857958464, + "grad_norm": 0.2503896653652191, + "learning_rate": 1.8031469862949618e-05, + "loss": 0.083, + "step": 1810 + }, + { + "epoch": 0.4554865424430642, + "grad_norm": 0.34971126914024353, + "learning_rate": 1.801934341366655e-05, + "loss": 0.0845, + "step": 1815 + }, + { + "epoch": 0.4567413263065437, + "grad_norm": 0.6193472743034363, + "learning_rate": 1.800718383264769e-05, + "loss": 0.0966, + "step": 1820 + }, + { + "epoch": 0.45799611017002323, + "grad_norm": 0.5469531416893005, + "learning_rate": 1.799499117013007e-05, + "loss": 0.0844, + "step": 1825 + }, + { + "epoch": 0.4592508940335027, + "grad_norm": 0.2877050042152405, + "learning_rate": 1.7982765476487398e-05, + "loss": 0.0929, + "step": 1830 + }, + { + "epoch": 0.46050567789698227, + "grad_norm": 0.2750946283340454, + "learning_rate": 1.797050680222985e-05, + "loss": 0.0883, + "step": 1835 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 0.4805077016353607, + "learning_rate": 1.7958215198003866e-05, + "loss": 0.0916, + "step": 1840 + }, + { + "epoch": 0.46301524562394125, + "grad_norm": 0.4399943947792053, + "learning_rate": 1.7945890714591926e-05, + "loss": 0.099, + "step": 1845 + }, + { + "epoch": 0.4642700294874208, + "grad_norm": 0.8317438364028931, + "learning_rate": 1.7933533402912354e-05, + "loss": 0.0836, + "step": 1850 + }, + { + "epoch": 0.4655248133509003, + "grad_norm": 0.26974841952323914, + "learning_rate": 1.7921143314019106e-05, + "loss": 0.1013, + "step": 1855 + }, + { + "epoch": 0.46677959721437984, + "grad_norm": 0.2595142722129822, + "learning_rate": 1.7908720499101552e-05, + "loss": 0.0881, + "step": 1860 + }, + { + "epoch": 0.46803438107785933, + "grad_norm": 0.18567359447479248, + "learning_rate": 1.789626500948427e-05, + "loss": 0.0929, + "step": 1865 + }, + { + "epoch": 0.4692891649413389, + "grad_norm": 0.11302470415830612, + "learning_rate": 1.7883776896626836e-05, + "loss": 0.0882, + "step": 1870 + }, + { + "epoch": 0.47054394880481837, + "grad_norm": 0.06441876292228699, + "learning_rate": 1.7871256212123605e-05, + "loss": 0.091, + "step": 1875 + }, + { + "epoch": 0.47179873266829786, + "grad_norm": 0.5252417922019958, + "learning_rate": 1.78587030077035e-05, + "loss": 0.0995, + "step": 1880 + }, + { + "epoch": 0.4730535165317774, + "grad_norm": 0.2045115977525711, + "learning_rate": 1.7846117335229808e-05, + "loss": 0.0889, + "step": 1885 + }, + { + "epoch": 0.4743083003952569, + "grad_norm": 0.22728148102760315, + "learning_rate": 1.783349924669994e-05, + "loss": 0.1018, + "step": 1890 + }, + { + "epoch": 0.47556308425873645, + "grad_norm": 0.17804867029190063, + "learning_rate": 1.7820848794245243e-05, + "loss": 0.1001, + "step": 1895 + }, + { + "epoch": 0.47681786812221594, + "grad_norm": 0.5709582567214966, + "learning_rate": 1.7808166030130782e-05, + "loss": 0.087, + "step": 1900 + }, + { + "epoch": 0.4780726519856955, + "grad_norm": 0.19447216391563416, + "learning_rate": 1.779545100675511e-05, + "loss": 0.0881, + "step": 1905 + }, + { + "epoch": 0.479327435849175, + "grad_norm": 0.5337616801261902, + "learning_rate": 1.778270377665005e-05, + "loss": 0.0954, + "step": 1910 + }, + { + "epoch": 0.48058221971265447, + "grad_norm": 0.8371096253395081, + "learning_rate": 1.77699243924805e-05, + "loss": 0.0886, + "step": 1915 + }, + { + "epoch": 0.481837003576134, + "grad_norm": 0.21723900735378265, + "learning_rate": 1.77571129070442e-05, + "loss": 0.0899, + "step": 1920 + }, + { + "epoch": 0.4830917874396135, + "grad_norm": 0.18002496659755707, + "learning_rate": 1.7744269373271507e-05, + "loss": 0.0876, + "step": 1925 + }, + { + "epoch": 0.48434657130309305, + "grad_norm": 0.15647904574871063, + "learning_rate": 1.7731393844225187e-05, + "loss": 0.0842, + "step": 1930 + }, + { + "epoch": 0.48560135516657255, + "grad_norm": 0.36643457412719727, + "learning_rate": 1.7718486373100207e-05, + "loss": 0.0848, + "step": 1935 + }, + { + "epoch": 0.4868561390300521, + "grad_norm": 0.37278589606285095, + "learning_rate": 1.7705547013223486e-05, + "loss": 0.1, + "step": 1940 + }, + { + "epoch": 0.4881109228935316, + "grad_norm": 0.18367764353752136, + "learning_rate": 1.7692575818053696e-05, + "loss": 0.0969, + "step": 1945 + }, + { + "epoch": 0.48936570675701113, + "grad_norm": 0.4618573486804962, + "learning_rate": 1.7679572841181033e-05, + "loss": 0.0854, + "step": 1950 + }, + { + "epoch": 0.4906204906204906, + "grad_norm": 0.2800655961036682, + "learning_rate": 1.7666538136327007e-05, + "loss": 0.0942, + "step": 1955 + }, + { + "epoch": 0.4918752744839701, + "grad_norm": 0.2937714457511902, + "learning_rate": 1.7653471757344203e-05, + "loss": 0.0977, + "step": 1960 + }, + { + "epoch": 0.49313005834744966, + "grad_norm": 0.7581244707107544, + "learning_rate": 1.7640373758216075e-05, + "loss": 0.0939, + "step": 1965 + }, + { + "epoch": 0.49438484221092915, + "grad_norm": 0.21476201713085175, + "learning_rate": 1.7627244193056705e-05, + "loss": 0.0936, + "step": 1970 + }, + { + "epoch": 0.4956396260744087, + "grad_norm": 0.1473897099494934, + "learning_rate": 1.7614083116110597e-05, + "loss": 0.0859, + "step": 1975 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.4635510742664337, + "learning_rate": 1.7600890581752435e-05, + "loss": 0.0984, + "step": 1980 + }, + { + "epoch": 0.49814919380136774, + "grad_norm": 0.15860065817832947, + "learning_rate": 1.758766664448689e-05, + "loss": 0.0934, + "step": 1985 + }, + { + "epoch": 0.49940397766484723, + "grad_norm": 0.10272369533777237, + "learning_rate": 1.7574411358948347e-05, + "loss": 0.0749, + "step": 1990 + }, + { + "epoch": 0.5006587615283268, + "grad_norm": 0.3367800712585449, + "learning_rate": 1.7561124779900723e-05, + "loss": 0.0809, + "step": 1995 + }, + { + "epoch": 0.5019135453918062, + "grad_norm": 0.23143918812274933, + "learning_rate": 1.7547806962237222e-05, + "loss": 0.1092, + "step": 2000 + }, + { + "epoch": 0.5031683292552858, + "grad_norm": 0.5789488554000854, + "learning_rate": 1.7534457960980097e-05, + "loss": 0.1088, + "step": 2005 + }, + { + "epoch": 0.5044231131187653, + "grad_norm": 0.2879166007041931, + "learning_rate": 1.7521077831280453e-05, + "loss": 0.0873, + "step": 2010 + }, + { + "epoch": 0.5056778969822449, + "grad_norm": 0.6664144396781921, + "learning_rate": 1.750766662841799e-05, + "loss": 0.0984, + "step": 2015 + }, + { + "epoch": 0.5069326808457243, + "grad_norm": 0.13004139065742493, + "learning_rate": 1.7494224407800792e-05, + "loss": 0.0879, + "step": 2020 + }, + { + "epoch": 0.5081874647092038, + "grad_norm": 0.30764704942703247, + "learning_rate": 1.7480751224965083e-05, + "loss": 0.093, + "step": 2025 + }, + { + "epoch": 0.5094422485726834, + "grad_norm": 0.15937355160713196, + "learning_rate": 1.7467247135575016e-05, + "loss": 0.088, + "step": 2030 + }, + { + "epoch": 0.5106970324361628, + "grad_norm": 0.12112176418304443, + "learning_rate": 1.7453712195422432e-05, + "loss": 0.0967, + "step": 2035 + }, + { + "epoch": 0.5119518162996424, + "grad_norm": 0.35342133045196533, + "learning_rate": 1.744014646042663e-05, + "loss": 0.0834, + "step": 2040 + }, + { + "epoch": 0.5132066001631219, + "grad_norm": 0.39632728695869446, + "learning_rate": 1.7426549986634135e-05, + "loss": 0.0866, + "step": 2045 + }, + { + "epoch": 0.5144613840266015, + "grad_norm": 0.4647534489631653, + "learning_rate": 1.741292283021847e-05, + "loss": 0.0834, + "step": 2050 + }, + { + "epoch": 0.5157161678900809, + "grad_norm": 0.3376861810684204, + "learning_rate": 1.7399265047479926e-05, + "loss": 0.0771, + "step": 2055 + }, + { + "epoch": 0.5169709517535604, + "grad_norm": 0.3715597689151764, + "learning_rate": 1.7385576694845324e-05, + "loss": 0.0886, + "step": 2060 + }, + { + "epoch": 0.51822573561704, + "grad_norm": 0.4062286615371704, + "learning_rate": 1.7371857828867778e-05, + "loss": 0.0886, + "step": 2065 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 0.2665550708770752, + "learning_rate": 1.7358108506226477e-05, + "loss": 0.0918, + "step": 2070 + }, + { + "epoch": 0.520735303343999, + "grad_norm": 0.4846581220626831, + "learning_rate": 1.7344328783726436e-05, + "loss": 0.0834, + "step": 2075 + }, + { + "epoch": 0.5219900872074785, + "grad_norm": 0.2385398894548416, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.0953, + "step": 2080 + }, + { + "epoch": 0.5232448710709581, + "grad_norm": 0.5059955716133118, + "learning_rate": 1.7316678366997935e-05, + "loss": 0.0796, + "step": 2085 + }, + { + "epoch": 0.5244996549344375, + "grad_norm": 0.5229114294052124, + "learning_rate": 1.7302807787006547e-05, + "loss": 0.0953, + "step": 2090 + }, + { + "epoch": 0.5257544387979171, + "grad_norm": 0.3612153232097626, + "learning_rate": 1.728890703563009e-05, + "loss": 0.0874, + "step": 2095 + }, + { + "epoch": 0.5270092226613966, + "grad_norm": 0.2730209529399872, + "learning_rate": 1.7274976170299197e-05, + "loss": 0.0977, + "step": 2100 + }, + { + "epoch": 0.528264006524876, + "grad_norm": 0.28223976492881775, + "learning_rate": 1.726101524856893e-05, + "loss": 0.0901, + "step": 2105 + }, + { + "epoch": 0.5295187903883556, + "grad_norm": 0.24942927062511444, + "learning_rate": 1.724702432811852e-05, + "loss": 0.0978, + "step": 2110 + }, + { + "epoch": 0.5307735742518351, + "grad_norm": 0.21909397840499878, + "learning_rate": 1.7233003466751133e-05, + "loss": 0.0978, + "step": 2115 + }, + { + "epoch": 0.5320283581153147, + "grad_norm": 0.28112301230430603, + "learning_rate": 1.7218952722393646e-05, + "loss": 0.0899, + "step": 2120 + }, + { + "epoch": 0.5332831419787941, + "grad_norm": 0.23570087552070618, + "learning_rate": 1.7204872153096386e-05, + "loss": 0.0839, + "step": 2125 + }, + { + "epoch": 0.5345379258422737, + "grad_norm": 0.5193778872489929, + "learning_rate": 1.719076181703291e-05, + "loss": 0.1028, + "step": 2130 + }, + { + "epoch": 0.5357927097057532, + "grad_norm": 0.2834993600845337, + "learning_rate": 1.7176621772499752e-05, + "loss": 0.0984, + "step": 2135 + }, + { + "epoch": 0.5370474935692326, + "grad_norm": 0.41437774896621704, + "learning_rate": 1.716245207791618e-05, + "loss": 0.092, + "step": 2140 + }, + { + "epoch": 0.5383022774327122, + "grad_norm": 0.4809644818305969, + "learning_rate": 1.714825279182398e-05, + "loss": 0.0814, + "step": 2145 + }, + { + "epoch": 0.5395570612961917, + "grad_norm": 0.10345987975597382, + "learning_rate": 1.7134023972887164e-05, + "loss": 0.0736, + "step": 2150 + }, + { + "epoch": 0.5408118451596713, + "grad_norm": 0.06666211783885956, + "learning_rate": 1.7119765679891794e-05, + "loss": 0.078, + "step": 2155 + }, + { + "epoch": 0.5420666290231507, + "grad_norm": 0.4196692407131195, + "learning_rate": 1.7105477971745668e-05, + "loss": 0.0963, + "step": 2160 + }, + { + "epoch": 0.5433214128866303, + "grad_norm": 0.1562952995300293, + "learning_rate": 1.7091160907478137e-05, + "loss": 0.0777, + "step": 2165 + }, + { + "epoch": 0.5445761967501098, + "grad_norm": 0.2740727663040161, + "learning_rate": 1.7076814546239825e-05, + "loss": 0.0953, + "step": 2170 + }, + { + "epoch": 0.5458309806135893, + "grad_norm": 0.2551318407058716, + "learning_rate": 1.7062438947302405e-05, + "loss": 0.0775, + "step": 2175 + }, + { + "epoch": 0.5470857644770688, + "grad_norm": 0.9234172105789185, + "learning_rate": 1.704803417005833e-05, + "loss": 0.089, + "step": 2180 + }, + { + "epoch": 0.5483405483405484, + "grad_norm": 0.29249778389930725, + "learning_rate": 1.7033600274020616e-05, + "loss": 0.0925, + "step": 2185 + }, + { + "epoch": 0.5495953322040279, + "grad_norm": 0.1841478943824768, + "learning_rate": 1.7019137318822577e-05, + "loss": 0.0937, + "step": 2190 + }, + { + "epoch": 0.5508501160675073, + "grad_norm": 0.15575248003005981, + "learning_rate": 1.7004645364217584e-05, + "loss": 0.0894, + "step": 2195 + }, + { + "epoch": 0.5521048999309869, + "grad_norm": 0.37570688128471375, + "learning_rate": 1.699012447007882e-05, + "loss": 0.0853, + "step": 2200 + }, + { + "epoch": 0.5533596837944664, + "grad_norm": 0.49081242084503174, + "learning_rate": 1.6975574696399033e-05, + "loss": 0.0938, + "step": 2205 + }, + { + "epoch": 0.5546144676579459, + "grad_norm": 0.39014649391174316, + "learning_rate": 1.6960996103290282e-05, + "loss": 0.0876, + "step": 2210 + }, + { + "epoch": 0.5558692515214254, + "grad_norm": 0.48171404004096985, + "learning_rate": 1.694638875098369e-05, + "loss": 0.0865, + "step": 2215 + }, + { + "epoch": 0.557124035384905, + "grad_norm": 0.18437805771827698, + "learning_rate": 1.693175269982921e-05, + "loss": 0.0895, + "step": 2220 + }, + { + "epoch": 0.5583788192483845, + "grad_norm": 0.35959693789482117, + "learning_rate": 1.691708801029535e-05, + "loss": 0.0898, + "step": 2225 + }, + { + "epoch": 0.5596336031118639, + "grad_norm": 0.5558937191963196, + "learning_rate": 1.6902394742968945e-05, + "loss": 0.0974, + "step": 2230 + }, + { + "epoch": 0.5608883869753435, + "grad_norm": 0.2495732605457306, + "learning_rate": 1.68876729585549e-05, + "loss": 0.0879, + "step": 2235 + }, + { + "epoch": 0.562143170838823, + "grad_norm": 0.1926206350326538, + "learning_rate": 1.6872922717875923e-05, + "loss": 0.0889, + "step": 2240 + }, + { + "epoch": 0.5633979547023026, + "grad_norm": 0.17689497768878937, + "learning_rate": 1.6858144081872315e-05, + "loss": 0.0924, + "step": 2245 + }, + { + "epoch": 0.564652738565782, + "grad_norm": 0.10556932538747787, + "learning_rate": 1.6843337111601663e-05, + "loss": 0.086, + "step": 2250 + }, + { + "epoch": 0.5659075224292616, + "grad_norm": 0.19952704012393951, + "learning_rate": 1.6828501868238637e-05, + "loss": 0.0801, + "step": 2255 + }, + { + "epoch": 0.5671623062927411, + "grad_norm": 0.2911912202835083, + "learning_rate": 1.6813638413074707e-05, + "loss": 0.0925, + "step": 2260 + }, + { + "epoch": 0.5684170901562206, + "grad_norm": 0.23794835805892944, + "learning_rate": 1.67987468075179e-05, + "loss": 0.0913, + "step": 2265 + }, + { + "epoch": 0.5696718740197001, + "grad_norm": 0.1402294784784317, + "learning_rate": 1.6783827113092547e-05, + "loss": 0.0882, + "step": 2270 + }, + { + "epoch": 0.5709266578831796, + "grad_norm": 0.15708528459072113, + "learning_rate": 1.6768879391439035e-05, + "loss": 0.0941, + "step": 2275 + }, + { + "epoch": 0.5721814417466592, + "grad_norm": 0.37110403180122375, + "learning_rate": 1.6753903704313527e-05, + "loss": 0.0969, + "step": 2280 + }, + { + "epoch": 0.5734362256101386, + "grad_norm": 0.2311350554227829, + "learning_rate": 1.6738900113587745e-05, + "loss": 0.0849, + "step": 2285 + }, + { + "epoch": 0.5746910094736182, + "grad_norm": 0.22549764811992645, + "learning_rate": 1.6723868681248677e-05, + "loss": 0.0923, + "step": 2290 + }, + { + "epoch": 0.5759457933370977, + "grad_norm": 0.12788964807987213, + "learning_rate": 1.6708809469398347e-05, + "loss": 0.0798, + "step": 2295 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 0.4113115668296814, + "learning_rate": 1.6693722540253554e-05, + "loss": 0.0794, + "step": 2300 + }, + { + "epoch": 0.5784553610640567, + "grad_norm": 0.28315308690071106, + "learning_rate": 1.6678607956145596e-05, + "loss": 0.0855, + "step": 2305 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.18012386560440063, + "learning_rate": 1.6663465779520042e-05, + "loss": 0.0906, + "step": 2310 + }, + { + "epoch": 0.5809649287910158, + "grad_norm": 0.0916949212551117, + "learning_rate": 1.6648296072936445e-05, + "loss": 0.0807, + "step": 2315 + }, + { + "epoch": 0.5822197126544952, + "grad_norm": 0.3671141266822815, + "learning_rate": 1.6633098899068112e-05, + "loss": 0.093, + "step": 2320 + }, + { + "epoch": 0.5834744965179748, + "grad_norm": 0.16488447785377502, + "learning_rate": 1.6617874320701813e-05, + "loss": 0.0883, + "step": 2325 + }, + { + "epoch": 0.5847292803814543, + "grad_norm": 0.291689395904541, + "learning_rate": 1.660262240073756e-05, + "loss": 0.0908, + "step": 2330 + }, + { + "epoch": 0.5859840642449338, + "grad_norm": 0.25021910667419434, + "learning_rate": 1.658734320218831e-05, + "loss": 0.0799, + "step": 2335 + }, + { + "epoch": 0.5872388481084133, + "grad_norm": 0.11796696484088898, + "learning_rate": 1.6572036788179728e-05, + "loss": 0.0904, + "step": 2340 + }, + { + "epoch": 0.5884936319718929, + "grad_norm": 0.14167477190494537, + "learning_rate": 1.6556703221949912e-05, + "loss": 0.0912, + "step": 2345 + }, + { + "epoch": 0.5897484158353724, + "grad_norm": 0.12993638217449188, + "learning_rate": 1.6541342566849145e-05, + "loss": 0.0851, + "step": 2350 + }, + { + "epoch": 0.5910031996988518, + "grad_norm": 0.23770704865455627, + "learning_rate": 1.652595488633963e-05, + "loss": 0.0898, + "step": 2355 + }, + { + "epoch": 0.5922579835623314, + "grad_norm": 0.5402049422264099, + "learning_rate": 1.6510540243995216e-05, + "loss": 0.0882, + "step": 2360 + }, + { + "epoch": 0.5935127674258109, + "grad_norm": 0.16188447177410126, + "learning_rate": 1.6495098703501153e-05, + "loss": 0.0781, + "step": 2365 + }, + { + "epoch": 0.5947675512892904, + "grad_norm": 0.33231815695762634, + "learning_rate": 1.6479630328653814e-05, + "loss": 0.0879, + "step": 2370 + }, + { + "epoch": 0.5960223351527699, + "grad_norm": 0.3051691949367523, + "learning_rate": 1.6464135183360444e-05, + "loss": 0.1065, + "step": 2375 + }, + { + "epoch": 0.5972771190162495, + "grad_norm": 0.09856946021318436, + "learning_rate": 1.6448613331638877e-05, + "loss": 0.0931, + "step": 2380 + }, + { + "epoch": 0.598531902879729, + "grad_norm": 0.3320614993572235, + "learning_rate": 1.6433064837617294e-05, + "loss": 0.0806, + "step": 2385 + }, + { + "epoch": 0.5997866867432085, + "grad_norm": 0.6440458297729492, + "learning_rate": 1.641748976553395e-05, + "loss": 0.0981, + "step": 2390 + }, + { + "epoch": 0.601041470606688, + "grad_norm": 0.24246153235435486, + "learning_rate": 1.64018881797369e-05, + "loss": 0.088, + "step": 2395 + }, + { + "epoch": 0.6022962544701675, + "grad_norm": 0.21367953717708588, + "learning_rate": 1.6386260144683744e-05, + "loss": 0.087, + "step": 2400 + }, + { + "epoch": 0.603551038333647, + "grad_norm": 0.2851586639881134, + "learning_rate": 1.6370605724941356e-05, + "loss": 0.0877, + "step": 2405 + }, + { + "epoch": 0.6048058221971265, + "grad_norm": 0.19967404007911682, + "learning_rate": 1.6354924985185614e-05, + "loss": 0.0888, + "step": 2410 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.22060827910900116, + "learning_rate": 1.633921799020114e-05, + "loss": 0.0845, + "step": 2415 + }, + { + "epoch": 0.6073153899240856, + "grad_norm": 0.4157061278820038, + "learning_rate": 1.632348480488103e-05, + "loss": 0.0899, + "step": 2420 + }, + { + "epoch": 0.6085701737875651, + "grad_norm": 0.27656257152557373, + "learning_rate": 1.6307725494226586e-05, + "loss": 0.09, + "step": 2425 + }, + { + "epoch": 0.6098249576510446, + "grad_norm": 0.3946853578090668, + "learning_rate": 1.6291940123347033e-05, + "loss": 0.0863, + "step": 2430 + }, + { + "epoch": 0.6110797415145242, + "grad_norm": 0.2414676994085312, + "learning_rate": 1.6276128757459282e-05, + "loss": 0.0905, + "step": 2435 + }, + { + "epoch": 0.6123345253780036, + "grad_norm": 0.23438376188278198, + "learning_rate": 1.6260291461887628e-05, + "loss": 0.0726, + "step": 2440 + }, + { + "epoch": 0.6135893092414831, + "grad_norm": 0.20305827260017395, + "learning_rate": 1.6244428302063506e-05, + "loss": 0.0891, + "step": 2445 + }, + { + "epoch": 0.6148440931049627, + "grad_norm": 0.3930656909942627, + "learning_rate": 1.62285393435252e-05, + "loss": 0.0884, + "step": 2450 + }, + { + "epoch": 0.6160988769684422, + "grad_norm": 0.23331406712532043, + "learning_rate": 1.6212624651917573e-05, + "loss": 0.0904, + "step": 2455 + }, + { + "epoch": 0.6173536608319217, + "grad_norm": 0.34075310826301575, + "learning_rate": 1.6196684292991827e-05, + "loss": 0.0941, + "step": 2460 + }, + { + "epoch": 0.6186084446954012, + "grad_norm": 0.33201703429222107, + "learning_rate": 1.6180718332605185e-05, + "loss": 0.0838, + "step": 2465 + }, + { + "epoch": 0.6198632285588808, + "grad_norm": 0.10392648726701736, + "learning_rate": 1.6164726836720656e-05, + "loss": 0.0974, + "step": 2470 + }, + { + "epoch": 0.6211180124223602, + "grad_norm": 0.21417252719402313, + "learning_rate": 1.614870987140674e-05, + "loss": 0.0891, + "step": 2475 + }, + { + "epoch": 0.6223727962858397, + "grad_norm": 0.24638135731220245, + "learning_rate": 1.6132667502837164e-05, + "loss": 0.0827, + "step": 2480 + }, + { + "epoch": 0.6236275801493193, + "grad_norm": 0.183788001537323, + "learning_rate": 1.611659979729062e-05, + "loss": 0.0917, + "step": 2485 + }, + { + "epoch": 0.6248823640127988, + "grad_norm": 0.25125667452812195, + "learning_rate": 1.6100506821150455e-05, + "loss": 0.0888, + "step": 2490 + }, + { + "epoch": 0.6261371478762783, + "grad_norm": 0.24602219462394714, + "learning_rate": 1.6084388640904452e-05, + "loss": 0.0773, + "step": 2495 + }, + { + "epoch": 0.6273919317397578, + "grad_norm": 0.6547009944915771, + "learning_rate": 1.60682453231445e-05, + "loss": 0.1033, + "step": 2500 + }, + { + "epoch": 0.6286467156032374, + "grad_norm": 0.11527785658836365, + "learning_rate": 1.605207693456635e-05, + "loss": 0.0947, + "step": 2505 + }, + { + "epoch": 0.6299014994667168, + "grad_norm": 0.3719431459903717, + "learning_rate": 1.6035883541969336e-05, + "loss": 0.0873, + "step": 2510 + }, + { + "epoch": 0.6311562833301964, + "grad_norm": 0.2360219806432724, + "learning_rate": 1.601966521225609e-05, + "loss": 0.0826, + "step": 2515 + }, + { + "epoch": 0.6324110671936759, + "grad_norm": 0.27890655398368835, + "learning_rate": 1.6003422012432275e-05, + "loss": 0.0955, + "step": 2520 + }, + { + "epoch": 0.6336658510571554, + "grad_norm": 0.14903779327869415, + "learning_rate": 1.5987154009606308e-05, + "loss": 0.0951, + "step": 2525 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.20811066031455994, + "learning_rate": 1.5970861270989065e-05, + "loss": 0.0874, + "step": 2530 + }, + { + "epoch": 0.6361754187841144, + "grad_norm": 0.39913272857666016, + "learning_rate": 1.5954543863893638e-05, + "loss": 0.0846, + "step": 2535 + }, + { + "epoch": 0.637430202647594, + "grad_norm": 0.3802116811275482, + "learning_rate": 1.5938201855735017e-05, + "loss": 0.0754, + "step": 2540 + }, + { + "epoch": 0.6386849865110734, + "grad_norm": 0.4720747172832489, + "learning_rate": 1.592183531402984e-05, + "loss": 0.0872, + "step": 2545 + }, + { + "epoch": 0.639939770374553, + "grad_norm": 0.46679922938346863, + "learning_rate": 1.590544430639611e-05, + "loss": 0.0867, + "step": 2550 + }, + { + "epoch": 0.6411945542380325, + "grad_norm": 0.29669615626335144, + "learning_rate": 1.5889028900552897e-05, + "loss": 0.0988, + "step": 2555 + }, + { + "epoch": 0.6424493381015121, + "grad_norm": 0.20683659613132477, + "learning_rate": 1.587258916432008e-05, + "loss": 0.0935, + "step": 2560 + }, + { + "epoch": 0.6437041219649915, + "grad_norm": 0.3284091055393219, + "learning_rate": 1.5856125165618056e-05, + "loss": 0.0779, + "step": 2565 + }, + { + "epoch": 0.644958905828471, + "grad_norm": 0.24657922983169556, + "learning_rate": 1.5839636972467466e-05, + "loss": 0.0827, + "step": 2570 + }, + { + "epoch": 0.6462136896919506, + "grad_norm": 0.32977283000946045, + "learning_rate": 1.5823124652988907e-05, + "loss": 0.0955, + "step": 2575 + }, + { + "epoch": 0.64746847355543, + "grad_norm": 0.28097760677337646, + "learning_rate": 1.580658827540265e-05, + "loss": 0.0837, + "step": 2580 + }, + { + "epoch": 0.6487232574189096, + "grad_norm": 0.12431260943412781, + "learning_rate": 1.5790027908028366e-05, + "loss": 0.0901, + "step": 2585 + }, + { + "epoch": 0.6499780412823891, + "grad_norm": 0.22479486465454102, + "learning_rate": 1.5773443619284844e-05, + "loss": 0.084, + "step": 2590 + }, + { + "epoch": 0.6512328251458687, + "grad_norm": 0.36471477150917053, + "learning_rate": 1.5756835477689683e-05, + "loss": 0.0811, + "step": 2595 + }, + { + "epoch": 0.6524876090093481, + "grad_norm": 0.11465982347726822, + "learning_rate": 1.574020355185906e-05, + "loss": 0.0775, + "step": 2600 + }, + { + "epoch": 0.6537423928728276, + "grad_norm": 0.22774779796600342, + "learning_rate": 1.5723547910507392e-05, + "loss": 0.0893, + "step": 2605 + }, + { + "epoch": 0.6549971767363072, + "grad_norm": 0.17315097153186798, + "learning_rate": 1.5706868622447084e-05, + "loss": 0.0924, + "step": 2610 + }, + { + "epoch": 0.6562519605997866, + "grad_norm": 0.21010887622833252, + "learning_rate": 1.5690165756588235e-05, + "loss": 0.085, + "step": 2615 + }, + { + "epoch": 0.6575067444632662, + "grad_norm": 0.20467358827590942, + "learning_rate": 1.5673439381938365e-05, + "loss": 0.0857, + "step": 2620 + }, + { + "epoch": 0.6587615283267457, + "grad_norm": 0.0706283450126648, + "learning_rate": 1.565668956760211e-05, + "loss": 0.0921, + "step": 2625 + }, + { + "epoch": 0.6600163121902253, + "grad_norm": 0.10635245591402054, + "learning_rate": 1.563991638278094e-05, + "loss": 0.0901, + "step": 2630 + }, + { + "epoch": 0.6612710960537047, + "grad_norm": 0.2414940446615219, + "learning_rate": 1.56231198967729e-05, + "loss": 0.0979, + "step": 2635 + }, + { + "epoch": 0.6625258799171843, + "grad_norm": 0.3018660843372345, + "learning_rate": 1.560630017897229e-05, + "loss": 0.0774, + "step": 2640 + }, + { + "epoch": 0.6637806637806638, + "grad_norm": 0.08618688583374023, + "learning_rate": 1.558945729886938e-05, + "loss": 0.0917, + "step": 2645 + }, + { + "epoch": 0.6650354476441434, + "grad_norm": 0.3376272916793823, + "learning_rate": 1.5572591326050167e-05, + "loss": 0.0997, + "step": 2650 + }, + { + "epoch": 0.6662902315076228, + "grad_norm": 0.47232964634895325, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.0984, + "step": 2655 + }, + { + "epoch": 0.6675450153711023, + "grad_norm": 0.3733605742454529, + "learning_rate": 1.5538790381083457e-05, + "loss": 0.095, + "step": 2660 + }, + { + "epoch": 0.6687997992345819, + "grad_norm": 0.197517529129982, + "learning_rate": 1.5521855548583807e-05, + "loss": 0.0882, + "step": 2665 + }, + { + "epoch": 0.6700545830980613, + "grad_norm": 0.5959796905517578, + "learning_rate": 1.550489790266294e-05, + "loss": 0.0978, + "step": 2670 + }, + { + "epoch": 0.6713093669615409, + "grad_norm": 0.3124695122241974, + "learning_rate": 1.5487917513381e-05, + "loss": 0.0877, + "step": 2675 + }, + { + "epoch": 0.6725641508250204, + "grad_norm": 0.5856664776802063, + "learning_rate": 1.5470914450892066e-05, + "loss": 0.0871, + "step": 2680 + }, + { + "epoch": 0.6738189346885, + "grad_norm": 0.11247047036886215, + "learning_rate": 1.5453888785443916e-05, + "loss": 0.0787, + "step": 2685 + }, + { + "epoch": 0.6750737185519794, + "grad_norm": 0.21094247698783875, + "learning_rate": 1.54368405873777e-05, + "loss": 0.0905, + "step": 2690 + }, + { + "epoch": 0.6763285024154589, + "grad_norm": 0.32155776023864746, + "learning_rate": 1.5419769927127664e-05, + "loss": 0.0896, + "step": 2695 + }, + { + "epoch": 0.6775832862789385, + "grad_norm": 0.1487855464220047, + "learning_rate": 1.5402676875220847e-05, + "loss": 0.0924, + "step": 2700 + }, + { + "epoch": 0.6788380701424179, + "grad_norm": 0.3109644651412964, + "learning_rate": 1.5385561502276813e-05, + "loss": 0.085, + "step": 2705 + }, + { + "epoch": 0.6800928540058975, + "grad_norm": 0.13988961279392242, + "learning_rate": 1.536842387900733e-05, + "loss": 0.0806, + "step": 2710 + }, + { + "epoch": 0.681347637869377, + "grad_norm": 0.4081459045410156, + "learning_rate": 1.5351264076216114e-05, + "loss": 0.0912, + "step": 2715 + }, + { + "epoch": 0.6826024217328566, + "grad_norm": 0.23864784836769104, + "learning_rate": 1.533408216479849e-05, + "loss": 0.088, + "step": 2720 + }, + { + "epoch": 0.683857205596336, + "grad_norm": 0.24918711185455322, + "learning_rate": 1.531687821574114e-05, + "loss": 0.0904, + "step": 2725 + }, + { + "epoch": 0.6851119894598156, + "grad_norm": 0.23600496351718903, + "learning_rate": 1.5299652300121792e-05, + "loss": 0.0944, + "step": 2730 + }, + { + "epoch": 0.6863667733232951, + "grad_norm": 0.1220758855342865, + "learning_rate": 1.5282404489108925e-05, + "loss": 0.0811, + "step": 2735 + }, + { + "epoch": 0.6876215571867745, + "grad_norm": 0.1529039442539215, + "learning_rate": 1.5265134853961477e-05, + "loss": 0.0985, + "step": 2740 + }, + { + "epoch": 0.6888763410502541, + "grad_norm": 0.3052196502685547, + "learning_rate": 1.524784346602856e-05, + "loss": 0.0917, + "step": 2745 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 0.3731965720653534, + "learning_rate": 1.5230530396749148e-05, + "loss": 0.0906, + "step": 2750 + }, + { + "epoch": 0.6913859087772132, + "grad_norm": 0.09618979692459106, + "learning_rate": 1.5213195717651793e-05, + "loss": 0.0914, + "step": 2755 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.3307301104068756, + "learning_rate": 1.5195839500354337e-05, + "loss": 0.0859, + "step": 2760 + }, + { + "epoch": 0.6938954765041722, + "grad_norm": 0.12252155691385269, + "learning_rate": 1.5178461816563594e-05, + "loss": 0.0881, + "step": 2765 + }, + { + "epoch": 0.6951502603676517, + "grad_norm": 0.2949330806732178, + "learning_rate": 1.5161062738075068e-05, + "loss": 0.0782, + "step": 2770 + }, + { + "epoch": 0.6964050442311311, + "grad_norm": 0.3150964379310608, + "learning_rate": 1.5143642336772663e-05, + "loss": 0.0866, + "step": 2775 + }, + { + "epoch": 0.6976598280946107, + "grad_norm": 0.44747477769851685, + "learning_rate": 1.5126200684628372e-05, + "loss": 0.0943, + "step": 2780 + }, + { + "epoch": 0.6989146119580902, + "grad_norm": 0.1062508150935173, + "learning_rate": 1.5108737853701981e-05, + "loss": 0.1027, + "step": 2785 + }, + { + "epoch": 0.7001693958215698, + "grad_norm": 0.16572189331054688, + "learning_rate": 1.5091253916140789e-05, + "loss": 0.0892, + "step": 2790 + }, + { + "epoch": 0.7014241796850492, + "grad_norm": 0.1096203476190567, + "learning_rate": 1.5073748944179282e-05, + "loss": 0.0889, + "step": 2795 + }, + { + "epoch": 0.7026789635485288, + "grad_norm": 0.3676256835460663, + "learning_rate": 1.5056223010138857e-05, + "loss": 0.0998, + "step": 2800 + }, + { + "epoch": 0.7039337474120083, + "grad_norm": 0.2482236921787262, + "learning_rate": 1.5038676186427515e-05, + "loss": 0.089, + "step": 2805 + }, + { + "epoch": 0.7051885312754878, + "grad_norm": 0.13800540566444397, + "learning_rate": 1.5021108545539562e-05, + "loss": 0.0923, + "step": 2810 + }, + { + "epoch": 0.7064433151389673, + "grad_norm": 0.33005279302597046, + "learning_rate": 1.5003520160055303e-05, + "loss": 0.0894, + "step": 2815 + }, + { + "epoch": 0.7076980990024468, + "grad_norm": 0.34894976019859314, + "learning_rate": 1.4985911102640762e-05, + "loss": 0.0975, + "step": 2820 + }, + { + "epoch": 0.7089528828659264, + "grad_norm": 0.2921280860900879, + "learning_rate": 1.4968281446047357e-05, + "loss": 0.089, + "step": 2825 + }, + { + "epoch": 0.7102076667294058, + "grad_norm": 0.3317979872226715, + "learning_rate": 1.4950631263111615e-05, + "loss": 0.0889, + "step": 2830 + }, + { + "epoch": 0.7114624505928854, + "grad_norm": 0.16637681424617767, + "learning_rate": 1.4932960626754867e-05, + "loss": 0.0788, + "step": 2835 + }, + { + "epoch": 0.7127172344563649, + "grad_norm": 0.15462718904018402, + "learning_rate": 1.491526960998295e-05, + "loss": 0.0887, + "step": 2840 + }, + { + "epoch": 0.7139720183198444, + "grad_norm": 0.23913145065307617, + "learning_rate": 1.4897558285885896e-05, + "loss": 0.0849, + "step": 2845 + }, + { + "epoch": 0.7152268021833239, + "grad_norm": 0.44012489914894104, + "learning_rate": 1.487982672763764e-05, + "loss": 0.0802, + "step": 2850 + }, + { + "epoch": 0.7164815860468035, + "grad_norm": 0.16587427258491516, + "learning_rate": 1.4862075008495718e-05, + "loss": 0.0913, + "step": 2855 + }, + { + "epoch": 0.717736369910283, + "grad_norm": 0.4225013256072998, + "learning_rate": 1.4844303201800949e-05, + "loss": 0.0828, + "step": 2860 + }, + { + "epoch": 0.7189911537737624, + "grad_norm": 0.3589664399623871, + "learning_rate": 1.4826511380977155e-05, + "loss": 0.0861, + "step": 2865 + }, + { + "epoch": 0.720245937637242, + "grad_norm": 0.13328975439071655, + "learning_rate": 1.4808699619530841e-05, + "loss": 0.0897, + "step": 2870 + }, + { + "epoch": 0.7215007215007215, + "grad_norm": 0.5425406098365784, + "learning_rate": 1.479086799105089e-05, + "loss": 0.0789, + "step": 2875 + }, + { + "epoch": 0.722755505364201, + "grad_norm": 0.22545233368873596, + "learning_rate": 1.4773016569208283e-05, + "loss": 0.0814, + "step": 2880 + }, + { + "epoch": 0.7240102892276805, + "grad_norm": 0.2654625177383423, + "learning_rate": 1.4755145427755755e-05, + "loss": 0.0846, + "step": 2885 + }, + { + "epoch": 0.7252650730911601, + "grad_norm": 0.4927530288696289, + "learning_rate": 1.4737254640527525e-05, + "loss": 0.0892, + "step": 2890 + }, + { + "epoch": 0.7265198569546396, + "grad_norm": 0.317210853099823, + "learning_rate": 1.4719344281438977e-05, + "loss": 0.0858, + "step": 2895 + }, + { + "epoch": 0.727774640818119, + "grad_norm": 0.3418554961681366, + "learning_rate": 1.4701414424486353e-05, + "loss": 0.0859, + "step": 2900 + }, + { + "epoch": 0.7290294246815986, + "grad_norm": 0.21974579989910126, + "learning_rate": 1.4683465143746452e-05, + "loss": 0.0988, + "step": 2905 + }, + { + "epoch": 0.7302842085450781, + "grad_norm": 0.35090965032577515, + "learning_rate": 1.466549651337632e-05, + "loss": 0.0909, + "step": 2910 + }, + { + "epoch": 0.7315389924085576, + "grad_norm": 0.27880313992500305, + "learning_rate": 1.4647508607612952e-05, + "loss": 0.0907, + "step": 2915 + }, + { + "epoch": 0.7327937762720371, + "grad_norm": 0.12414207309484482, + "learning_rate": 1.4629501500772962e-05, + "loss": 0.0912, + "step": 2920 + }, + { + "epoch": 0.7340485601355167, + "grad_norm": 0.3848329782485962, + "learning_rate": 1.4611475267252318e-05, + "loss": 0.0813, + "step": 2925 + }, + { + "epoch": 0.7353033439989962, + "grad_norm": 0.07426943629980087, + "learning_rate": 1.4593429981525985e-05, + "loss": 0.0862, + "step": 2930 + }, + { + "epoch": 0.7365581278624757, + "grad_norm": 0.1772748976945877, + "learning_rate": 1.4575365718147655e-05, + "loss": 0.111, + "step": 2935 + }, + { + "epoch": 0.7378129117259552, + "grad_norm": 0.23796626925468445, + "learning_rate": 1.4557282551749428e-05, + "loss": 0.0852, + "step": 2940 + }, + { + "epoch": 0.7390676955894347, + "grad_norm": 0.22509251534938812, + "learning_rate": 1.4539180557041494e-05, + "loss": 0.0804, + "step": 2945 + }, + { + "epoch": 0.7403224794529142, + "grad_norm": 0.29270699620246887, + "learning_rate": 1.452105980881183e-05, + "loss": 0.0925, + "step": 2950 + }, + { + "epoch": 0.7415772633163937, + "grad_norm": 0.27449578046798706, + "learning_rate": 1.4502920381925905e-05, + "loss": 0.084, + "step": 2955 + }, + { + "epoch": 0.7428320471798733, + "grad_norm": 0.23798063397407532, + "learning_rate": 1.4484762351326344e-05, + "loss": 0.0894, + "step": 2960 + }, + { + "epoch": 0.7440868310433528, + "grad_norm": 0.1779652088880539, + "learning_rate": 1.4466585792032644e-05, + "loss": 0.0829, + "step": 2965 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.10709716379642487, + "learning_rate": 1.4448390779140844e-05, + "loss": 0.0889, + "step": 2970 + }, + { + "epoch": 0.7465963987703118, + "grad_norm": 0.3699777126312256, + "learning_rate": 1.4430177387823232e-05, + "loss": 0.0925, + "step": 2975 + }, + { + "epoch": 0.7478511826337914, + "grad_norm": 0.23383845388889313, + "learning_rate": 1.4411945693328017e-05, + "loss": 0.0802, + "step": 2980 + }, + { + "epoch": 0.7491059664972708, + "grad_norm": 0.17309200763702393, + "learning_rate": 1.4393695770979038e-05, + "loss": 0.092, + "step": 2985 + }, + { + "epoch": 0.7503607503607503, + "grad_norm": 0.08418245613574982, + "learning_rate": 1.4375427696175434e-05, + "loss": 0.088, + "step": 2990 + }, + { + "epoch": 0.7516155342242299, + "grad_norm": 0.4208645820617676, + "learning_rate": 1.4357141544391342e-05, + "loss": 0.0946, + "step": 2995 + }, + { + "epoch": 0.7528703180877094, + "grad_norm": 0.10623873025178909, + "learning_rate": 1.4338837391175582e-05, + "loss": 0.0876, + "step": 3000 + }, + { + "epoch": 0.7541251019511889, + "grad_norm": 0.13417641818523407, + "learning_rate": 1.4320515312151352e-05, + "loss": 0.0853, + "step": 3005 + }, + { + "epoch": 0.7553798858146684, + "grad_norm": 0.11833745986223221, + "learning_rate": 1.4302175383015907e-05, + "loss": 0.0923, + "step": 3010 + }, + { + "epoch": 0.756634669678148, + "grad_norm": 0.13997751474380493, + "learning_rate": 1.4283817679540246e-05, + "loss": 0.0842, + "step": 3015 + }, + { + "epoch": 0.7578894535416274, + "grad_norm": 0.4091770648956299, + "learning_rate": 1.4265442277568808e-05, + "loss": 0.0869, + "step": 3020 + }, + { + "epoch": 0.759144237405107, + "grad_norm": 0.38319870829582214, + "learning_rate": 1.4247049253019148e-05, + "loss": 0.0816, + "step": 3025 + }, + { + "epoch": 0.7603990212685865, + "grad_norm": 0.44087597727775574, + "learning_rate": 1.4228638681881633e-05, + "loss": 0.0925, + "step": 3030 + }, + { + "epoch": 0.761653805132066, + "grad_norm": 0.10818012803792953, + "learning_rate": 1.4210210640219117e-05, + "loss": 0.0873, + "step": 3035 + }, + { + "epoch": 0.7629085889955455, + "grad_norm": 0.3094254434108734, + "learning_rate": 1.4191765204166643e-05, + "loss": 0.0881, + "step": 3040 + }, + { + "epoch": 0.764163372859025, + "grad_norm": 0.14086200296878815, + "learning_rate": 1.4173302449931107e-05, + "loss": 0.0874, + "step": 3045 + }, + { + "epoch": 0.7654181567225046, + "grad_norm": 0.10829006880521774, + "learning_rate": 1.4154822453790963e-05, + "loss": 0.0813, + "step": 3050 + }, + { + "epoch": 0.7666729405859841, + "grad_norm": 0.21264980733394623, + "learning_rate": 1.4136325292095899e-05, + "loss": 0.0823, + "step": 3055 + }, + { + "epoch": 0.7679277244494636, + "grad_norm": 0.07161971926689148, + "learning_rate": 1.4117811041266518e-05, + "loss": 0.0878, + "step": 3060 + }, + { + "epoch": 0.7691825083129431, + "grad_norm": 0.24035820364952087, + "learning_rate": 1.4099279777794026e-05, + "loss": 0.0811, + "step": 3065 + }, + { + "epoch": 0.7704372921764227, + "grad_norm": 0.17685869336128235, + "learning_rate": 1.4080731578239917e-05, + "loss": 0.0854, + "step": 3070 + }, + { + "epoch": 0.7716920760399021, + "grad_norm": 0.33490845561027527, + "learning_rate": 1.4062166519235665e-05, + "loss": 0.0875, + "step": 3075 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.24075230956077576, + "learning_rate": 1.4043584677482383e-05, + "loss": 0.0942, + "step": 3080 + }, + { + "epoch": 0.7742016437668612, + "grad_norm": 0.16541673243045807, + "learning_rate": 1.4024986129750535e-05, + "loss": 0.0924, + "step": 3085 + }, + { + "epoch": 0.7754564276303407, + "grad_norm": 0.36374107003211975, + "learning_rate": 1.40063709528796e-05, + "loss": 0.0959, + "step": 3090 + }, + { + "epoch": 0.7767112114938202, + "grad_norm": 0.19677504897117615, + "learning_rate": 1.3987739223777756e-05, + "loss": 0.0841, + "step": 3095 + }, + { + "epoch": 0.7779659953572997, + "grad_norm": 0.24688193202018738, + "learning_rate": 1.3969091019421573e-05, + "loss": 0.0795, + "step": 3100 + }, + { + "epoch": 0.7792207792207793, + "grad_norm": 0.4386122524738312, + "learning_rate": 1.3950426416855685e-05, + "loss": 0.091, + "step": 3105 + }, + { + "epoch": 0.7804755630842587, + "grad_norm": 0.281536340713501, + "learning_rate": 1.3931745493192473e-05, + "loss": 0.0809, + "step": 3110 + }, + { + "epoch": 0.7817303469477382, + "grad_norm": 0.3444419801235199, + "learning_rate": 1.391304832561175e-05, + "loss": 0.0819, + "step": 3115 + }, + { + "epoch": 0.7829851308112178, + "grad_norm": 0.3660227060317993, + "learning_rate": 1.3894334991360448e-05, + "loss": 0.0908, + "step": 3120 + }, + { + "epoch": 0.7842399146746973, + "grad_norm": 0.15960213541984558, + "learning_rate": 1.3875605567752275e-05, + "loss": 0.0817, + "step": 3125 + }, + { + "epoch": 0.7854946985381768, + "grad_norm": 0.10506850481033325, + "learning_rate": 1.3856860132167423e-05, + "loss": 0.0867, + "step": 3130 + }, + { + "epoch": 0.7867494824016563, + "grad_norm": 0.28331154584884644, + "learning_rate": 1.3838098762052237e-05, + "loss": 0.0899, + "step": 3135 + }, + { + "epoch": 0.7880042662651359, + "grad_norm": 0.07872291654348373, + "learning_rate": 1.381932153491889e-05, + "loss": 0.0864, + "step": 3140 + }, + { + "epoch": 0.7892590501286153, + "grad_norm": 0.26927658915519714, + "learning_rate": 1.3800528528345074e-05, + "loss": 0.0928, + "step": 3145 + }, + { + "epoch": 0.7905138339920948, + "grad_norm": 0.4041297733783722, + "learning_rate": 1.378171981997367e-05, + "loss": 0.0891, + "step": 3150 + }, + { + "epoch": 0.7917686178555744, + "grad_norm": 0.29741108417510986, + "learning_rate": 1.3762895487512426e-05, + "loss": 0.0827, + "step": 3155 + }, + { + "epoch": 0.7930234017190539, + "grad_norm": 0.11423831433057785, + "learning_rate": 1.3744055608733654e-05, + "loss": 0.0853, + "step": 3160 + }, + { + "epoch": 0.7942781855825334, + "grad_norm": 0.07605528086423874, + "learning_rate": 1.3725200261473879e-05, + "loss": 0.0958, + "step": 3165 + }, + { + "epoch": 0.7955329694460129, + "grad_norm": 0.27038949728012085, + "learning_rate": 1.3706329523633546e-05, + "loss": 0.0866, + "step": 3170 + }, + { + "epoch": 0.7967877533094925, + "grad_norm": 0.17523351311683655, + "learning_rate": 1.3687443473176678e-05, + "loss": 0.092, + "step": 3175 + }, + { + "epoch": 0.7980425371729719, + "grad_norm": 0.14444954693317413, + "learning_rate": 1.3668542188130567e-05, + "loss": 0.0937, + "step": 3180 + }, + { + "epoch": 0.7992973210364515, + "grad_norm": 0.09110607951879501, + "learning_rate": 1.3649625746585442e-05, + "loss": 0.0894, + "step": 3185 + }, + { + "epoch": 0.800552104899931, + "grad_norm": 0.24283112585544586, + "learning_rate": 1.3630694226694159e-05, + "loss": 0.0879, + "step": 3190 + }, + { + "epoch": 0.8018068887634106, + "grad_norm": 0.2000175565481186, + "learning_rate": 1.3611747706671859e-05, + "loss": 0.0809, + "step": 3195 + }, + { + "epoch": 0.80306167262689, + "grad_norm": 0.13179725408554077, + "learning_rate": 1.3592786264795659e-05, + "loss": 0.0792, + "step": 3200 + }, + { + "epoch": 0.8043164564903695, + "grad_norm": 0.17673130333423615, + "learning_rate": 1.357380997940433e-05, + "loss": 0.0819, + "step": 3205 + }, + { + "epoch": 0.8055712403538491, + "grad_norm": 0.1775507628917694, + "learning_rate": 1.3554818928897965e-05, + "loss": 0.094, + "step": 3210 + }, + { + "epoch": 0.8068260242173285, + "grad_norm": 0.10443057119846344, + "learning_rate": 1.3535813191737663e-05, + "loss": 0.0921, + "step": 3215 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.45298513770103455, + "learning_rate": 1.351679284644519e-05, + "loss": 0.092, + "step": 3220 + }, + { + "epoch": 0.8093355919442876, + "grad_norm": 0.1088099479675293, + "learning_rate": 1.3497757971602677e-05, + "loss": 0.0945, + "step": 3225 + }, + { + "epoch": 0.8105903758077672, + "grad_norm": 0.11513658612966537, + "learning_rate": 1.3478708645852272e-05, + "loss": 0.0906, + "step": 3230 + }, + { + "epoch": 0.8118451596712466, + "grad_norm": 0.15573541820049286, + "learning_rate": 1.3459644947895844e-05, + "loss": 0.0895, + "step": 3235 + }, + { + "epoch": 0.8130999435347261, + "grad_norm": 0.548417866230011, + "learning_rate": 1.344056695649462e-05, + "loss": 0.0957, + "step": 3240 + }, + { + "epoch": 0.8143547273982057, + "grad_norm": 0.24571377038955688, + "learning_rate": 1.3421474750468893e-05, + "loss": 0.0852, + "step": 3245 + }, + { + "epoch": 0.8156095112616851, + "grad_norm": 0.1693846732378006, + "learning_rate": 1.3402368408697681e-05, + "loss": 0.0962, + "step": 3250 + }, + { + "epoch": 0.8168642951251647, + "grad_norm": 0.12856455147266388, + "learning_rate": 1.3383248010118404e-05, + "loss": 0.0926, + "step": 3255 + }, + { + "epoch": 0.8181190789886442, + "grad_norm": 0.3115483820438385, + "learning_rate": 1.336411363372655e-05, + "loss": 0.0943, + "step": 3260 + }, + { + "epoch": 0.8193738628521238, + "grad_norm": 0.44211283326148987, + "learning_rate": 1.3344965358575368e-05, + "loss": 0.102, + "step": 3265 + }, + { + "epoch": 0.8206286467156032, + "grad_norm": 0.1086539626121521, + "learning_rate": 1.3325803263775521e-05, + "loss": 0.0968, + "step": 3270 + }, + { + "epoch": 0.8218834305790828, + "grad_norm": 0.24857747554779053, + "learning_rate": 1.3306627428494769e-05, + "loss": 0.097, + "step": 3275 + }, + { + "epoch": 0.8231382144425623, + "grad_norm": 0.5873998403549194, + "learning_rate": 1.3287437931957642e-05, + "loss": 0.0848, + "step": 3280 + }, + { + "epoch": 0.8243929983060417, + "grad_norm": 0.2507260739803314, + "learning_rate": 1.3268234853445113e-05, + "loss": 0.0917, + "step": 3285 + }, + { + "epoch": 0.8256477821695213, + "grad_norm": 0.2204979509115219, + "learning_rate": 1.3249018272294261e-05, + "loss": 0.083, + "step": 3290 + }, + { + "epoch": 0.8269025660330008, + "grad_norm": 0.19563211500644684, + "learning_rate": 1.3229788267897958e-05, + "loss": 0.0803, + "step": 3295 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 0.24326251447200775, + "learning_rate": 1.3210544919704539e-05, + "loss": 0.078, + "step": 3300 + }, + { + "epoch": 0.8294121337599598, + "grad_norm": 0.08204273879528046, + "learning_rate": 1.319128830721745e-05, + "loss": 0.0866, + "step": 3305 + }, + { + "epoch": 0.8306669176234394, + "grad_norm": 0.3777156174182892, + "learning_rate": 1.317201850999496e-05, + "loss": 0.0892, + "step": 3310 + }, + { + "epoch": 0.8319217014869189, + "grad_norm": 0.15763606131076813, + "learning_rate": 1.315273560764979e-05, + "loss": 0.0947, + "step": 3315 + }, + { + "epoch": 0.8331764853503983, + "grad_norm": 0.4323311448097229, + "learning_rate": 1.3133439679848824e-05, + "loss": 0.091, + "step": 3320 + }, + { + "epoch": 0.8344312692138779, + "grad_norm": 0.51838618516922, + "learning_rate": 1.3114130806312744e-05, + "loss": 0.0783, + "step": 3325 + }, + { + "epoch": 0.8356860530773574, + "grad_norm": 0.2101561725139618, + "learning_rate": 1.3094809066815731e-05, + "loss": 0.0797, + "step": 3330 + }, + { + "epoch": 0.836940836940837, + "grad_norm": 0.35820868611335754, + "learning_rate": 1.3075474541185104e-05, + "loss": 0.0904, + "step": 3335 + }, + { + "epoch": 0.8381956208043164, + "grad_norm": 0.17395086586475372, + "learning_rate": 1.3056127309301027e-05, + "loss": 0.0773, + "step": 3340 + }, + { + "epoch": 0.839450404667796, + "grad_norm": 0.22628264129161835, + "learning_rate": 1.3036767451096148e-05, + "loss": 0.0833, + "step": 3345 + }, + { + "epoch": 0.8407051885312755, + "grad_norm": 0.14669981598854065, + "learning_rate": 1.3017395046555284e-05, + "loss": 0.0866, + "step": 3350 + }, + { + "epoch": 0.841959972394755, + "grad_norm": 0.45152294635772705, + "learning_rate": 1.2998010175715081e-05, + "loss": 0.0955, + "step": 3355 + }, + { + "epoch": 0.8432147562582345, + "grad_norm": 0.14131848514080048, + "learning_rate": 1.2978612918663702e-05, + "loss": 0.0805, + "step": 3360 + }, + { + "epoch": 0.844469540121714, + "grad_norm": 0.4170994460582733, + "learning_rate": 1.2959203355540466e-05, + "loss": 0.0885, + "step": 3365 + }, + { + "epoch": 0.8457243239851936, + "grad_norm": 0.2687641382217407, + "learning_rate": 1.2939781566535551e-05, + "loss": 0.0859, + "step": 3370 + }, + { + "epoch": 0.846979107848673, + "grad_norm": 0.15929897129535675, + "learning_rate": 1.2920347631889637e-05, + "loss": 0.0876, + "step": 3375 + }, + { + "epoch": 0.8482338917121526, + "grad_norm": 0.1964198797941208, + "learning_rate": 1.2900901631893585e-05, + "loss": 0.0966, + "step": 3380 + }, + { + "epoch": 0.8494886755756321, + "grad_norm": 0.21609993278980255, + "learning_rate": 1.28814436468881e-05, + "loss": 0.0811, + "step": 3385 + }, + { + "epoch": 0.8507434594391116, + "grad_norm": 0.24047359824180603, + "learning_rate": 1.2861973757263416e-05, + "loss": 0.0858, + "step": 3390 + }, + { + "epoch": 0.8519982433025911, + "grad_norm": 0.25712600350379944, + "learning_rate": 1.2842492043458929e-05, + "loss": 0.0793, + "step": 3395 + }, + { + "epoch": 0.8532530271660707, + "grad_norm": 0.1645137518644333, + "learning_rate": 1.2822998585962909e-05, + "loss": 0.0801, + "step": 3400 + }, + { + "epoch": 0.8545078110295502, + "grad_norm": 0.601332426071167, + "learning_rate": 1.280349346531213e-05, + "loss": 0.0891, + "step": 3405 + }, + { + "epoch": 0.8557625948930296, + "grad_norm": 0.3988315761089325, + "learning_rate": 1.2783976762091554e-05, + "loss": 0.0885, + "step": 3410 + }, + { + "epoch": 0.8570173787565092, + "grad_norm": 0.37066715955734253, + "learning_rate": 1.2764448556934001e-05, + "loss": 0.0935, + "step": 3415 + }, + { + "epoch": 0.8582721626199887, + "grad_norm": 0.1261286735534668, + "learning_rate": 1.274490893051981e-05, + "loss": 0.0824, + "step": 3420 + }, + { + "epoch": 0.8595269464834682, + "grad_norm": 0.11686058342456818, + "learning_rate": 1.2725357963576506e-05, + "loss": 0.0963, + "step": 3425 + }, + { + "epoch": 0.8607817303469477, + "grad_norm": 0.1499367356300354, + "learning_rate": 1.2705795736878461e-05, + "loss": 0.0899, + "step": 3430 + }, + { + "epoch": 0.8620365142104273, + "grad_norm": 0.25637149810791016, + "learning_rate": 1.268622233124658e-05, + "loss": 0.09, + "step": 3435 + }, + { + "epoch": 0.8632912980739068, + "grad_norm": 0.2606950104236603, + "learning_rate": 1.2666637827547935e-05, + "loss": 0.0928, + "step": 3440 + }, + { + "epoch": 0.8645460819373862, + "grad_norm": 0.17637059092521667, + "learning_rate": 1.264704230669547e-05, + "loss": 0.0999, + "step": 3445 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 0.21137331426143646, + "learning_rate": 1.2627435849647629e-05, + "loss": 0.0871, + "step": 3450 + }, + { + "epoch": 0.8670556496643453, + "grad_norm": 0.40598878264427185, + "learning_rate": 1.2607818537408047e-05, + "loss": 0.0821, + "step": 3455 + }, + { + "epoch": 0.8683104335278249, + "grad_norm": 0.11776316165924072, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.0987, + "step": 3460 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.11056658625602722, + "learning_rate": 1.2568551671592106e-05, + "loss": 0.0863, + "step": 3465 + }, + { + "epoch": 0.8708200012547839, + "grad_norm": 0.29835495352745056, + "learning_rate": 1.2548902280245909e-05, + "loss": 0.0876, + "step": 3470 + }, + { + "epoch": 0.8720747851182634, + "grad_norm": 0.3364979922771454, + "learning_rate": 1.252924235816764e-05, + "loss": 0.0788, + "step": 3475 + }, + { + "epoch": 0.8733295689817429, + "grad_norm": 0.1180335134267807, + "learning_rate": 1.2509571986581814e-05, + "loss": 0.0799, + "step": 3480 + }, + { + "epoch": 0.8745843528452224, + "grad_norm": 0.2995622754096985, + "learning_rate": 1.2489891246756131e-05, + "loss": 0.0964, + "step": 3485 + }, + { + "epoch": 0.875839136708702, + "grad_norm": 0.23864442110061646, + "learning_rate": 1.2470200220001122e-05, + "loss": 0.0982, + "step": 3490 + }, + { + "epoch": 0.8770939205721815, + "grad_norm": 0.6277647018432617, + "learning_rate": 1.245049898766982e-05, + "loss": 0.0722, + "step": 3495 + }, + { + "epoch": 0.8783487044356609, + "grad_norm": 0.4860200881958008, + "learning_rate": 1.2430787631157414e-05, + "loss": 0.0952, + "step": 3500 + }, + { + "epoch": 0.8796034882991405, + "grad_norm": 0.30421602725982666, + "learning_rate": 1.2411066231900935e-05, + "loss": 0.0901, + "step": 3505 + }, + { + "epoch": 0.88085827216262, + "grad_norm": 0.20121634006500244, + "learning_rate": 1.239133487137889e-05, + "loss": 0.0833, + "step": 3510 + }, + { + "epoch": 0.8821130560260995, + "grad_norm": 0.3852861225605011, + "learning_rate": 1.2371593631110953e-05, + "loss": 0.0862, + "step": 3515 + }, + { + "epoch": 0.883367839889579, + "grad_norm": 0.45018061995506287, + "learning_rate": 1.2351842592657612e-05, + "loss": 0.098, + "step": 3520 + }, + { + "epoch": 0.8846226237530586, + "grad_norm": 0.40172022581100464, + "learning_rate": 1.2332081837619836e-05, + "loss": 0.0899, + "step": 3525 + }, + { + "epoch": 0.8858774076165381, + "grad_norm": 0.2906516492366791, + "learning_rate": 1.2312311447638731e-05, + "loss": 0.0852, + "step": 3530 + }, + { + "epoch": 0.8871321914800175, + "grad_norm": 0.39876627922058105, + "learning_rate": 1.2292531504395223e-05, + "loss": 0.0938, + "step": 3535 + }, + { + "epoch": 0.8883869753434971, + "grad_norm": 0.1180957704782486, + "learning_rate": 1.2272742089609694e-05, + "loss": 0.0862, + "step": 3540 + }, + { + "epoch": 0.8896417592069766, + "grad_norm": 0.1698416918516159, + "learning_rate": 1.2252943285041662e-05, + "loss": 0.0873, + "step": 3545 + }, + { + "epoch": 0.8908965430704561, + "grad_norm": 0.3837547302246094, + "learning_rate": 1.2233135172489453e-05, + "loss": 0.0888, + "step": 3550 + }, + { + "epoch": 0.8921513269339356, + "grad_norm": 0.09981489181518555, + "learning_rate": 1.221331783378982e-05, + "loss": 0.0802, + "step": 3555 + }, + { + "epoch": 0.8934061107974152, + "grad_norm": 0.21812370419502258, + "learning_rate": 1.2193491350817657e-05, + "loss": 0.0749, + "step": 3560 + }, + { + "epoch": 0.8946608946608947, + "grad_norm": 0.6983737349510193, + "learning_rate": 1.2173655805485627e-05, + "loss": 0.0881, + "step": 3565 + }, + { + "epoch": 0.8959156785243741, + "grad_norm": 0.3287610411643982, + "learning_rate": 1.2153811279743841e-05, + "loss": 0.0859, + "step": 3570 + }, + { + "epoch": 0.8971704623878537, + "grad_norm": 0.3805353045463562, + "learning_rate": 1.2133957855579501e-05, + "loss": 0.0874, + "step": 3575 + }, + { + "epoch": 0.8984252462513332, + "grad_norm": 0.3398018181324005, + "learning_rate": 1.2114095615016585e-05, + "loss": 0.0892, + "step": 3580 + }, + { + "epoch": 0.8996800301148127, + "grad_norm": 0.2726369798183441, + "learning_rate": 1.2094224640115488e-05, + "loss": 0.0846, + "step": 3585 + }, + { + "epoch": 0.9009348139782922, + "grad_norm": 0.2575783133506775, + "learning_rate": 1.2074345012972694e-05, + "loss": 0.0863, + "step": 3590 + }, + { + "epoch": 0.9021895978417718, + "grad_norm": 0.18294771015644073, + "learning_rate": 1.2054456815720432e-05, + "loss": 0.0792, + "step": 3595 + }, + { + "epoch": 0.9034443817052513, + "grad_norm": 0.2737415134906769, + "learning_rate": 1.2034560130526341e-05, + "loss": 0.0811, + "step": 3600 + }, + { + "epoch": 0.9046991655687308, + "grad_norm": 0.22107063233852386, + "learning_rate": 1.2014655039593119e-05, + "loss": 0.0937, + "step": 3605 + }, + { + "epoch": 0.9059539494322103, + "grad_norm": 0.37395817041397095, + "learning_rate": 1.1994741625158206e-05, + "loss": 0.0813, + "step": 3610 + }, + { + "epoch": 0.9072087332956899, + "grad_norm": 0.3313354253768921, + "learning_rate": 1.1974819969493421e-05, + "loss": 0.0789, + "step": 3615 + }, + { + "epoch": 0.9084635171591693, + "grad_norm": 0.3608105182647705, + "learning_rate": 1.195489015490463e-05, + "loss": 0.0859, + "step": 3620 + }, + { + "epoch": 0.9097183010226488, + "grad_norm": 0.37519270181655884, + "learning_rate": 1.1934952263731411e-05, + "loss": 0.0919, + "step": 3625 + }, + { + "epoch": 0.9109730848861284, + "grad_norm": 0.5066865086555481, + "learning_rate": 1.1915006378346719e-05, + "loss": 0.1, + "step": 3630 + }, + { + "epoch": 0.9122278687496079, + "grad_norm": 0.22127996385097504, + "learning_rate": 1.1895052581156516e-05, + "loss": 0.0854, + "step": 3635 + }, + { + "epoch": 0.9134826526130874, + "grad_norm": 0.18663233518600464, + "learning_rate": 1.1875090954599472e-05, + "loss": 0.0962, + "step": 3640 + }, + { + "epoch": 0.9147374364765669, + "grad_norm": 0.5338375568389893, + "learning_rate": 1.1855121581146591e-05, + "loss": 0.086, + "step": 3645 + }, + { + "epoch": 0.9159922203400465, + "grad_norm": 0.10081567615270615, + "learning_rate": 1.183514454330089e-05, + "loss": 0.0863, + "step": 3650 + }, + { + "epoch": 0.9172470042035259, + "grad_norm": 0.09096982330083847, + "learning_rate": 1.1815159923597044e-05, + "loss": 0.084, + "step": 3655 + }, + { + "epoch": 0.9185017880670054, + "grad_norm": 0.2180848866701126, + "learning_rate": 1.1795167804601062e-05, + "loss": 0.0916, + "step": 3660 + }, + { + "epoch": 0.919756571930485, + "grad_norm": 0.2790067791938782, + "learning_rate": 1.177516826890993e-05, + "loss": 0.0914, + "step": 3665 + }, + { + "epoch": 0.9210113557939645, + "grad_norm": 0.26915568113327026, + "learning_rate": 1.1755161399151277e-05, + "loss": 0.0887, + "step": 3670 + }, + { + "epoch": 0.922266139657444, + "grad_norm": 0.050155360251665115, + "learning_rate": 1.1735147277983027e-05, + "loss": 0.0957, + "step": 3675 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 0.07628196477890015, + "learning_rate": 1.1715125988093075e-05, + "loss": 0.0895, + "step": 3680 + }, + { + "epoch": 0.9247757073844031, + "grad_norm": 0.23782391846179962, + "learning_rate": 1.1695097612198929e-05, + "loss": 0.0878, + "step": 3685 + }, + { + "epoch": 0.9260304912478825, + "grad_norm": 0.28425681591033936, + "learning_rate": 1.1675062233047365e-05, + "loss": 0.0928, + "step": 3690 + }, + { + "epoch": 0.927285275111362, + "grad_norm": 0.36293289065361023, + "learning_rate": 1.16550199334141e-05, + "loss": 0.0976, + "step": 3695 + }, + { + "epoch": 0.9285400589748416, + "grad_norm": 0.5268048048019409, + "learning_rate": 1.1634970796103442e-05, + "loss": 0.0995, + "step": 3700 + }, + { + "epoch": 0.9297948428383211, + "grad_norm": 0.18220224976539612, + "learning_rate": 1.1614914903947952e-05, + "loss": 0.0895, + "step": 3705 + }, + { + "epoch": 0.9310496267018006, + "grad_norm": 0.18643346428871155, + "learning_rate": 1.1594852339808082e-05, + "loss": 0.0977, + "step": 3710 + }, + { + "epoch": 0.9323044105652801, + "grad_norm": 0.1516513228416443, + "learning_rate": 1.1574783186571876e-05, + "loss": 0.0947, + "step": 3715 + }, + { + "epoch": 0.9335591944287597, + "grad_norm": 0.27426013350486755, + "learning_rate": 1.155470752715458e-05, + "loss": 0.0885, + "step": 3720 + }, + { + "epoch": 0.9348139782922391, + "grad_norm": 0.2292599081993103, + "learning_rate": 1.1534625444498325e-05, + "loss": 0.0774, + "step": 3725 + }, + { + "epoch": 0.9360687621557187, + "grad_norm": 0.2502307593822479, + "learning_rate": 1.1514537021571784e-05, + "loss": 0.0816, + "step": 3730 + }, + { + "epoch": 0.9373235460191982, + "grad_norm": 0.1261693835258484, + "learning_rate": 1.1494442341369819e-05, + "loss": 0.0859, + "step": 3735 + }, + { + "epoch": 0.9385783298826778, + "grad_norm": 0.18473610281944275, + "learning_rate": 1.1474341486913146e-05, + "loss": 0.0804, + "step": 3740 + }, + { + "epoch": 0.9398331137461572, + "grad_norm": 0.1409272700548172, + "learning_rate": 1.1454234541247995e-05, + "loss": 0.0985, + "step": 3745 + }, + { + "epoch": 0.9410878976096367, + "grad_norm": 0.32273778319358826, + "learning_rate": 1.1434121587445752e-05, + "loss": 0.0901, + "step": 3750 + }, + { + "epoch": 0.9423426814731163, + "grad_norm": 0.12227199226617813, + "learning_rate": 1.1414002708602632e-05, + "loss": 0.0928, + "step": 3755 + }, + { + "epoch": 0.9435974653365957, + "grad_norm": 0.2481284737586975, + "learning_rate": 1.1393877987839329e-05, + "loss": 0.0853, + "step": 3760 + }, + { + "epoch": 0.9448522492000753, + "grad_norm": 0.25302889943122864, + "learning_rate": 1.1373747508300668e-05, + "loss": 0.0729, + "step": 3765 + }, + { + "epoch": 0.9461070330635548, + "grad_norm": 0.28753232955932617, + "learning_rate": 1.1353611353155272e-05, + "loss": 0.0916, + "step": 3770 + }, + { + "epoch": 0.9473618169270344, + "grad_norm": 0.308005154132843, + "learning_rate": 1.133346960559521e-05, + "loss": 0.0896, + "step": 3775 + }, + { + "epoch": 0.9486166007905138, + "grad_norm": 0.22970429062843323, + "learning_rate": 1.1313322348835658e-05, + "loss": 0.0871, + "step": 3780 + }, + { + "epoch": 0.9498713846539933, + "grad_norm": 0.3094078600406647, + "learning_rate": 1.1293169666114546e-05, + "loss": 0.0929, + "step": 3785 + }, + { + "epoch": 0.9511261685174729, + "grad_norm": 0.17183607816696167, + "learning_rate": 1.127301164069223e-05, + "loss": 0.0901, + "step": 3790 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.3873693346977234, + "learning_rate": 1.1252848355851136e-05, + "loss": 0.0829, + "step": 3795 + }, + { + "epoch": 0.9536357362444319, + "grad_norm": 0.42757073044776917, + "learning_rate": 1.1232679894895417e-05, + "loss": 0.0855, + "step": 3800 + }, + { + "epoch": 0.9548905201079114, + "grad_norm": 0.49850013852119446, + "learning_rate": 1.1212506341150615e-05, + "loss": 0.0834, + "step": 3805 + }, + { + "epoch": 0.956145303971391, + "grad_norm": 0.14661534130573273, + "learning_rate": 1.1192327777963313e-05, + "loss": 0.0798, + "step": 3810 + }, + { + "epoch": 0.9574000878348704, + "grad_norm": 0.29823562502861023, + "learning_rate": 1.117214428870078e-05, + "loss": 0.0786, + "step": 3815 + }, + { + "epoch": 0.95865487169835, + "grad_norm": 0.12722723186016083, + "learning_rate": 1.1151955956750652e-05, + "loss": 0.0807, + "step": 3820 + }, + { + "epoch": 0.9599096555618295, + "grad_norm": 0.10096994042396545, + "learning_rate": 1.1131762865520566e-05, + "loss": 0.0847, + "step": 3825 + }, + { + "epoch": 0.9611644394253089, + "grad_norm": 0.1767021268606186, + "learning_rate": 1.1111565098437815e-05, + "loss": 0.1014, + "step": 3830 + }, + { + "epoch": 0.9624192232887885, + "grad_norm": 0.2014637440443039, + "learning_rate": 1.1091362738949024e-05, + "loss": 0.091, + "step": 3835 + }, + { + "epoch": 0.963674007152268, + "grad_norm": 0.2271222472190857, + "learning_rate": 1.1071155870519777e-05, + "loss": 0.0887, + "step": 3840 + }, + { + "epoch": 0.9649287910157476, + "grad_norm": 0.15430879592895508, + "learning_rate": 1.1050944576634298e-05, + "loss": 0.0842, + "step": 3845 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 0.14442376792430878, + "learning_rate": 1.1030728940795087e-05, + "loss": 0.0935, + "step": 3850 + }, + { + "epoch": 0.9674383587427066, + "grad_norm": 0.1097768247127533, + "learning_rate": 1.101050904652259e-05, + "loss": 0.0842, + "step": 3855 + }, + { + "epoch": 0.9686931426061861, + "grad_norm": 0.14131766557693481, + "learning_rate": 1.0990284977354841e-05, + "loss": 0.0848, + "step": 3860 + }, + { + "epoch": 0.9699479264696657, + "grad_norm": 0.2724064588546753, + "learning_rate": 1.097005681684712e-05, + "loss": 0.0792, + "step": 3865 + }, + { + "epoch": 0.9712027103331451, + "grad_norm": 0.15965338051319122, + "learning_rate": 1.094982464857162e-05, + "loss": 0.0699, + "step": 3870 + }, + { + "epoch": 0.9724574941966246, + "grad_norm": 0.25648635625839233, + "learning_rate": 1.0929588556117086e-05, + "loss": 0.0869, + "step": 3875 + }, + { + "epoch": 0.9737122780601042, + "grad_norm": 0.1974741518497467, + "learning_rate": 1.0909348623088472e-05, + "loss": 0.0814, + "step": 3880 + }, + { + "epoch": 0.9749670619235836, + "grad_norm": 0.1373465359210968, + "learning_rate": 1.0889104933106604e-05, + "loss": 0.0868, + "step": 3885 + }, + { + "epoch": 0.9762218457870632, + "grad_norm": 0.2142958790063858, + "learning_rate": 1.0868857569807831e-05, + "loss": 0.0888, + "step": 3890 + }, + { + "epoch": 0.9774766296505427, + "grad_norm": 0.48555663228034973, + "learning_rate": 1.0848606616843673e-05, + "loss": 0.0839, + "step": 3895 + }, + { + "epoch": 0.9787314135140223, + "grad_norm": 0.2687907814979553, + "learning_rate": 1.0828352157880489e-05, + "loss": 0.0737, + "step": 3900 + }, + { + "epoch": 0.9799861973775017, + "grad_norm": 0.18711687624454498, + "learning_rate": 1.0808094276599113e-05, + "loss": 0.0958, + "step": 3905 + }, + { + "epoch": 0.9812409812409812, + "grad_norm": 0.09380491077899933, + "learning_rate": 1.0787833056694526e-05, + "loss": 0.0839, + "step": 3910 + }, + { + "epoch": 0.9824957651044608, + "grad_norm": 0.2556704878807068, + "learning_rate": 1.0767568581875494e-05, + "loss": 0.0801, + "step": 3915 + }, + { + "epoch": 0.9837505489679402, + "grad_norm": 0.17845787107944489, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.0906, + "step": 3920 + }, + { + "epoch": 0.9850053328314198, + "grad_norm": 0.20369689166545868, + "learning_rate": 1.0727030202396091e-05, + "loss": 0.0813, + "step": 3925 + }, + { + "epoch": 0.9862601166948993, + "grad_norm": 0.3633408546447754, + "learning_rate": 1.0706756465219114e-05, + "loss": 0.091, + "step": 3930 + }, + { + "epoch": 0.9875149005583789, + "grad_norm": 0.14127115905284882, + "learning_rate": 1.0686479808093798e-05, + "loss": 0.1013, + "step": 3935 + }, + { + "epoch": 0.9887696844218583, + "grad_norm": 0.43319714069366455, + "learning_rate": 1.0666200314792695e-05, + "loss": 0.0854, + "step": 3940 + }, + { + "epoch": 0.9900244682853379, + "grad_norm": 0.5234352946281433, + "learning_rate": 1.064591806910007e-05, + "loss": 0.0799, + "step": 3945 + }, + { + "epoch": 0.9912792521488174, + "grad_norm": 0.10436911880970001, + "learning_rate": 1.062563315481156e-05, + "loss": 0.0774, + "step": 3950 + }, + { + "epoch": 0.9925340360122968, + "grad_norm": 0.18167732656002045, + "learning_rate": 1.0605345655733839e-05, + "loss": 0.0922, + "step": 3955 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.08805394172668457, + "learning_rate": 1.058505565568424e-05, + "loss": 0.0908, + "step": 3960 + }, + { + "epoch": 0.9950436037392559, + "grad_norm": 0.186781644821167, + "learning_rate": 1.056476323849044e-05, + "loss": 0.1014, + "step": 3965 + }, + { + "epoch": 0.9962983876027355, + "grad_norm": 0.3317610025405884, + "learning_rate": 1.0544468487990105e-05, + "loss": 0.0959, + "step": 3970 + }, + { + "epoch": 0.9975531714662149, + "grad_norm": 0.159254252910614, + "learning_rate": 1.0524171488030537e-05, + "loss": 0.0961, + "step": 3975 + }, + { + "epoch": 0.9988079553296945, + "grad_norm": 0.5758376717567444, + "learning_rate": 1.0503872322468331e-05, + "loss": 0.0859, + "step": 3980 + }, + { + "epoch": 1.000062739193174, + "grad_norm": 0.1497868299484253, + "learning_rate": 1.048357107516903e-05, + "loss": 0.0865, + "step": 3985 + }, + { + "epoch": 1.0013175230566536, + "grad_norm": 0.3601112365722656, + "learning_rate": 1.0463267830006779e-05, + "loss": 0.0877, + "step": 3990 + }, + { + "epoch": 1.002572306920133, + "grad_norm": 0.06848477572202682, + "learning_rate": 1.0442962670863971e-05, + "loss": 0.0809, + "step": 3995 + }, + { + "epoch": 1.0038270907836124, + "grad_norm": 0.40660738945007324, + "learning_rate": 1.0422655681630917e-05, + "loss": 0.0925, + "step": 4000 + }, + { + "epoch": 1.005081874647092, + "grad_norm": 0.12791000306606293, + "learning_rate": 1.040234694620548e-05, + "loss": 0.0877, + "step": 4005 + }, + { + "epoch": 1.0063366585105715, + "grad_norm": 0.445340096950531, + "learning_rate": 1.0382036548492743e-05, + "loss": 0.0683, + "step": 4010 + }, + { + "epoch": 1.007591442374051, + "grad_norm": 0.29247570037841797, + "learning_rate": 1.0361724572404654e-05, + "loss": 0.0869, + "step": 4015 + }, + { + "epoch": 1.0088462262375306, + "grad_norm": 0.10048985481262207, + "learning_rate": 1.034141110185968e-05, + "loss": 0.072, + "step": 4020 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.19421495497226715, + "learning_rate": 1.0321096220782469e-05, + "loss": 0.0859, + "step": 4025 + }, + { + "epoch": 1.0113557939644897, + "grad_norm": 0.34872177243232727, + "learning_rate": 1.0300780013103488e-05, + "loss": 0.09, + "step": 4030 + }, + { + "epoch": 1.012610577827969, + "grad_norm": 0.3325657844543457, + "learning_rate": 1.028046256275869e-05, + "loss": 0.097, + "step": 4035 + }, + { + "epoch": 1.0138653616914486, + "grad_norm": 0.3407081365585327, + "learning_rate": 1.0260143953689165e-05, + "loss": 0.0872, + "step": 4040 + }, + { + "epoch": 1.0151201455549281, + "grad_norm": 0.18198151886463165, + "learning_rate": 1.0239824269840784e-05, + "loss": 0.0884, + "step": 4045 + }, + { + "epoch": 1.0163749294184077, + "grad_norm": 0.08795715123414993, + "learning_rate": 1.0219503595163857e-05, + "loss": 0.09, + "step": 4050 + }, + { + "epoch": 1.0176297132818872, + "grad_norm": 0.07186929136514664, + "learning_rate": 1.0199182013612797e-05, + "loss": 0.0783, + "step": 4055 + }, + { + "epoch": 1.0188844971453668, + "grad_norm": 0.36318737268447876, + "learning_rate": 1.017885960914576e-05, + "loss": 0.0852, + "step": 4060 + }, + { + "epoch": 1.0201392810088463, + "grad_norm": 0.350274920463562, + "learning_rate": 1.0158536465724291e-05, + "loss": 0.0723, + "step": 4065 + }, + { + "epoch": 1.0213940648723256, + "grad_norm": 0.3536963164806366, + "learning_rate": 1.0138212667313003e-05, + "loss": 0.0898, + "step": 4070 + }, + { + "epoch": 1.0226488487358052, + "grad_norm": 0.37890106439590454, + "learning_rate": 1.011788829787921e-05, + "loss": 0.0871, + "step": 4075 + }, + { + "epoch": 1.0239036325992847, + "grad_norm": 0.2589329183101654, + "learning_rate": 1.0097563441392582e-05, + "loss": 0.0964, + "step": 4080 + }, + { + "epoch": 1.0251584164627643, + "grad_norm": 0.3065876066684723, + "learning_rate": 1.0077238181824804e-05, + "loss": 0.0933, + "step": 4085 + }, + { + "epoch": 1.0264132003262438, + "grad_norm": 0.11984225362539291, + "learning_rate": 1.0056912603149229e-05, + "loss": 0.0845, + "step": 4090 + }, + { + "epoch": 1.0276679841897234, + "grad_norm": 0.19489920139312744, + "learning_rate": 1.0036586789340518e-05, + "loss": 0.071, + "step": 4095 + }, + { + "epoch": 1.028922768053203, + "grad_norm": 0.4552467167377472, + "learning_rate": 1.001626082437432e-05, + "loss": 0.0869, + "step": 4100 + }, + { + "epoch": 1.0301775519166823, + "grad_norm": 0.2607716917991638, + "learning_rate": 9.995934792226892e-06, + "loss": 0.0903, + "step": 4105 + }, + { + "epoch": 1.0314323357801618, + "grad_norm": 0.2067851722240448, + "learning_rate": 9.975608776874775e-06, + "loss": 0.0818, + "step": 4110 + }, + { + "epoch": 1.0326871196436413, + "grad_norm": 0.1449105590581894, + "learning_rate": 9.955282862294447e-06, + "loss": 0.0903, + "step": 4115 + }, + { + "epoch": 1.033941903507121, + "grad_norm": 0.3527059853076935, + "learning_rate": 9.93495713246196e-06, + "loss": 0.0982, + "step": 4120 + }, + { + "epoch": 1.0351966873706004, + "grad_norm": 0.38260769844055176, + "learning_rate": 9.91463167135261e-06, + "loss": 0.0798, + "step": 4125 + }, + { + "epoch": 1.03645147123408, + "grad_norm": 0.19611142575740814, + "learning_rate": 9.894306562940576e-06, + "loss": 0.0771, + "step": 4130 + }, + { + "epoch": 1.0377062550975595, + "grad_norm": 0.09190942347049713, + "learning_rate": 9.873981891198585e-06, + "loss": 0.0887, + "step": 4135 + }, + { + "epoch": 1.0389610389610389, + "grad_norm": 0.3059125244617462, + "learning_rate": 9.853657740097558e-06, + "loss": 0.0804, + "step": 4140 + }, + { + "epoch": 1.0402158228245184, + "grad_norm": 0.2917691171169281, + "learning_rate": 9.833334193606266e-06, + "loss": 0.0876, + "step": 4145 + }, + { + "epoch": 1.041470606687998, + "grad_norm": 0.10409116744995117, + "learning_rate": 9.81301133569098e-06, + "loss": 0.0951, + "step": 4150 + }, + { + "epoch": 1.0427253905514775, + "grad_norm": 0.23820455372333527, + "learning_rate": 9.792689250315126e-06, + "loss": 0.0873, + "step": 4155 + }, + { + "epoch": 1.043980174414957, + "grad_norm": 0.3840187191963196, + "learning_rate": 9.772368021438943e-06, + "loss": 0.0859, + "step": 4160 + }, + { + "epoch": 1.0452349582784366, + "grad_norm": 0.2977546751499176, + "learning_rate": 9.752047733019132e-06, + "loss": 0.0833, + "step": 4165 + }, + { + "epoch": 1.0464897421419161, + "grad_norm": 0.3379094898700714, + "learning_rate": 9.731728469008493e-06, + "loss": 0.0799, + "step": 4170 + }, + { + "epoch": 1.0477445260053955, + "grad_norm": 0.18591825664043427, + "learning_rate": 9.711410313355614e-06, + "loss": 0.0772, + "step": 4175 + }, + { + "epoch": 1.048999309868875, + "grad_norm": 0.1972389817237854, + "learning_rate": 9.691093350004492e-06, + "loss": 0.0912, + "step": 4180 + }, + { + "epoch": 1.0502540937323546, + "grad_norm": 0.2270268350839615, + "learning_rate": 9.670777662894205e-06, + "loss": 0.0733, + "step": 4185 + }, + { + "epoch": 1.0515088775958341, + "grad_norm": 0.3008296489715576, + "learning_rate": 9.650463335958551e-06, + "loss": 0.0821, + "step": 4190 + }, + { + "epoch": 1.0527636614593137, + "grad_norm": 0.11666283011436462, + "learning_rate": 9.630150453125711e-06, + "loss": 0.0782, + "step": 4195 + }, + { + "epoch": 1.0540184453227932, + "grad_norm": 0.1857229471206665, + "learning_rate": 9.609839098317902e-06, + "loss": 0.0811, + "step": 4200 + }, + { + "epoch": 1.0552732291862728, + "grad_norm": 0.24505260586738586, + "learning_rate": 9.589529355451028e-06, + "loss": 0.0747, + "step": 4205 + }, + { + "epoch": 1.056528013049752, + "grad_norm": 0.12762409448623657, + "learning_rate": 9.569221308434336e-06, + "loss": 0.0813, + "step": 4210 + }, + { + "epoch": 1.0577827969132316, + "grad_norm": 0.14525793492794037, + "learning_rate": 9.548915041170049e-06, + "loss": 0.0814, + "step": 4215 + }, + { + "epoch": 1.0590375807767112, + "grad_norm": 0.4550996422767639, + "learning_rate": 9.528610637553063e-06, + "loss": 0.087, + "step": 4220 + }, + { + "epoch": 1.0602923646401907, + "grad_norm": 0.11074529588222504, + "learning_rate": 9.508308181470556e-06, + "loss": 0.0885, + "step": 4225 + }, + { + "epoch": 1.0615471485036703, + "grad_norm": 0.2155328094959259, + "learning_rate": 9.488007756801672e-06, + "loss": 0.0873, + "step": 4230 + }, + { + "epoch": 1.0628019323671498, + "grad_norm": 0.0720556378364563, + "learning_rate": 9.467709447417149e-06, + "loss": 0.0787, + "step": 4235 + }, + { + "epoch": 1.0640567162306294, + "grad_norm": 0.5692139267921448, + "learning_rate": 9.447413337178994e-06, + "loss": 0.0746, + "step": 4240 + }, + { + "epoch": 1.0653115000941087, + "grad_norm": 0.42540258169174194, + "learning_rate": 9.42711950994013e-06, + "loss": 0.0977, + "step": 4245 + }, + { + "epoch": 1.0665662839575882, + "grad_norm": 0.13704167306423187, + "learning_rate": 9.406828049544046e-06, + "loss": 0.0815, + "step": 4250 + }, + { + "epoch": 1.0678210678210678, + "grad_norm": 0.11324842274188995, + "learning_rate": 9.386539039824446e-06, + "loss": 0.0853, + "step": 4255 + }, + { + "epoch": 1.0690758516845473, + "grad_norm": 0.09248417615890503, + "learning_rate": 9.366252564604914e-06, + "loss": 0.0751, + "step": 4260 + }, + { + "epoch": 1.0703306355480269, + "grad_norm": 0.11014354974031448, + "learning_rate": 9.34596870769857e-06, + "loss": 0.0888, + "step": 4265 + }, + { + "epoch": 1.0715854194115064, + "grad_norm": 0.2357379049062729, + "learning_rate": 9.325687552907708e-06, + "loss": 0.0806, + "step": 4270 + }, + { + "epoch": 1.072840203274986, + "grad_norm": 0.23039399087429047, + "learning_rate": 9.305409184023455e-06, + "loss": 0.0899, + "step": 4275 + }, + { + "epoch": 1.0740949871384653, + "grad_norm": 0.3211176097393036, + "learning_rate": 9.285133684825435e-06, + "loss": 0.0921, + "step": 4280 + }, + { + "epoch": 1.0753497710019448, + "grad_norm": 0.08523392677307129, + "learning_rate": 9.264861139081417e-06, + "loss": 0.0742, + "step": 4285 + }, + { + "epoch": 1.0766045548654244, + "grad_norm": 0.446898490190506, + "learning_rate": 9.244591630546964e-06, + "loss": 0.0888, + "step": 4290 + }, + { + "epoch": 1.077859338728904, + "grad_norm": 0.30059829354286194, + "learning_rate": 9.224325242965088e-06, + "loss": 0.0796, + "step": 4295 + }, + { + "epoch": 1.0791141225923835, + "grad_norm": 0.1692143827676773, + "learning_rate": 9.204062060065915e-06, + "loss": 0.0988, + "step": 4300 + }, + { + "epoch": 1.080368906455863, + "grad_norm": 0.5262351632118225, + "learning_rate": 9.18380216556632e-06, + "loss": 0.0817, + "step": 4305 + }, + { + "epoch": 1.0816236903193426, + "grad_norm": 0.2138439118862152, + "learning_rate": 9.163545643169607e-06, + "loss": 0.0921, + "step": 4310 + }, + { + "epoch": 1.082878474182822, + "grad_norm": 0.2504398822784424, + "learning_rate": 9.143292576565142e-06, + "loss": 0.0747, + "step": 4315 + }, + { + "epoch": 1.0841332580463015, + "grad_norm": 0.2210868000984192, + "learning_rate": 9.123043049427996e-06, + "loss": 0.0843, + "step": 4320 + }, + { + "epoch": 1.085388041909781, + "grad_norm": 0.20180197060108185, + "learning_rate": 9.102797145418644e-06, + "loss": 0.0854, + "step": 4325 + }, + { + "epoch": 1.0866428257732605, + "grad_norm": 0.3638245761394501, + "learning_rate": 9.082554948182577e-06, + "loss": 0.0809, + "step": 4330 + }, + { + "epoch": 1.08789760963674, + "grad_norm": 0.28940561413764954, + "learning_rate": 9.062316541349978e-06, + "loss": 0.0806, + "step": 4335 + }, + { + "epoch": 1.0891523935002196, + "grad_norm": 0.15019264817237854, + "learning_rate": 9.042082008535361e-06, + "loss": 0.081, + "step": 4340 + }, + { + "epoch": 1.0904071773636992, + "grad_norm": 0.4596641957759857, + "learning_rate": 9.021851433337243e-06, + "loss": 0.0886, + "step": 4345 + }, + { + "epoch": 1.0916619612271785, + "grad_norm": 0.3484109342098236, + "learning_rate": 9.001624899337785e-06, + "loss": 0.0997, + "step": 4350 + }, + { + "epoch": 1.092916745090658, + "grad_norm": 0.4524548053741455, + "learning_rate": 8.981402490102464e-06, + "loss": 0.0719, + "step": 4355 + }, + { + "epoch": 1.0941715289541376, + "grad_norm": 0.18216001987457275, + "learning_rate": 8.961184289179695e-06, + "loss": 0.0866, + "step": 4360 + }, + { + "epoch": 1.0954263128176172, + "grad_norm": 0.3416767120361328, + "learning_rate": 8.94097038010052e-06, + "loss": 0.0759, + "step": 4365 + }, + { + "epoch": 1.0966810966810967, + "grad_norm": 0.549856960773468, + "learning_rate": 8.920760846378248e-06, + "loss": 0.0875, + "step": 4370 + }, + { + "epoch": 1.0979358805445762, + "grad_norm": 0.12520906329154968, + "learning_rate": 8.900555771508114e-06, + "loss": 0.086, + "step": 4375 + }, + { + "epoch": 1.0991906644080558, + "grad_norm": 0.17632810771465302, + "learning_rate": 8.880355238966923e-06, + "loss": 0.084, + "step": 4380 + }, + { + "epoch": 1.1004454482715351, + "grad_norm": 0.28502127528190613, + "learning_rate": 8.860159332212719e-06, + "loss": 0.0813, + "step": 4385 + }, + { + "epoch": 1.1017002321350147, + "grad_norm": 0.43483448028564453, + "learning_rate": 8.83996813468443e-06, + "loss": 0.0871, + "step": 4390 + }, + { + "epoch": 1.1029550159984942, + "grad_norm": 0.28160524368286133, + "learning_rate": 8.81978172980154e-06, + "loss": 0.0886, + "step": 4395 + }, + { + "epoch": 1.1042097998619738, + "grad_norm": 0.4156465232372284, + "learning_rate": 8.799600200963716e-06, + "loss": 0.083, + "step": 4400 + }, + { + "epoch": 1.1054645837254533, + "grad_norm": 0.24867035448551178, + "learning_rate": 8.77942363155049e-06, + "loss": 0.0785, + "step": 4405 + }, + { + "epoch": 1.1067193675889329, + "grad_norm": 0.3078720271587372, + "learning_rate": 8.7592521049209e-06, + "loss": 0.0817, + "step": 4410 + }, + { + "epoch": 1.1079741514524124, + "grad_norm": 0.12757770717144012, + "learning_rate": 8.739085704413161e-06, + "loss": 0.0773, + "step": 4415 + }, + { + "epoch": 1.1092289353158917, + "grad_norm": 0.2089099884033203, + "learning_rate": 8.718924513344288e-06, + "loss": 0.0826, + "step": 4420 + }, + { + "epoch": 1.1104837191793713, + "grad_norm": 0.28598687052726746, + "learning_rate": 8.698768615009789e-06, + "loss": 0.0898, + "step": 4425 + }, + { + "epoch": 1.1117385030428508, + "grad_norm": 0.16409343481063843, + "learning_rate": 8.678618092683307e-06, + "loss": 0.0904, + "step": 4430 + }, + { + "epoch": 1.1129932869063304, + "grad_norm": 0.17756353318691254, + "learning_rate": 8.658473029616264e-06, + "loss": 0.077, + "step": 4435 + }, + { + "epoch": 1.11424807076981, + "grad_norm": 0.27898117899894714, + "learning_rate": 8.638333509037537e-06, + "loss": 0.0785, + "step": 4440 + }, + { + "epoch": 1.1155028546332895, + "grad_norm": 0.12959200143814087, + "learning_rate": 8.61819961415309e-06, + "loss": 0.0752, + "step": 4445 + }, + { + "epoch": 1.116757638496769, + "grad_norm": 0.0896739810705185, + "learning_rate": 8.598071428145663e-06, + "loss": 0.0962, + "step": 4450 + }, + { + "epoch": 1.1180124223602483, + "grad_norm": 0.37898027896881104, + "learning_rate": 8.577949034174395e-06, + "loss": 0.0864, + "step": 4455 + }, + { + "epoch": 1.1192672062237279, + "grad_norm": 0.19253282248973846, + "learning_rate": 8.55783251537451e-06, + "loss": 0.0883, + "step": 4460 + }, + { + "epoch": 1.1205219900872074, + "grad_norm": 0.35812661051750183, + "learning_rate": 8.537721954856942e-06, + "loss": 0.0839, + "step": 4465 + }, + { + "epoch": 1.121776773950687, + "grad_norm": 0.3062339127063751, + "learning_rate": 8.517617435708011e-06, + "loss": 0.0808, + "step": 4470 + }, + { + "epoch": 1.1230315578141665, + "grad_norm": 0.13663077354431152, + "learning_rate": 8.497519040989096e-06, + "loss": 0.0961, + "step": 4475 + }, + { + "epoch": 1.124286341677646, + "grad_norm": 0.2609081566333771, + "learning_rate": 8.477426853736257e-06, + "loss": 0.0929, + "step": 4480 + }, + { + "epoch": 1.1255411255411256, + "grad_norm": 0.12208747863769531, + "learning_rate": 8.457340956959905e-06, + "loss": 0.0911, + "step": 4485 + }, + { + "epoch": 1.126795909404605, + "grad_norm": 0.19168135523796082, + "learning_rate": 8.437261433644472e-06, + "loss": 0.0863, + "step": 4490 + }, + { + "epoch": 1.1280506932680845, + "grad_norm": 0.20487892627716064, + "learning_rate": 8.417188366748051e-06, + "loss": 0.0883, + "step": 4495 + }, + { + "epoch": 1.129305477131564, + "grad_norm": 0.1829683482646942, + "learning_rate": 8.397121839202069e-06, + "loss": 0.0905, + "step": 4500 + }, + { + "epoch": 1.1305602609950436, + "grad_norm": 0.13941001892089844, + "learning_rate": 8.377061933910924e-06, + "loss": 0.078, + "step": 4505 + }, + { + "epoch": 1.1318150448585231, + "grad_norm": 0.1624182164669037, + "learning_rate": 8.357008733751664e-06, + "loss": 0.0877, + "step": 4510 + }, + { + "epoch": 1.1330698287220027, + "grad_norm": 0.44617176055908203, + "learning_rate": 8.33696232157363e-06, + "loss": 0.0875, + "step": 4515 + }, + { + "epoch": 1.1343246125854822, + "grad_norm": 0.1490097939968109, + "learning_rate": 8.316922780198126e-06, + "loss": 0.0755, + "step": 4520 + }, + { + "epoch": 1.1355793964489616, + "grad_norm": 0.2881050109863281, + "learning_rate": 8.296890192418052e-06, + "loss": 0.0865, + "step": 4525 + }, + { + "epoch": 1.136834180312441, + "grad_norm": 0.19701127707958221, + "learning_rate": 8.276864640997602e-06, + "loss": 0.0837, + "step": 4530 + }, + { + "epoch": 1.1380889641759206, + "grad_norm": 0.4736967086791992, + "learning_rate": 8.256846208671882e-06, + "loss": 0.0843, + "step": 4535 + }, + { + "epoch": 1.1393437480394002, + "grad_norm": 0.22454383969306946, + "learning_rate": 8.236834978146597e-06, + "loss": 0.0805, + "step": 4540 + }, + { + "epoch": 1.1405985319028797, + "grad_norm": 0.1956356167793274, + "learning_rate": 8.216831032097689e-06, + "loss": 0.0877, + "step": 4545 + }, + { + "epoch": 1.1418533157663593, + "grad_norm": 0.3192068338394165, + "learning_rate": 8.196834453171008e-06, + "loss": 0.0773, + "step": 4550 + }, + { + "epoch": 1.1431080996298388, + "grad_norm": 0.21237006783485413, + "learning_rate": 8.17684532398197e-06, + "loss": 0.0799, + "step": 4555 + }, + { + "epoch": 1.1443628834933182, + "grad_norm": 0.3464091718196869, + "learning_rate": 8.15686372711521e-06, + "loss": 0.0946, + "step": 4560 + }, + { + "epoch": 1.1456176673567977, + "grad_norm": 0.44841843843460083, + "learning_rate": 8.136889745124241e-06, + "loss": 0.0937, + "step": 4565 + }, + { + "epoch": 1.1468724512202773, + "grad_norm": 0.12974920868873596, + "learning_rate": 8.116923460531117e-06, + "loss": 0.0866, + "step": 4570 + }, + { + "epoch": 1.1481272350837568, + "grad_norm": 0.2840186655521393, + "learning_rate": 8.09696495582609e-06, + "loss": 0.0759, + "step": 4575 + }, + { + "epoch": 1.1493820189472364, + "grad_norm": 0.2896987497806549, + "learning_rate": 8.077014313467274e-06, + "loss": 0.0905, + "step": 4580 + }, + { + "epoch": 1.150636802810716, + "grad_norm": 0.08831676840782166, + "learning_rate": 8.057071615880297e-06, + "loss": 0.0855, + "step": 4585 + }, + { + "epoch": 1.1518915866741954, + "grad_norm": 0.310893714427948, + "learning_rate": 8.037136945457959e-06, + "loss": 0.0868, + "step": 4590 + }, + { + "epoch": 1.1531463705376748, + "grad_norm": 0.2327936589717865, + "learning_rate": 8.017210384559901e-06, + "loss": 0.0661, + "step": 4595 + }, + { + "epoch": 1.1544011544011543, + "grad_norm": 0.23894554376602173, + "learning_rate": 7.997292015512257e-06, + "loss": 0.0816, + "step": 4600 + }, + { + "epoch": 1.1556559382646339, + "grad_norm": 0.2750934064388275, + "learning_rate": 7.977381920607324e-06, + "loss": 0.0724, + "step": 4605 + }, + { + "epoch": 1.1569107221281134, + "grad_norm": 0.2656705677509308, + "learning_rate": 7.957480182103198e-06, + "loss": 0.0839, + "step": 4610 + }, + { + "epoch": 1.158165505991593, + "grad_norm": 0.08596844226121902, + "learning_rate": 7.93758688222347e-06, + "loss": 0.0873, + "step": 4615 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.2484734207391739, + "learning_rate": 7.91770210315685e-06, + "loss": 0.084, + "step": 4620 + }, + { + "epoch": 1.160675073718552, + "grad_norm": 0.3806692361831665, + "learning_rate": 7.897825927056865e-06, + "loss": 0.0856, + "step": 4625 + }, + { + "epoch": 1.1619298575820314, + "grad_norm": 0.26702991127967834, + "learning_rate": 7.877958436041475e-06, + "loss": 0.0804, + "step": 4630 + }, + { + "epoch": 1.163184641445511, + "grad_norm": 0.19116578996181488, + "learning_rate": 7.858099712192774e-06, + "loss": 0.0882, + "step": 4635 + }, + { + "epoch": 1.1644394253089905, + "grad_norm": 0.08863260596990585, + "learning_rate": 7.83824983755663e-06, + "loss": 0.0761, + "step": 4640 + }, + { + "epoch": 1.16569420917247, + "grad_norm": 0.22460100054740906, + "learning_rate": 7.818408894142351e-06, + "loss": 0.0905, + "step": 4645 + }, + { + "epoch": 1.1669489930359496, + "grad_norm": 0.2022630125284195, + "learning_rate": 7.798576963922347e-06, + "loss": 0.086, + "step": 4650 + }, + { + "epoch": 1.1682037768994291, + "grad_norm": 0.17606490850448608, + "learning_rate": 7.778754128831782e-06, + "loss": 0.0742, + "step": 4655 + }, + { + "epoch": 1.1694585607629087, + "grad_norm": 0.1387161910533905, + "learning_rate": 7.75894047076826e-06, + "loss": 0.0817, + "step": 4660 + }, + { + "epoch": 1.170713344626388, + "grad_norm": 0.09419666230678558, + "learning_rate": 7.739136071591455e-06, + "loss": 0.0918, + "step": 4665 + }, + { + "epoch": 1.1719681284898675, + "grad_norm": 0.30808964371681213, + "learning_rate": 7.719341013122795e-06, + "loss": 0.0724, + "step": 4670 + }, + { + "epoch": 1.173222912353347, + "grad_norm": 0.43276599049568176, + "learning_rate": 7.699555377145113e-06, + "loss": 0.09, + "step": 4675 + }, + { + "epoch": 1.1744776962168266, + "grad_norm": 0.1630372256040573, + "learning_rate": 7.679779245402321e-06, + "loss": 0.0795, + "step": 4680 + }, + { + "epoch": 1.1757324800803062, + "grad_norm": 0.1593324989080429, + "learning_rate": 7.660012699599062e-06, + "loss": 0.0891, + "step": 4685 + }, + { + "epoch": 1.1769872639437857, + "grad_norm": 0.19340068101882935, + "learning_rate": 7.640255821400364e-06, + "loss": 0.098, + "step": 4690 + }, + { + "epoch": 1.1782420478072653, + "grad_norm": 0.10026690363883972, + "learning_rate": 7.620508692431327e-06, + "loss": 0.0809, + "step": 4695 + }, + { + "epoch": 1.1794968316707446, + "grad_norm": 0.24709242582321167, + "learning_rate": 7.600771394276767e-06, + "loss": 0.09, + "step": 4700 + }, + { + "epoch": 1.1807516155342241, + "grad_norm": 0.10687977075576782, + "learning_rate": 7.5810440084808855e-06, + "loss": 0.0948, + "step": 4705 + }, + { + "epoch": 1.1820063993977037, + "grad_norm": 0.5234541893005371, + "learning_rate": 7.561326616546932e-06, + "loss": 0.0857, + "step": 4710 + }, + { + "epoch": 1.1832611832611832, + "grad_norm": 0.5169919729232788, + "learning_rate": 7.541619299936859e-06, + "loss": 0.0716, + "step": 4715 + }, + { + "epoch": 1.1845159671246628, + "grad_norm": 0.13368743658065796, + "learning_rate": 7.521922140071003e-06, + "loss": 0.082, + "step": 4720 + }, + { + "epoch": 1.1857707509881423, + "grad_norm": 0.32512664794921875, + "learning_rate": 7.50223521832773e-06, + "loss": 0.0923, + "step": 4725 + }, + { + "epoch": 1.1870255348516219, + "grad_norm": 0.15283085405826569, + "learning_rate": 7.482558616043123e-06, + "loss": 0.0913, + "step": 4730 + }, + { + "epoch": 1.1882803187151012, + "grad_norm": 0.1149832084774971, + "learning_rate": 7.462892414510605e-06, + "loss": 0.0795, + "step": 4735 + }, + { + "epoch": 1.1895351025785807, + "grad_norm": 0.1783827543258667, + "learning_rate": 7.443236694980649e-06, + "loss": 0.0922, + "step": 4740 + }, + { + "epoch": 1.1907898864420603, + "grad_norm": 0.3042134940624237, + "learning_rate": 7.423591538660416e-06, + "loss": 0.1, + "step": 4745 + }, + { + "epoch": 1.1920446703055398, + "grad_norm": 0.32708263397216797, + "learning_rate": 7.4039570267134266e-06, + "loss": 0.0874, + "step": 4750 + }, + { + "epoch": 1.1932994541690194, + "grad_norm": 0.3249336779117584, + "learning_rate": 7.384333240259216e-06, + "loss": 0.0855, + "step": 4755 + }, + { + "epoch": 1.194554238032499, + "grad_norm": 0.3625956177711487, + "learning_rate": 7.364720260373017e-06, + "loss": 0.0819, + "step": 4760 + }, + { + "epoch": 1.1958090218959785, + "grad_norm": 0.13294678926467896, + "learning_rate": 7.345118168085412e-06, + "loss": 0.0896, + "step": 4765 + }, + { + "epoch": 1.1970638057594578, + "grad_norm": 0.29470571875572205, + "learning_rate": 7.325527044382004e-06, + "loss": 0.0828, + "step": 4770 + }, + { + "epoch": 1.1983185896229374, + "grad_norm": 0.23991379141807556, + "learning_rate": 7.3059469702030725e-06, + "loss": 0.0769, + "step": 4775 + }, + { + "epoch": 1.199573373486417, + "grad_norm": 0.30648887157440186, + "learning_rate": 7.286378026443252e-06, + "loss": 0.0908, + "step": 4780 + }, + { + "epoch": 1.2008281573498965, + "grad_norm": 0.10455156862735748, + "learning_rate": 7.2668202939511946e-06, + "loss": 0.0842, + "step": 4785 + }, + { + "epoch": 1.202082941213376, + "grad_norm": 0.22708240151405334, + "learning_rate": 7.2472738535292295e-06, + "loss": 0.0885, + "step": 4790 + }, + { + "epoch": 1.2033377250768555, + "grad_norm": 0.22052530944347382, + "learning_rate": 7.227738785933025e-06, + "loss": 0.0749, + "step": 4795 + }, + { + "epoch": 1.204592508940335, + "grad_norm": 0.2625311315059662, + "learning_rate": 7.208215171871277e-06, + "loss": 0.0835, + "step": 4800 + }, + { + "epoch": 1.2058472928038144, + "grad_norm": 0.20223468542099, + "learning_rate": 7.188703092005353e-06, + "loss": 0.0855, + "step": 4805 + }, + { + "epoch": 1.207102076667294, + "grad_norm": 0.17523938417434692, + "learning_rate": 7.169202626948973e-06, + "loss": 0.0833, + "step": 4810 + }, + { + "epoch": 1.2083568605307735, + "grad_norm": 0.11394723504781723, + "learning_rate": 7.149713857267862e-06, + "loss": 0.0769, + "step": 4815 + }, + { + "epoch": 1.209611644394253, + "grad_norm": 0.28187689185142517, + "learning_rate": 7.130236863479434e-06, + "loss": 0.0908, + "step": 4820 + }, + { + "epoch": 1.2108664282577326, + "grad_norm": 0.22345809638500214, + "learning_rate": 7.110771726052446e-06, + "loss": 0.087, + "step": 4825 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.19860218465328217, + "learning_rate": 7.091318525406671e-06, + "loss": 0.078, + "step": 4830 + }, + { + "epoch": 1.2133759959846917, + "grad_norm": 0.3908574879169464, + "learning_rate": 7.071877341912576e-06, + "loss": 0.0926, + "step": 4835 + }, + { + "epoch": 1.2146307798481712, + "grad_norm": 0.18266786634922028, + "learning_rate": 7.052448255890958e-06, + "loss": 0.0858, + "step": 4840 + }, + { + "epoch": 1.2158855637116506, + "grad_norm": 0.2580263316631317, + "learning_rate": 7.033031347612655e-06, + "loss": 0.0777, + "step": 4845 + }, + { + "epoch": 1.2171403475751301, + "grad_norm": 0.22712433338165283, + "learning_rate": 7.013626697298182e-06, + "loss": 0.0771, + "step": 4850 + }, + { + "epoch": 1.2183951314386097, + "grad_norm": 0.5155333876609802, + "learning_rate": 6.994234385117414e-06, + "loss": 0.0997, + "step": 4855 + }, + { + "epoch": 1.2196499153020892, + "grad_norm": 0.16782841086387634, + "learning_rate": 6.974854491189243e-06, + "loss": 0.0891, + "step": 4860 + }, + { + "epoch": 1.2209046991655688, + "grad_norm": 0.2045108526945114, + "learning_rate": 6.95548709558127e-06, + "loss": 0.0846, + "step": 4865 + }, + { + "epoch": 1.2221594830290483, + "grad_norm": 0.46996012330055237, + "learning_rate": 6.9361322783094465e-06, + "loss": 0.0845, + "step": 4870 + }, + { + "epoch": 1.2234142668925279, + "grad_norm": 0.3139326870441437, + "learning_rate": 6.916790119337766e-06, + "loss": 0.0842, + "step": 4875 + }, + { + "epoch": 1.2246690507560072, + "grad_norm": 0.27236974239349365, + "learning_rate": 6.897460698577918e-06, + "loss": 0.0929, + "step": 4880 + }, + { + "epoch": 1.2259238346194867, + "grad_norm": 0.3481730818748474, + "learning_rate": 6.878144095888964e-06, + "loss": 0.0832, + "step": 4885 + }, + { + "epoch": 1.2271786184829663, + "grad_norm": 0.11182388663291931, + "learning_rate": 6.858840391077017e-06, + "loss": 0.0888, + "step": 4890 + }, + { + "epoch": 1.2284334023464458, + "grad_norm": 0.5058387517929077, + "learning_rate": 6.839549663894897e-06, + "loss": 0.0784, + "step": 4895 + }, + { + "epoch": 1.2296881862099254, + "grad_norm": 0.3258366584777832, + "learning_rate": 6.820271994041796e-06, + "loss": 0.0834, + "step": 4900 + }, + { + "epoch": 1.230942970073405, + "grad_norm": 0.11910035461187363, + "learning_rate": 6.8010074611629815e-06, + "loss": 0.0806, + "step": 4905 + }, + { + "epoch": 1.2321977539368845, + "grad_norm": 0.10795161128044128, + "learning_rate": 6.781756144849431e-06, + "loss": 0.0877, + "step": 4910 + }, + { + "epoch": 1.2334525378003638, + "grad_norm": 0.07859393954277039, + "learning_rate": 6.762518124637525e-06, + "loss": 0.0789, + "step": 4915 + }, + { + "epoch": 1.2347073216638433, + "grad_norm": 0.36102941632270813, + "learning_rate": 6.743293480008703e-06, + "loss": 0.0973, + "step": 4920 + }, + { + "epoch": 1.2359621055273229, + "grad_norm": 0.07983565330505371, + "learning_rate": 6.724082290389151e-06, + "loss": 0.0922, + "step": 4925 + }, + { + "epoch": 1.2372168893908024, + "grad_norm": 0.18033474683761597, + "learning_rate": 6.704884635149467e-06, + "loss": 0.083, + "step": 4930 + }, + { + "epoch": 1.238471673254282, + "grad_norm": 0.33377745747566223, + "learning_rate": 6.685700593604329e-06, + "loss": 0.0769, + "step": 4935 + }, + { + "epoch": 1.2397264571177615, + "grad_norm": 0.09846628457307816, + "learning_rate": 6.666530245012168e-06, + "loss": 0.0848, + "step": 4940 + }, + { + "epoch": 1.240981240981241, + "grad_norm": 0.28946545720100403, + "learning_rate": 6.647373668574841e-06, + "loss": 0.0846, + "step": 4945 + }, + { + "epoch": 1.2422360248447206, + "grad_norm": 0.2697726786136627, + "learning_rate": 6.628230943437319e-06, + "loss": 0.0741, + "step": 4950 + }, + { + "epoch": 1.2434908087082, + "grad_norm": 0.24468262493610382, + "learning_rate": 6.609102148687333e-06, + "loss": 0.084, + "step": 4955 + }, + { + "epoch": 1.2447455925716795, + "grad_norm": 0.7213346362113953, + "learning_rate": 6.589987363355068e-06, + "loss": 0.0909, + "step": 4960 + }, + { + "epoch": 1.246000376435159, + "grad_norm": 0.1924159675836563, + "learning_rate": 6.570886666412823e-06, + "loss": 0.0967, + "step": 4965 + }, + { + "epoch": 1.2472551602986386, + "grad_norm": 0.2051101177930832, + "learning_rate": 6.551800136774697e-06, + "loss": 0.0744, + "step": 4970 + }, + { + "epoch": 1.2485099441621181, + "grad_norm": 0.2706952393054962, + "learning_rate": 6.532727853296257e-06, + "loss": 0.0763, + "step": 4975 + }, + { + "epoch": 1.2497647280255977, + "grad_norm": 0.21088339388370514, + "learning_rate": 6.513669894774209e-06, + "loss": 0.0879, + "step": 4980 + }, + { + "epoch": 1.2510195118890772, + "grad_norm": 0.339167982339859, + "learning_rate": 6.494626339946075e-06, + "loss": 0.0867, + "step": 4985 + }, + { + "epoch": 1.2522742957525566, + "grad_norm": 0.2325342744588852, + "learning_rate": 6.47559726748987e-06, + "loss": 0.088, + "step": 4990 + }, + { + "epoch": 1.253529079616036, + "grad_norm": 0.24122120440006256, + "learning_rate": 6.456582756023781e-06, + "loss": 0.082, + "step": 4995 + }, + { + "epoch": 1.2547838634795156, + "grad_norm": 0.4205479025840759, + "learning_rate": 6.437582884105835e-06, + "loss": 0.0825, + "step": 5000 + }, + { + "epoch": 1.2560386473429952, + "grad_norm": 0.4158347547054291, + "learning_rate": 6.41859773023356e-06, + "loss": 0.0845, + "step": 5005 + }, + { + "epoch": 1.2572934312064747, + "grad_norm": 0.14699125289916992, + "learning_rate": 6.399627372843699e-06, + "loss": 0.0871, + "step": 5010 + }, + { + "epoch": 1.258548215069954, + "grad_norm": 0.15239457786083221, + "learning_rate": 6.380671890311852e-06, + "loss": 0.0886, + "step": 5015 + }, + { + "epoch": 1.2598029989334338, + "grad_norm": 0.35437726974487305, + "learning_rate": 6.361731360952169e-06, + "loss": 0.0941, + "step": 5020 + }, + { + "epoch": 1.2610577827969132, + "grad_norm": 0.19353091716766357, + "learning_rate": 6.342805863017012e-06, + "loss": 0.0911, + "step": 5025 + }, + { + "epoch": 1.2623125666603927, + "grad_norm": 0.19099637866020203, + "learning_rate": 6.323895474696651e-06, + "loss": 0.0929, + "step": 5030 + }, + { + "epoch": 1.2635673505238723, + "grad_norm": 0.25522175431251526, + "learning_rate": 6.305000274118926e-06, + "loss": 0.0896, + "step": 5035 + }, + { + "epoch": 1.2648221343873518, + "grad_norm": 0.35186290740966797, + "learning_rate": 6.286120339348935e-06, + "loss": 0.0717, + "step": 5040 + }, + { + "epoch": 1.2660769182508314, + "grad_norm": 0.2007788121700287, + "learning_rate": 6.267255748388697e-06, + "loss": 0.0861, + "step": 5045 + }, + { + "epoch": 1.2673317021143107, + "grad_norm": 0.07917312532663345, + "learning_rate": 6.248406579176838e-06, + "loss": 0.0829, + "step": 5050 + }, + { + "epoch": 1.2685864859777904, + "grad_norm": 0.2649421989917755, + "learning_rate": 6.229572909588282e-06, + "loss": 0.0802, + "step": 5055 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.13473555445671082, + "learning_rate": 6.2107548174339085e-06, + "loss": 0.0731, + "step": 5060 + }, + { + "epoch": 1.2710960537047493, + "grad_norm": 0.3079237639904022, + "learning_rate": 6.1919523804602335e-06, + "loss": 0.0874, + "step": 5065 + }, + { + "epoch": 1.2723508375682289, + "grad_norm": 0.1939849853515625, + "learning_rate": 6.173165676349103e-06, + "loss": 0.0782, + "step": 5070 + }, + { + "epoch": 1.2736056214317084, + "grad_norm": 0.3249664902687073, + "learning_rate": 6.15439478271736e-06, + "loss": 0.078, + "step": 5075 + }, + { + "epoch": 1.274860405295188, + "grad_norm": 0.21080322563648224, + "learning_rate": 6.135639777116526e-06, + "loss": 0.0777, + "step": 5080 + }, + { + "epoch": 1.2761151891586673, + "grad_norm": 0.24793751537799835, + "learning_rate": 6.116900737032484e-06, + "loss": 0.0737, + "step": 5085 + }, + { + "epoch": 1.277369973022147, + "grad_norm": 0.2202165126800537, + "learning_rate": 6.0981777398851504e-06, + "loss": 0.0823, + "step": 5090 + }, + { + "epoch": 1.2786247568856264, + "grad_norm": 0.1270621120929718, + "learning_rate": 6.079470863028164e-06, + "loss": 0.0787, + "step": 5095 + }, + { + "epoch": 1.279879540749106, + "grad_norm": 0.1102348044514656, + "learning_rate": 6.0607801837485665e-06, + "loss": 0.0749, + "step": 5100 + }, + { + "epoch": 1.2811343246125855, + "grad_norm": 0.21010182797908783, + "learning_rate": 6.042105779266479e-06, + "loss": 0.0831, + "step": 5105 + }, + { + "epoch": 1.282389108476065, + "grad_norm": 0.6771642565727234, + "learning_rate": 6.023447726734771e-06, + "loss": 0.0867, + "step": 5110 + }, + { + "epoch": 1.2836438923395446, + "grad_norm": 0.22212505340576172, + "learning_rate": 6.004806103238771e-06, + "loss": 0.0935, + "step": 5115 + }, + { + "epoch": 1.284898676203024, + "grad_norm": 0.18700364232063293, + "learning_rate": 5.986180985795927e-06, + "loss": 0.0867, + "step": 5120 + }, + { + "epoch": 1.2861534600665037, + "grad_norm": 0.13250069320201874, + "learning_rate": 5.967572451355486e-06, + "loss": 0.0897, + "step": 5125 + }, + { + "epoch": 1.287408243929983, + "grad_norm": 0.4712745249271393, + "learning_rate": 5.9489805767981845e-06, + "loss": 0.085, + "step": 5130 + }, + { + "epoch": 1.2886630277934625, + "grad_norm": 0.1509675532579422, + "learning_rate": 5.9304054389359354e-06, + "loss": 0.082, + "step": 5135 + }, + { + "epoch": 1.289917811656942, + "grad_norm": 0.37453317642211914, + "learning_rate": 5.911847114511497e-06, + "loss": 0.0786, + "step": 5140 + }, + { + "epoch": 1.2911725955204216, + "grad_norm": 0.18769773840904236, + "learning_rate": 5.893305680198175e-06, + "loss": 0.0829, + "step": 5145 + }, + { + "epoch": 1.2924273793839012, + "grad_norm": 0.4405484199523926, + "learning_rate": 5.874781212599475e-06, + "loss": 0.0804, + "step": 5150 + }, + { + "epoch": 1.2936821632473805, + "grad_norm": 0.20101556181907654, + "learning_rate": 5.856273788248819e-06, + "loss": 0.0827, + "step": 5155 + }, + { + "epoch": 1.2949369471108603, + "grad_norm": 0.2524980902671814, + "learning_rate": 5.837783483609214e-06, + "loss": 0.0766, + "step": 5160 + }, + { + "epoch": 1.2961917309743396, + "grad_norm": 0.2875005304813385, + "learning_rate": 5.819310375072935e-06, + "loss": 0.0839, + "step": 5165 + }, + { + "epoch": 1.2974465148378191, + "grad_norm": 0.3446800410747528, + "learning_rate": 5.800854538961213e-06, + "loss": 0.0906, + "step": 5170 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 0.15456603467464447, + "learning_rate": 5.782416051523909e-06, + "loss": 0.0811, + "step": 5175 + }, + { + "epoch": 1.2999560825647782, + "grad_norm": 0.3075577914714813, + "learning_rate": 5.763994988939223e-06, + "loss": 0.0808, + "step": 5180 + }, + { + "epoch": 1.3012108664282578, + "grad_norm": 0.4799324572086334, + "learning_rate": 5.745591427313365e-06, + "loss": 0.0871, + "step": 5185 + }, + { + "epoch": 1.3024656502917373, + "grad_norm": 0.1860457956790924, + "learning_rate": 5.727205442680218e-06, + "loss": 0.0799, + "step": 5190 + }, + { + "epoch": 1.3037204341552169, + "grad_norm": 0.2134033590555191, + "learning_rate": 5.708837111001069e-06, + "loss": 0.0885, + "step": 5195 + }, + { + "epoch": 1.3049752180186962, + "grad_norm": 0.40847694873809814, + "learning_rate": 5.690486508164268e-06, + "loss": 0.084, + "step": 5200 + }, + { + "epoch": 1.3062300018821758, + "grad_norm": 0.21664460003376007, + "learning_rate": 5.672153709984909e-06, + "loss": 0.085, + "step": 5205 + }, + { + "epoch": 1.3074847857456553, + "grad_norm": 0.22716949880123138, + "learning_rate": 5.653838792204538e-06, + "loss": 0.0807, + "step": 5210 + }, + { + "epoch": 1.3087395696091348, + "grad_norm": 0.2670564353466034, + "learning_rate": 5.6355418304908226e-06, + "loss": 0.086, + "step": 5215 + }, + { + "epoch": 1.3099943534726144, + "grad_norm": 0.18262840807437897, + "learning_rate": 5.617262900437239e-06, + "loss": 0.0784, + "step": 5220 + }, + { + "epoch": 1.311249137336094, + "grad_norm": 0.41220247745513916, + "learning_rate": 5.599002077562779e-06, + "loss": 0.0788, + "step": 5225 + }, + { + "epoch": 1.3125039211995735, + "grad_norm": 0.41886112093925476, + "learning_rate": 5.580759437311624e-06, + "loss": 0.0912, + "step": 5230 + }, + { + "epoch": 1.3137587050630528, + "grad_norm": 0.30091163516044617, + "learning_rate": 5.562535055052818e-06, + "loss": 0.084, + "step": 5235 + }, + { + "epoch": 1.3150134889265324, + "grad_norm": 0.38127401471138, + "learning_rate": 5.544329006079987e-06, + "loss": 0.0769, + "step": 5240 + }, + { + "epoch": 1.316268272790012, + "grad_norm": 0.13492530584335327, + "learning_rate": 5.526141365611018e-06, + "loss": 0.0795, + "step": 5245 + }, + { + "epoch": 1.3175230566534915, + "grad_norm": 0.27479758858680725, + "learning_rate": 5.507972208787728e-06, + "loss": 0.0838, + "step": 5250 + }, + { + "epoch": 1.318777840516971, + "grad_norm": 0.1254061758518219, + "learning_rate": 5.489821610675579e-06, + "loss": 0.0773, + "step": 5255 + }, + { + "epoch": 1.3200326243804505, + "grad_norm": 0.15454204380512238, + "learning_rate": 5.471689646263358e-06, + "loss": 0.0793, + "step": 5260 + }, + { + "epoch": 1.32128740824393, + "grad_norm": 0.2265045940876007, + "learning_rate": 5.453576390462861e-06, + "loss": 0.0881, + "step": 5265 + }, + { + "epoch": 1.3225421921074094, + "grad_norm": 0.14053259789943695, + "learning_rate": 5.435481918108603e-06, + "loss": 0.0836, + "step": 5270 + }, + { + "epoch": 1.323796975970889, + "grad_norm": 0.15693189203739166, + "learning_rate": 5.41740630395748e-06, + "loss": 0.0816, + "step": 5275 + }, + { + "epoch": 1.3250517598343685, + "grad_norm": 0.15856729447841644, + "learning_rate": 5.399349622688479e-06, + "loss": 0.0702, + "step": 5280 + }, + { + "epoch": 1.326306543697848, + "grad_norm": 0.23238258063793182, + "learning_rate": 5.3813119489023766e-06, + "loss": 0.0839, + "step": 5285 + }, + { + "epoch": 1.3275613275613276, + "grad_norm": 0.08567748963832855, + "learning_rate": 5.363293357121422e-06, + "loss": 0.0899, + "step": 5290 + }, + { + "epoch": 1.3288161114248072, + "grad_norm": 0.0990014597773552, + "learning_rate": 5.345293921789e-06, + "loss": 0.0812, + "step": 5295 + }, + { + "epoch": 1.3300708952882867, + "grad_norm": 0.2629724144935608, + "learning_rate": 5.32731371726938e-06, + "loss": 0.08, + "step": 5300 + }, + { + "epoch": 1.331325679151766, + "grad_norm": 0.22977909445762634, + "learning_rate": 5.309352817847374e-06, + "loss": 0.0822, + "step": 5305 + }, + { + "epoch": 1.3325804630152456, + "grad_norm": 0.1563161015510559, + "learning_rate": 5.291411297728027e-06, + "loss": 0.0814, + "step": 5310 + }, + { + "epoch": 1.3338352468787251, + "grad_norm": 0.0872747004032135, + "learning_rate": 5.273489231036321e-06, + "loss": 0.0872, + "step": 5315 + }, + { + "epoch": 1.3350900307422047, + "grad_norm": 0.24948804080486298, + "learning_rate": 5.255586691816874e-06, + "loss": 0.0708, + "step": 5320 + }, + { + "epoch": 1.3363448146056842, + "grad_norm": 0.17048531770706177, + "learning_rate": 5.237703754033616e-06, + "loss": 0.0789, + "step": 5325 + }, + { + "epoch": 1.3375995984691638, + "grad_norm": 0.33402279019355774, + "learning_rate": 5.219840491569503e-06, + "loss": 0.0754, + "step": 5330 + }, + { + "epoch": 1.3388543823326433, + "grad_norm": 0.25611111521720886, + "learning_rate": 5.2019969782262046e-06, + "loss": 0.0792, + "step": 5335 + }, + { + "epoch": 1.3401091661961226, + "grad_norm": 0.392952561378479, + "learning_rate": 5.184173287723782e-06, + "loss": 0.0684, + "step": 5340 + }, + { + "epoch": 1.3413639500596022, + "grad_norm": 0.11270631104707718, + "learning_rate": 5.166369493700412e-06, + "loss": 0.0853, + "step": 5345 + }, + { + "epoch": 1.3426187339230817, + "grad_norm": 0.3116929531097412, + "learning_rate": 5.148585669712074e-06, + "loss": 0.0821, + "step": 5350 + }, + { + "epoch": 1.3438735177865613, + "grad_norm": 0.0726916715502739, + "learning_rate": 5.130821889232228e-06, + "loss": 0.092, + "step": 5355 + }, + { + "epoch": 1.3451283016500408, + "grad_norm": 0.06830952316522598, + "learning_rate": 5.113078225651529e-06, + "loss": 0.085, + "step": 5360 + }, + { + "epoch": 1.3463830855135204, + "grad_norm": 0.18756920099258423, + "learning_rate": 5.095354752277526e-06, + "loss": 0.0887, + "step": 5365 + }, + { + "epoch": 1.347637869377, + "grad_norm": 0.29784929752349854, + "learning_rate": 5.0776515423343445e-06, + "loss": 0.0919, + "step": 5370 + }, + { + "epoch": 1.3488926532404792, + "grad_norm": 0.24415934085845947, + "learning_rate": 5.059968668962401e-06, + "loss": 0.0904, + "step": 5375 + }, + { + "epoch": 1.3501474371039588, + "grad_norm": 0.10524599254131317, + "learning_rate": 5.042306205218082e-06, + "loss": 0.0899, + "step": 5380 + }, + { + "epoch": 1.3514022209674383, + "grad_norm": 0.13240844011306763, + "learning_rate": 5.024664224073454e-06, + "loss": 0.0838, + "step": 5385 + }, + { + "epoch": 1.3526570048309179, + "grad_norm": 0.17067807912826538, + "learning_rate": 5.007042798415969e-06, + "loss": 0.0824, + "step": 5390 + }, + { + "epoch": 1.3539117886943974, + "grad_norm": 0.1325388103723526, + "learning_rate": 4.989442001048151e-06, + "loss": 0.0839, + "step": 5395 + }, + { + "epoch": 1.355166572557877, + "grad_norm": 0.4756743907928467, + "learning_rate": 4.971861904687283e-06, + "loss": 0.0775, + "step": 5400 + }, + { + "epoch": 1.3564213564213565, + "grad_norm": 0.2643389105796814, + "learning_rate": 4.954302581965143e-06, + "loss": 0.089, + "step": 5405 + }, + { + "epoch": 1.3576761402848359, + "grad_norm": 0.33239737153053284, + "learning_rate": 4.93676410542768e-06, + "loss": 0.0945, + "step": 5410 + }, + { + "epoch": 1.3589309241483154, + "grad_norm": 0.315899133682251, + "learning_rate": 4.919246547534709e-06, + "loss": 0.0858, + "step": 5415 + }, + { + "epoch": 1.360185708011795, + "grad_norm": 0.7093632817268372, + "learning_rate": 4.901749980659617e-06, + "loss": 0.0778, + "step": 5420 + }, + { + "epoch": 1.3614404918752745, + "grad_norm": 0.17570814490318298, + "learning_rate": 4.884274477089085e-06, + "loss": 0.0856, + "step": 5425 + }, + { + "epoch": 1.362695275738754, + "grad_norm": 0.1181860864162445, + "learning_rate": 4.866820109022752e-06, + "loss": 0.0844, + "step": 5430 + }, + { + "epoch": 1.3639500596022336, + "grad_norm": 0.18417999148368835, + "learning_rate": 4.84938694857295e-06, + "loss": 0.0687, + "step": 5435 + }, + { + "epoch": 1.3652048434657131, + "grad_norm": 0.10456906259059906, + "learning_rate": 4.831975067764387e-06, + "loss": 0.0765, + "step": 5440 + }, + { + "epoch": 1.3664596273291925, + "grad_norm": 0.16597416996955872, + "learning_rate": 4.814584538533848e-06, + "loss": 0.0748, + "step": 5445 + }, + { + "epoch": 1.367714411192672, + "grad_norm": 0.08436016738414764, + "learning_rate": 4.797215432729913e-06, + "loss": 0.0748, + "step": 5450 + }, + { + "epoch": 1.3689691950561516, + "grad_norm": 0.2234387844800949, + "learning_rate": 4.779867822112658e-06, + "loss": 0.0795, + "step": 5455 + }, + { + "epoch": 1.370223978919631, + "grad_norm": 0.2501266300678253, + "learning_rate": 4.762541778353337e-06, + "loss": 0.0785, + "step": 5460 + }, + { + "epoch": 1.3714787627831106, + "grad_norm": 0.14667633175849915, + "learning_rate": 4.745237373034103e-06, + "loss": 0.0836, + "step": 5465 + }, + { + "epoch": 1.3727335466465902, + "grad_norm": 0.12082650512456894, + "learning_rate": 4.727954677647724e-06, + "loss": 0.0827, + "step": 5470 + }, + { + "epoch": 1.3739883305100697, + "grad_norm": 0.25682708621025085, + "learning_rate": 4.7106937635972565e-06, + "loss": 0.0806, + "step": 5475 + }, + { + "epoch": 1.375243114373549, + "grad_norm": 0.11401407420635223, + "learning_rate": 4.693454702195784e-06, + "loss": 0.085, + "step": 5480 + }, + { + "epoch": 1.3764978982370286, + "grad_norm": 0.13626615703105927, + "learning_rate": 4.676237564666095e-06, + "loss": 0.072, + "step": 5485 + }, + { + "epoch": 1.3777526821005082, + "grad_norm": 0.274503231048584, + "learning_rate": 4.659042422140399e-06, + "loss": 0.0869, + "step": 5490 + }, + { + "epoch": 1.3790074659639877, + "grad_norm": 0.3412606716156006, + "learning_rate": 4.6418693456600424e-06, + "loss": 0.0894, + "step": 5495 + }, + { + "epoch": 1.3802622498274673, + "grad_norm": 0.17439858615398407, + "learning_rate": 4.62471840617521e-06, + "loss": 0.0855, + "step": 5500 + }, + { + "epoch": 1.3815170336909468, + "grad_norm": 0.2983112037181854, + "learning_rate": 4.607589674544603e-06, + "loss": 0.0829, + "step": 5505 + }, + { + "epoch": 1.3827718175544264, + "grad_norm": 0.32758232951164246, + "learning_rate": 4.590483221535198e-06, + "loss": 0.0889, + "step": 5510 + }, + { + "epoch": 1.3840266014179057, + "grad_norm": 0.34168171882629395, + "learning_rate": 4.573399117821922e-06, + "loss": 0.0872, + "step": 5515 + }, + { + "epoch": 1.3852813852813852, + "grad_norm": 0.21264196932315826, + "learning_rate": 4.556337433987359e-06, + "loss": 0.0772, + "step": 5520 + }, + { + "epoch": 1.3865361691448648, + "grad_norm": 0.11897042393684387, + "learning_rate": 4.539298240521463e-06, + "loss": 0.0796, + "step": 5525 + }, + { + "epoch": 1.3877909530083443, + "grad_norm": 0.20385603606700897, + "learning_rate": 4.522281607821288e-06, + "loss": 0.0698, + "step": 5530 + }, + { + "epoch": 1.3890457368718239, + "grad_norm": 0.3090057671070099, + "learning_rate": 4.505287606190658e-06, + "loss": 0.0735, + "step": 5535 + }, + { + "epoch": 1.3903005207353034, + "grad_norm": 0.21723704040050507, + "learning_rate": 4.488316305839911e-06, + "loss": 0.0877, + "step": 5540 + }, + { + "epoch": 1.391555304598783, + "grad_norm": 0.2535438537597656, + "learning_rate": 4.471367776885589e-06, + "loss": 0.0802, + "step": 5545 + }, + { + "epoch": 1.3928100884622623, + "grad_norm": 0.2798052728176117, + "learning_rate": 4.454442089350151e-06, + "loss": 0.0828, + "step": 5550 + }, + { + "epoch": 1.3940648723257418, + "grad_norm": 0.1968078762292862, + "learning_rate": 4.437539313161697e-06, + "loss": 0.0878, + "step": 5555 + }, + { + "epoch": 1.3953196561892214, + "grad_norm": 0.3160542845726013, + "learning_rate": 4.420659518153667e-06, + "loss": 0.0843, + "step": 5560 + }, + { + "epoch": 1.396574440052701, + "grad_norm": 0.10868985950946808, + "learning_rate": 4.403802774064548e-06, + "loss": 0.0898, + "step": 5565 + }, + { + "epoch": 1.3978292239161805, + "grad_norm": 0.18841342628002167, + "learning_rate": 4.386969150537593e-06, + "loss": 0.0839, + "step": 5570 + }, + { + "epoch": 1.39908400777966, + "grad_norm": 0.20673134922981262, + "learning_rate": 4.370158717120544e-06, + "loss": 0.0807, + "step": 5575 + }, + { + "epoch": 1.4003387916431396, + "grad_norm": 0.2445622682571411, + "learning_rate": 4.35337154326532e-06, + "loss": 0.0825, + "step": 5580 + }, + { + "epoch": 1.401593575506619, + "grad_norm": 0.2141215056180954, + "learning_rate": 4.336607698327755e-06, + "loss": 0.0852, + "step": 5585 + }, + { + "epoch": 1.4028483593700984, + "grad_norm": 0.2787396311759949, + "learning_rate": 4.3198672515672925e-06, + "loss": 0.0829, + "step": 5590 + }, + { + "epoch": 1.404103143233578, + "grad_norm": 0.24689462780952454, + "learning_rate": 4.303150272146706e-06, + "loss": 0.0864, + "step": 5595 + }, + { + "epoch": 1.4053579270970575, + "grad_norm": 0.17422537505626678, + "learning_rate": 4.286456829131821e-06, + "loss": 0.086, + "step": 5600 + }, + { + "epoch": 1.406612710960537, + "grad_norm": 0.20011769235134125, + "learning_rate": 4.269786991491222e-06, + "loss": 0.0812, + "step": 5605 + }, + { + "epoch": 1.4078674948240166, + "grad_norm": 0.2414385825395584, + "learning_rate": 4.253140828095964e-06, + "loss": 0.0834, + "step": 5610 + }, + { + "epoch": 1.4091222786874962, + "grad_norm": 0.2885581851005554, + "learning_rate": 4.236518407719289e-06, + "loss": 0.0765, + "step": 5615 + }, + { + "epoch": 1.4103770625509755, + "grad_norm": 0.20998211205005646, + "learning_rate": 4.219919799036359e-06, + "loss": 0.0748, + "step": 5620 + }, + { + "epoch": 1.411631846414455, + "grad_norm": 0.26928192377090454, + "learning_rate": 4.203345070623947e-06, + "loss": 0.0923, + "step": 5625 + }, + { + "epoch": 1.4128866302779346, + "grad_norm": 0.28317445516586304, + "learning_rate": 4.186794290960162e-06, + "loss": 0.08, + "step": 5630 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.08973658084869385, + "learning_rate": 4.170267528424185e-06, + "loss": 0.0775, + "step": 5635 + }, + { + "epoch": 1.4153961980048937, + "grad_norm": 0.4094654321670532, + "learning_rate": 4.153764851295954e-06, + "loss": 0.0969, + "step": 5640 + }, + { + "epoch": 1.4166509818683732, + "grad_norm": 0.17649400234222412, + "learning_rate": 4.137286327755913e-06, + "loss": 0.0809, + "step": 5645 + }, + { + "epoch": 1.4179057657318528, + "grad_norm": 0.1127919927239418, + "learning_rate": 4.120832025884705e-06, + "loss": 0.0782, + "step": 5650 + }, + { + "epoch": 1.419160549595332, + "grad_norm": 0.5189236998558044, + "learning_rate": 4.104402013662901e-06, + "loss": 0.0867, + "step": 5655 + }, + { + "epoch": 1.4204153334588117, + "grad_norm": 0.4165945053100586, + "learning_rate": 4.0879963589707305e-06, + "loss": 0.0836, + "step": 5660 + }, + { + "epoch": 1.4216701173222912, + "grad_norm": 0.28989869356155396, + "learning_rate": 4.071615129587787e-06, + "loss": 0.0818, + "step": 5665 + }, + { + "epoch": 1.4229249011857708, + "grad_norm": 0.101304791867733, + "learning_rate": 4.055258393192746e-06, + "loss": 0.0827, + "step": 5670 + }, + { + "epoch": 1.4241796850492503, + "grad_norm": 0.2473038285970688, + "learning_rate": 4.038926217363089e-06, + "loss": 0.0805, + "step": 5675 + }, + { + "epoch": 1.4254344689127298, + "grad_norm": 0.2448158711194992, + "learning_rate": 4.022618669574839e-06, + "loss": 0.0763, + "step": 5680 + }, + { + "epoch": 1.4266892527762094, + "grad_norm": 0.5424969792366028, + "learning_rate": 4.006335817202256e-06, + "loss": 0.0836, + "step": 5685 + }, + { + "epoch": 1.4279440366396887, + "grad_norm": 0.2771781086921692, + "learning_rate": 3.990077727517573e-06, + "loss": 0.0783, + "step": 5690 + }, + { + "epoch": 1.4291988205031683, + "grad_norm": 0.05690968781709671, + "learning_rate": 3.973844467690727e-06, + "loss": 0.0729, + "step": 5695 + }, + { + "epoch": 1.4304536043666478, + "grad_norm": 0.26809534430503845, + "learning_rate": 3.957636104789056e-06, + "loss": 0.0766, + "step": 5700 + }, + { + "epoch": 1.4317083882301274, + "grad_norm": 0.31916674971580505, + "learning_rate": 3.94145270577705e-06, + "loss": 0.0988, + "step": 5705 + }, + { + "epoch": 1.432963172093607, + "grad_norm": 0.5278578996658325, + "learning_rate": 3.925294337516051e-06, + "loss": 0.0853, + "step": 5710 + }, + { + "epoch": 1.4342179559570865, + "grad_norm": 0.15941303968429565, + "learning_rate": 3.909161066763999e-06, + "loss": 0.0872, + "step": 5715 + }, + { + "epoch": 1.435472739820566, + "grad_norm": 0.46407434344291687, + "learning_rate": 3.893052960175128e-06, + "loss": 0.0746, + "step": 5720 + }, + { + "epoch": 1.4367275236840453, + "grad_norm": 0.13292035460472107, + "learning_rate": 3.876970084299722e-06, + "loss": 0.0829, + "step": 5725 + }, + { + "epoch": 1.4379823075475249, + "grad_norm": 0.18183240294456482, + "learning_rate": 3.860912505583819e-06, + "loss": 0.0797, + "step": 5730 + }, + { + "epoch": 1.4392370914110044, + "grad_norm": 0.16055604815483093, + "learning_rate": 3.844880290368935e-06, + "loss": 0.0805, + "step": 5735 + }, + { + "epoch": 1.440491875274484, + "grad_norm": 0.28802794218063354, + "learning_rate": 3.828873504891813e-06, + "loss": 0.0795, + "step": 5740 + }, + { + "epoch": 1.4417466591379635, + "grad_norm": 0.11848758161067963, + "learning_rate": 3.8128922152841188e-06, + "loss": 0.0856, + "step": 5745 + }, + { + "epoch": 1.443001443001443, + "grad_norm": 0.131498321890831, + "learning_rate": 3.7969364875721914e-06, + "loss": 0.084, + "step": 5750 + }, + { + "epoch": 1.4442562268649226, + "grad_norm": 0.1645013391971588, + "learning_rate": 3.78100638767676e-06, + "loss": 0.0635, + "step": 5755 + }, + { + "epoch": 1.445511010728402, + "grad_norm": 0.1245434433221817, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.0708, + "step": 5760 + }, + { + "epoch": 1.4467657945918815, + "grad_norm": 0.5209866762161255, + "learning_rate": 3.7492233344886073e-06, + "loss": 0.0801, + "step": 5765 + }, + { + "epoch": 1.448020578455361, + "grad_norm": 0.19441930949687958, + "learning_rate": 3.7333705125068576e-06, + "loss": 0.0711, + "step": 5770 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.27351874113082886, + "learning_rate": 3.71754358096299e-06, + "loss": 0.071, + "step": 5775 + }, + { + "epoch": 1.4505301461823201, + "grad_norm": 0.2211633026599884, + "learning_rate": 3.7017426052456086e-06, + "loss": 0.0878, + "step": 5780 + }, + { + "epoch": 1.4517849300457997, + "grad_norm": 0.24535594880580902, + "learning_rate": 3.685967650636095e-06, + "loss": 0.0805, + "step": 5785 + }, + { + "epoch": 1.4530397139092792, + "grad_norm": 0.24481262266635895, + "learning_rate": 3.6702187823083147e-06, + "loss": 0.0893, + "step": 5790 + }, + { + "epoch": 1.4542944977727585, + "grad_norm": 0.16357897222042084, + "learning_rate": 3.6544960653283544e-06, + "loss": 0.0862, + "step": 5795 + }, + { + "epoch": 1.455549281636238, + "grad_norm": 0.34867826104164124, + "learning_rate": 3.6387995646542727e-06, + "loss": 0.0774, + "step": 5800 + }, + { + "epoch": 1.4568040654997176, + "grad_norm": 0.1586691290140152, + "learning_rate": 3.6231293451357994e-06, + "loss": 0.0822, + "step": 5805 + }, + { + "epoch": 1.4580588493631972, + "grad_norm": 0.1658533662557602, + "learning_rate": 3.6074854715140983e-06, + "loss": 0.0921, + "step": 5810 + }, + { + "epoch": 1.4593136332266767, + "grad_norm": 0.26845037937164307, + "learning_rate": 3.591868008421472e-06, + "loss": 0.075, + "step": 5815 + }, + { + "epoch": 1.4605684170901563, + "grad_norm": 0.2408105581998825, + "learning_rate": 3.5762770203811225e-06, + "loss": 0.0927, + "step": 5820 + }, + { + "epoch": 1.4618232009536358, + "grad_norm": 0.15939578413963318, + "learning_rate": 3.560712571806858e-06, + "loss": 0.0794, + "step": 5825 + }, + { + "epoch": 1.4630779848171152, + "grad_norm": 0.07994109392166138, + "learning_rate": 3.5451747270028527e-06, + "loss": 0.0799, + "step": 5830 + }, + { + "epoch": 1.4643327686805947, + "grad_norm": 0.09967489540576935, + "learning_rate": 3.5296635501633558e-06, + "loss": 0.08, + "step": 5835 + }, + { + "epoch": 1.4655875525440742, + "grad_norm": 0.1677519679069519, + "learning_rate": 3.5141791053724405e-06, + "loss": 0.0809, + "step": 5840 + }, + { + "epoch": 1.4668423364075538, + "grad_norm": 0.21794256567955017, + "learning_rate": 3.4987214566037477e-06, + "loss": 0.0792, + "step": 5845 + }, + { + "epoch": 1.4680971202710333, + "grad_norm": 0.2627575993537903, + "learning_rate": 3.483290667720196e-06, + "loss": 0.0942, + "step": 5850 + }, + { + "epoch": 1.4693519041345129, + "grad_norm": 0.15493802726268768, + "learning_rate": 3.4678868024737456e-06, + "loss": 0.0864, + "step": 5855 + }, + { + "epoch": 1.4706066879979924, + "grad_norm": 0.44874611496925354, + "learning_rate": 3.452509924505113e-06, + "loss": 0.0773, + "step": 5860 + }, + { + "epoch": 1.4718614718614718, + "grad_norm": 0.1353214532136917, + "learning_rate": 3.437160097343526e-06, + "loss": 0.0789, + "step": 5865 + }, + { + "epoch": 1.4731162557249513, + "grad_norm": 0.14763125777244568, + "learning_rate": 3.4218373844064433e-06, + "loss": 0.0824, + "step": 5870 + }, + { + "epoch": 1.4743710395884309, + "grad_norm": 0.21625354886054993, + "learning_rate": 3.4065418489993118e-06, + "loss": 0.0736, + "step": 5875 + }, + { + "epoch": 1.4756258234519104, + "grad_norm": 0.24612949788570404, + "learning_rate": 3.3912735543152864e-06, + "loss": 0.0773, + "step": 5880 + }, + { + "epoch": 1.47688060731539, + "grad_norm": 0.1752692461013794, + "learning_rate": 3.376032563434979e-06, + "loss": 0.0826, + "step": 5885 + }, + { + "epoch": 1.4781353911788695, + "grad_norm": 0.2644670605659485, + "learning_rate": 3.3608189393262037e-06, + "loss": 0.0869, + "step": 5890 + }, + { + "epoch": 1.479390175042349, + "grad_norm": 0.20996436476707458, + "learning_rate": 3.345632744843702e-06, + "loss": 0.0902, + "step": 5895 + }, + { + "epoch": 1.4806449589058284, + "grad_norm": 0.2922036647796631, + "learning_rate": 3.3304740427288886e-06, + "loss": 0.0896, + "step": 5900 + }, + { + "epoch": 1.481899742769308, + "grad_norm": 0.37115779519081116, + "learning_rate": 3.3153428956096046e-06, + "loss": 0.0876, + "step": 5905 + }, + { + "epoch": 1.4831545266327875, + "grad_norm": 0.23147353529930115, + "learning_rate": 3.3002393659998357e-06, + "loss": 0.0778, + "step": 5910 + }, + { + "epoch": 1.484409310496267, + "grad_norm": 0.19222880899906158, + "learning_rate": 3.2851635162994788e-06, + "loss": 0.0884, + "step": 5915 + }, + { + "epoch": 1.4856640943597466, + "grad_norm": 0.1700582355260849, + "learning_rate": 3.27011540879406e-06, + "loss": 0.0831, + "step": 5920 + }, + { + "epoch": 1.486918878223226, + "grad_norm": 0.10918935388326645, + "learning_rate": 3.2550951056545e-06, + "loss": 0.0816, + "step": 5925 + }, + { + "epoch": 1.4881736620867057, + "grad_norm": 0.21468310058116913, + "learning_rate": 3.2401026689368363e-06, + "loss": 0.0807, + "step": 5930 + }, + { + "epoch": 1.489428445950185, + "grad_norm": 0.3127315044403076, + "learning_rate": 3.2251381605819876e-06, + "loss": 0.0958, + "step": 5935 + }, + { + "epoch": 1.4906832298136645, + "grad_norm": 0.21210235357284546, + "learning_rate": 3.210201642415477e-06, + "loss": 0.0748, + "step": 5940 + }, + { + "epoch": 1.491938013677144, + "grad_norm": 0.20049948990345, + "learning_rate": 3.1952931761471893e-06, + "loss": 0.0795, + "step": 5945 + }, + { + "epoch": 1.4931927975406236, + "grad_norm": 0.15716488659381866, + "learning_rate": 3.180412823371123e-06, + "loss": 0.0909, + "step": 5950 + }, + { + "epoch": 1.4944475814041032, + "grad_norm": 0.32477137446403503, + "learning_rate": 3.1655606455651134e-06, + "loss": 0.0792, + "step": 5955 + }, + { + "epoch": 1.4957023652675827, + "grad_norm": 0.17082227766513824, + "learning_rate": 3.1507367040905943e-06, + "loss": 0.0898, + "step": 5960 + }, + { + "epoch": 1.4969571491310623, + "grad_norm": 0.1650392711162567, + "learning_rate": 3.135941060192348e-06, + "loss": 0.0843, + "step": 5965 + }, + { + "epoch": 1.4982119329945416, + "grad_norm": 0.10460920631885529, + "learning_rate": 3.121173774998245e-06, + "loss": 0.0879, + "step": 5970 + }, + { + "epoch": 1.4994667168580211, + "grad_norm": 0.13847452402114868, + "learning_rate": 3.106434909518985e-06, + "loss": 0.0928, + "step": 5975 + }, + { + "epoch": 1.5007215007215007, + "grad_norm": 0.1215418130159378, + "learning_rate": 3.091724524647861e-06, + "loss": 0.0807, + "step": 5980 + }, + { + "epoch": 1.5019762845849802, + "grad_norm": 0.26588910818099976, + "learning_rate": 3.0770426811604946e-06, + "loss": 0.081, + "step": 5985 + }, + { + "epoch": 1.5032310684484598, + "grad_norm": 0.4260256290435791, + "learning_rate": 3.0623894397145837e-06, + "loss": 0.0778, + "step": 5990 + }, + { + "epoch": 1.5044858523119393, + "grad_norm": 0.20512565970420837, + "learning_rate": 3.0477648608496726e-06, + "loss": 0.0746, + "step": 5995 + }, + { + "epoch": 1.5057406361754189, + "grad_norm": 0.3236079812049866, + "learning_rate": 3.0331690049868733e-06, + "loss": 0.0818, + "step": 6000 + }, + { + "epoch": 1.5069954200388982, + "grad_norm": 0.5079814791679382, + "learning_rate": 3.018601932428632e-06, + "loss": 0.0782, + "step": 6005 + }, + { + "epoch": 1.508250203902378, + "grad_norm": 0.15258589386940002, + "learning_rate": 3.004063703358484e-06, + "loss": 0.0885, + "step": 6010 + }, + { + "epoch": 1.5095049877658573, + "grad_norm": 0.3337784707546234, + "learning_rate": 2.9895543778407875e-06, + "loss": 0.0973, + "step": 6015 + }, + { + "epoch": 1.5107597716293368, + "grad_norm": 0.1551678329706192, + "learning_rate": 2.9750740158205005e-06, + "loss": 0.0877, + "step": 6020 + }, + { + "epoch": 1.5120145554928164, + "grad_norm": 0.2518864870071411, + "learning_rate": 2.960622677122903e-06, + "loss": 0.0777, + "step": 6025 + }, + { + "epoch": 1.513269339356296, + "grad_norm": 0.22310376167297363, + "learning_rate": 2.9462004214533803e-06, + "loss": 0.0812, + "step": 6030 + }, + { + "epoch": 1.5145241232197755, + "grad_norm": 0.23948289453983307, + "learning_rate": 2.9318073083971486e-06, + "loss": 0.0879, + "step": 6035 + }, + { + "epoch": 1.5157789070832548, + "grad_norm": 0.24216221272945404, + "learning_rate": 2.9174433974190365e-06, + "loss": 0.0858, + "step": 6040 + }, + { + "epoch": 1.5170336909467346, + "grad_norm": 0.19065602123737335, + "learning_rate": 2.9031087478632116e-06, + "loss": 0.0868, + "step": 6045 + }, + { + "epoch": 1.518288474810214, + "grad_norm": 0.2692478597164154, + "learning_rate": 2.8888034189529524e-06, + "loss": 0.0737, + "step": 6050 + }, + { + "epoch": 1.5195432586736934, + "grad_norm": 0.20995523035526276, + "learning_rate": 2.874527469790408e-06, + "loss": 0.0824, + "step": 6055 + }, + { + "epoch": 1.520798042537173, + "grad_norm": 0.10530667006969452, + "learning_rate": 2.860280959356336e-06, + "loss": 0.0783, + "step": 6060 + }, + { + "epoch": 1.5220528264006525, + "grad_norm": 0.39904889464378357, + "learning_rate": 2.846063946509868e-06, + "loss": 0.0917, + "step": 6065 + }, + { + "epoch": 1.523307610264132, + "grad_norm": 0.2294001579284668, + "learning_rate": 2.8318764899882745e-06, + "loss": 0.0794, + "step": 6070 + }, + { + "epoch": 1.5245623941276114, + "grad_norm": 0.31115418672561646, + "learning_rate": 2.8177186484067143e-06, + "loss": 0.0803, + "step": 6075 + }, + { + "epoch": 1.5258171779910912, + "grad_norm": 0.21022425591945648, + "learning_rate": 2.803590480257985e-06, + "loss": 0.0809, + "step": 6080 + }, + { + "epoch": 1.5270719618545705, + "grad_norm": 0.4131692051887512, + "learning_rate": 2.7894920439122907e-06, + "loss": 0.0776, + "step": 6085 + }, + { + "epoch": 1.52832674571805, + "grad_norm": 0.08908002078533173, + "learning_rate": 2.77542339761701e-06, + "loss": 0.0797, + "step": 6090 + }, + { + "epoch": 1.5295815295815296, + "grad_norm": 0.13143645226955414, + "learning_rate": 2.7613845994964296e-06, + "loss": 0.0838, + "step": 6095 + }, + { + "epoch": 1.5308363134450091, + "grad_norm": 0.3454774022102356, + "learning_rate": 2.7473757075515305e-06, + "loss": 0.0785, + "step": 6100 + }, + { + "epoch": 1.5320910973084887, + "grad_norm": 0.2272334098815918, + "learning_rate": 2.7333967796597317e-06, + "loss": 0.0773, + "step": 6105 + }, + { + "epoch": 1.533345881171968, + "grad_norm": 0.2906242311000824, + "learning_rate": 2.7194478735746543e-06, + "loss": 0.0897, + "step": 6110 + }, + { + "epoch": 1.5346006650354478, + "grad_norm": 0.21385426819324493, + "learning_rate": 2.70552904692589e-06, + "loss": 0.0788, + "step": 6115 + }, + { + "epoch": 1.5358554488989271, + "grad_norm": 0.2976323962211609, + "learning_rate": 2.691640357218759e-06, + "loss": 0.0921, + "step": 6120 + }, + { + "epoch": 1.5371102327624067, + "grad_norm": 0.20897260308265686, + "learning_rate": 2.6777818618340667e-06, + "loss": 0.0831, + "step": 6125 + }, + { + "epoch": 1.5383650166258862, + "grad_norm": 0.22841490805149078, + "learning_rate": 2.663953618027869e-06, + "loss": 0.0829, + "step": 6130 + }, + { + "epoch": 1.5396198004893658, + "grad_norm": 0.34173956513404846, + "learning_rate": 2.6501556829312492e-06, + "loss": 0.0851, + "step": 6135 + }, + { + "epoch": 1.5408745843528453, + "grad_norm": 0.10126471519470215, + "learning_rate": 2.6363881135500567e-06, + "loss": 0.0763, + "step": 6140 + }, + { + "epoch": 1.5421293682163246, + "grad_norm": 0.1869829148054123, + "learning_rate": 2.6226509667646993e-06, + "loss": 0.0882, + "step": 6145 + }, + { + "epoch": 1.5433841520798044, + "grad_norm": 0.08933491259813309, + "learning_rate": 2.6089442993298854e-06, + "loss": 0.0928, + "step": 6150 + }, + { + "epoch": 1.5446389359432837, + "grad_norm": 0.2921507656574249, + "learning_rate": 2.595268167874396e-06, + "loss": 0.0824, + "step": 6155 + }, + { + "epoch": 1.5458937198067633, + "grad_norm": 0.27281445264816284, + "learning_rate": 2.581622628900868e-06, + "loss": 0.0822, + "step": 6160 + }, + { + "epoch": 1.5471485036702428, + "grad_norm": 0.20460334420204163, + "learning_rate": 2.568007738785533e-06, + "loss": 0.0819, + "step": 6165 + }, + { + "epoch": 1.5484032875337224, + "grad_norm": 0.5249515175819397, + "learning_rate": 2.5544235537779962e-06, + "loss": 0.0768, + "step": 6170 + }, + { + "epoch": 1.549658071397202, + "grad_norm": 0.2648284137248993, + "learning_rate": 2.540870130001015e-06, + "loss": 0.0843, + "step": 6175 + }, + { + "epoch": 1.5509128552606812, + "grad_norm": 0.47869521379470825, + "learning_rate": 2.5273475234502565e-06, + "loss": 0.089, + "step": 6180 + }, + { + "epoch": 1.552167639124161, + "grad_norm": 0.3596646785736084, + "learning_rate": 2.5138557899940595e-06, + "loss": 0.0802, + "step": 6185 + }, + { + "epoch": 1.5534224229876403, + "grad_norm": 0.13940221071243286, + "learning_rate": 2.5003949853732135e-06, + "loss": 0.0844, + "step": 6190 + }, + { + "epoch": 1.5546772068511199, + "grad_norm": 0.11128222197294235, + "learning_rate": 2.486965165200733e-06, + "loss": 0.0806, + "step": 6195 + }, + { + "epoch": 1.5559319907145994, + "grad_norm": 0.1166953295469284, + "learning_rate": 2.4735663849616098e-06, + "loss": 0.0896, + "step": 6200 + }, + { + "epoch": 1.557186774578079, + "grad_norm": 0.2891371250152588, + "learning_rate": 2.460198700012608e-06, + "loss": 0.0809, + "step": 6205 + }, + { + "epoch": 1.5584415584415585, + "grad_norm": 0.3150325417518616, + "learning_rate": 2.4468621655820125e-06, + "loss": 0.0852, + "step": 6210 + }, + { + "epoch": 1.5596963423050378, + "grad_norm": 0.4029218554496765, + "learning_rate": 2.433556836769411e-06, + "loss": 0.0751, + "step": 6215 + }, + { + "epoch": 1.5609511261685176, + "grad_norm": 0.19041700661182404, + "learning_rate": 2.420282768545469e-06, + "loss": 0.0852, + "step": 6220 + }, + { + "epoch": 1.562205910031997, + "grad_norm": 0.12898743152618408, + "learning_rate": 2.4070400157517036e-06, + "loss": 0.0909, + "step": 6225 + }, + { + "epoch": 1.5634606938954765, + "grad_norm": 0.07892940193414688, + "learning_rate": 2.3938286331002458e-06, + "loss": 0.0879, + "step": 6230 + }, + { + "epoch": 1.564715477758956, + "grad_norm": 0.21748192608356476, + "learning_rate": 2.380648675173619e-06, + "loss": 0.0877, + "step": 6235 + }, + { + "epoch": 1.5659702616224356, + "grad_norm": 0.13437186181545258, + "learning_rate": 2.367500196424529e-06, + "loss": 0.0812, + "step": 6240 + }, + { + "epoch": 1.5672250454859151, + "grad_norm": 0.21154047548770905, + "learning_rate": 2.3543832511756113e-06, + "loss": 0.0896, + "step": 6245 + }, + { + "epoch": 1.5684798293493944, + "grad_norm": 0.16429923474788666, + "learning_rate": 2.3412978936192343e-06, + "loss": 0.0875, + "step": 6250 + }, + { + "epoch": 1.5697346132128742, + "grad_norm": 0.29867643117904663, + "learning_rate": 2.328244177817254e-06, + "loss": 0.0784, + "step": 6255 + }, + { + "epoch": 1.5709893970763535, + "grad_norm": 0.0878303125500679, + "learning_rate": 2.315222157700797e-06, + "loss": 0.0809, + "step": 6260 + }, + { + "epoch": 1.572244180939833, + "grad_norm": 0.24482104182243347, + "learning_rate": 2.3022318870700533e-06, + "loss": 0.0865, + "step": 6265 + }, + { + "epoch": 1.5734989648033126, + "grad_norm": 0.3083607852458954, + "learning_rate": 2.289273419594027e-06, + "loss": 0.0895, + "step": 6270 + }, + { + "epoch": 1.5747537486667922, + "grad_norm": 0.3820037543773651, + "learning_rate": 2.2763468088103315e-06, + "loss": 0.08, + "step": 6275 + }, + { + "epoch": 1.5760085325302717, + "grad_norm": 0.09033305943012238, + "learning_rate": 2.263452108124968e-06, + "loss": 0.0795, + "step": 6280 + }, + { + "epoch": 1.577263316393751, + "grad_norm": 0.1263839453458786, + "learning_rate": 2.250589370812105e-06, + "loss": 0.0771, + "step": 6285 + }, + { + "epoch": 1.5785181002572308, + "grad_norm": 0.45043227076530457, + "learning_rate": 2.237758650013847e-06, + "loss": 0.075, + "step": 6290 + }, + { + "epoch": 1.5797728841207102, + "grad_norm": 0.10394462198019028, + "learning_rate": 2.2249599987400237e-06, + "loss": 0.0894, + "step": 6295 + }, + { + "epoch": 1.5810276679841897, + "grad_norm": 0.394104540348053, + "learning_rate": 2.2121934698679793e-06, + "loss": 0.0932, + "step": 6300 + }, + { + "epoch": 1.5822824518476692, + "grad_norm": 0.12392072379589081, + "learning_rate": 2.1994591161423327e-06, + "loss": 0.078, + "step": 6305 + }, + { + "epoch": 1.5835372357111488, + "grad_norm": 0.3277107775211334, + "learning_rate": 2.186756990174783e-06, + "loss": 0.0815, + "step": 6310 + }, + { + "epoch": 1.5847920195746283, + "grad_norm": 0.35756587982177734, + "learning_rate": 2.174087144443875e-06, + "loss": 0.0915, + "step": 6315 + }, + { + "epoch": 1.5860468034381077, + "grad_norm": 0.3044070303440094, + "learning_rate": 2.161449631294785e-06, + "loss": 0.0844, + "step": 6320 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.1297033727169037, + "learning_rate": 2.148844502939117e-06, + "loss": 0.0834, + "step": 6325 + }, + { + "epoch": 1.5885563711650668, + "grad_norm": 0.1339099109172821, + "learning_rate": 2.1362718114546777e-06, + "loss": 0.0886, + "step": 6330 + }, + { + "epoch": 1.5898111550285463, + "grad_norm": 0.15922382473945618, + "learning_rate": 2.1237316087852465e-06, + "loss": 0.0965, + "step": 6335 + }, + { + "epoch": 1.5910659388920259, + "grad_norm": 0.19360999763011932, + "learning_rate": 2.111223946740394e-06, + "loss": 0.0896, + "step": 6340 + }, + { + "epoch": 1.5923207227555054, + "grad_norm": 0.11068299412727356, + "learning_rate": 2.0987488769952436e-06, + "loss": 0.0759, + "step": 6345 + }, + { + "epoch": 1.593575506618985, + "grad_norm": 0.0986902266740799, + "learning_rate": 2.0863064510902586e-06, + "loss": 0.0912, + "step": 6350 + }, + { + "epoch": 1.5948302904824643, + "grad_norm": 0.23040059208869934, + "learning_rate": 2.0738967204310455e-06, + "loss": 0.0857, + "step": 6355 + }, + { + "epoch": 1.596085074345944, + "grad_norm": 0.1746888905763626, + "learning_rate": 2.0615197362881234e-06, + "loss": 0.0814, + "step": 6360 + }, + { + "epoch": 1.5973398582094234, + "grad_norm": 0.19930033385753632, + "learning_rate": 2.0491755497967183e-06, + "loss": 0.0848, + "step": 6365 + }, + { + "epoch": 1.598594642072903, + "grad_norm": 0.17192873358726501, + "learning_rate": 2.0368642119565617e-06, + "loss": 0.088, + "step": 6370 + }, + { + "epoch": 1.5998494259363825, + "grad_norm": 0.26457393169403076, + "learning_rate": 2.024585773631671e-06, + "loss": 0.0843, + "step": 6375 + }, + { + "epoch": 1.601104209799862, + "grad_norm": 0.15081310272216797, + "learning_rate": 2.012340285550126e-06, + "loss": 0.0822, + "step": 6380 + }, + { + "epoch": 1.6023589936633416, + "grad_norm": 0.1679522842168808, + "learning_rate": 2.0001277983038904e-06, + "loss": 0.0895, + "step": 6385 + }, + { + "epoch": 1.6036137775268209, + "grad_norm": 0.12190816551446915, + "learning_rate": 1.9879483623485786e-06, + "loss": 0.0764, + "step": 6390 + }, + { + "epoch": 1.6048685613903007, + "grad_norm": 0.30287179350852966, + "learning_rate": 1.975802028003253e-06, + "loss": 0.076, + "step": 6395 + }, + { + "epoch": 1.60612334525378, + "grad_norm": 0.3039589524269104, + "learning_rate": 1.963688845450218e-06, + "loss": 0.0883, + "step": 6400 + }, + { + "epoch": 1.6073781291172595, + "grad_norm": 0.10443169623613358, + "learning_rate": 1.9516088647348164e-06, + "loss": 0.0792, + "step": 6405 + }, + { + "epoch": 1.608632912980739, + "grad_norm": 0.10957269370555878, + "learning_rate": 1.9395621357652117e-06, + "loss": 0.0862, + "step": 6410 + }, + { + "epoch": 1.6098876968442186, + "grad_norm": 0.14673763513565063, + "learning_rate": 1.9275487083121946e-06, + "loss": 0.0808, + "step": 6415 + }, + { + "epoch": 1.6111424807076982, + "grad_norm": 0.32992008328437805, + "learning_rate": 1.9155686320089684e-06, + "loss": 0.0935, + "step": 6420 + }, + { + "epoch": 1.6123972645711775, + "grad_norm": 0.17369329929351807, + "learning_rate": 1.9036219563509439e-06, + "loss": 0.0838, + "step": 6425 + }, + { + "epoch": 1.6136520484346573, + "grad_norm": 0.22150751948356628, + "learning_rate": 1.891708730695544e-06, + "loss": 0.0873, + "step": 6430 + }, + { + "epoch": 1.6149068322981366, + "grad_norm": 0.18616770207881927, + "learning_rate": 1.8798290042619949e-06, + "loss": 0.0934, + "step": 6435 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.314390629529953, + "learning_rate": 1.8679828261311073e-06, + "loss": 0.0769, + "step": 6440 + }, + { + "epoch": 1.6174164000250957, + "grad_norm": 0.35372182726860046, + "learning_rate": 1.8561702452451047e-06, + "loss": 0.0808, + "step": 6445 + }, + { + "epoch": 1.6186711838885752, + "grad_norm": 0.16589049994945526, + "learning_rate": 1.8443913104073984e-06, + "loss": 0.0827, + "step": 6450 + }, + { + "epoch": 1.6199259677520548, + "grad_norm": 0.18396617472171783, + "learning_rate": 1.83264607028239e-06, + "loss": 0.0795, + "step": 6455 + }, + { + "epoch": 1.621180751615534, + "grad_norm": 0.0726623684167862, + "learning_rate": 1.82093457339527e-06, + "loss": 0.0816, + "step": 6460 + }, + { + "epoch": 1.6224355354790139, + "grad_norm": 0.37276962399482727, + "learning_rate": 1.809256868131828e-06, + "loss": 0.0881, + "step": 6465 + }, + { + "epoch": 1.6236903193424932, + "grad_norm": 0.2739810645580292, + "learning_rate": 1.7976130027382332e-06, + "loss": 0.0772, + "step": 6470 + }, + { + "epoch": 1.6249451032059727, + "grad_norm": 0.18522068858146667, + "learning_rate": 1.786003025320856e-06, + "loss": 0.0858, + "step": 6475 + }, + { + "epoch": 1.6261998870694523, + "grad_norm": 0.19277586042881012, + "learning_rate": 1.774426983846058e-06, + "loss": 0.082, + "step": 6480 + }, + { + "epoch": 1.6274546709329318, + "grad_norm": 0.36260509490966797, + "learning_rate": 1.7628849261399839e-06, + "loss": 0.0751, + "step": 6485 + }, + { + "epoch": 1.6287094547964114, + "grad_norm": 0.2178385704755783, + "learning_rate": 1.7513768998883896e-06, + "loss": 0.083, + "step": 6490 + }, + { + "epoch": 1.6299642386598907, + "grad_norm": 0.1693069338798523, + "learning_rate": 1.7399029526364254e-06, + "loss": 0.0827, + "step": 6495 + }, + { + "epoch": 1.6312190225233705, + "grad_norm": 0.2321835607290268, + "learning_rate": 1.7284631317884448e-06, + "loss": 0.0859, + "step": 6500 + }, + { + "epoch": 1.6324738063868498, + "grad_norm": 0.21040475368499756, + "learning_rate": 1.7170574846078037e-06, + "loss": 0.0829, + "step": 6505 + }, + { + "epoch": 1.6337285902503293, + "grad_norm": 0.1375027894973755, + "learning_rate": 1.7056860582166823e-06, + "loss": 0.0771, + "step": 6510 + }, + { + "epoch": 1.634983374113809, + "grad_norm": 0.07659853994846344, + "learning_rate": 1.6943488995958647e-06, + "loss": 0.0859, + "step": 6515 + }, + { + "epoch": 1.6362381579772884, + "grad_norm": 0.13758735358715057, + "learning_rate": 1.6830460555845719e-06, + "loss": 0.0846, + "step": 6520 + }, + { + "epoch": 1.637492941840768, + "grad_norm": 0.2629936933517456, + "learning_rate": 1.6717775728802432e-06, + "loss": 0.083, + "step": 6525 + }, + { + "epoch": 1.6387477257042473, + "grad_norm": 0.184475377202034, + "learning_rate": 1.6605434980383594e-06, + "loss": 0.0836, + "step": 6530 + }, + { + "epoch": 1.640002509567727, + "grad_norm": 0.4641217887401581, + "learning_rate": 1.649343877472248e-06, + "loss": 0.0808, + "step": 6535 + }, + { + "epoch": 1.6412572934312064, + "grad_norm": 0.23128898441791534, + "learning_rate": 1.638178757452894e-06, + "loss": 0.0745, + "step": 6540 + }, + { + "epoch": 1.642512077294686, + "grad_norm": 0.09639477729797363, + "learning_rate": 1.627048184108726e-06, + "loss": 0.0849, + "step": 6545 + }, + { + "epoch": 1.6437668611581655, + "grad_norm": 0.305663138628006, + "learning_rate": 1.6159522034254628e-06, + "loss": 0.0756, + "step": 6550 + }, + { + "epoch": 1.645021645021645, + "grad_norm": 0.14679822325706482, + "learning_rate": 1.604890861245898e-06, + "loss": 0.0698, + "step": 6555 + }, + { + "epoch": 1.6462764288851246, + "grad_norm": 0.16359837353229523, + "learning_rate": 1.593864203269716e-06, + "loss": 0.079, + "step": 6560 + }, + { + "epoch": 1.647531212748604, + "grad_norm": 0.12374599277973175, + "learning_rate": 1.582872275053301e-06, + "loss": 0.0822, + "step": 6565 + }, + { + "epoch": 1.6487859966120837, + "grad_norm": 0.2265704870223999, + "learning_rate": 1.5719151220095596e-06, + "loss": 0.0856, + "step": 6570 + }, + { + "epoch": 1.650040780475563, + "grad_norm": 0.1844184547662735, + "learning_rate": 1.5609927894077193e-06, + "loss": 0.072, + "step": 6575 + }, + { + "epoch": 1.6512955643390426, + "grad_norm": 0.0777512937784195, + "learning_rate": 1.5501053223731532e-06, + "loss": 0.0754, + "step": 6580 + }, + { + "epoch": 1.6525503482025221, + "grad_norm": 0.3023741841316223, + "learning_rate": 1.5392527658871813e-06, + "loss": 0.0788, + "step": 6585 + }, + { + "epoch": 1.6538051320660017, + "grad_norm": 0.42789706587791443, + "learning_rate": 1.5284351647868956e-06, + "loss": 0.0671, + "step": 6590 + }, + { + "epoch": 1.6550599159294812, + "grad_norm": 0.1094786524772644, + "learning_rate": 1.5176525637649708e-06, + "loss": 0.0815, + "step": 6595 + }, + { + "epoch": 1.6563146997929605, + "grad_norm": 0.2907751798629761, + "learning_rate": 1.5069050073694813e-06, + "loss": 0.0796, + "step": 6600 + }, + { + "epoch": 1.6575694836564403, + "grad_norm": 0.11227094382047653, + "learning_rate": 1.4961925400037102e-06, + "loss": 0.0883, + "step": 6605 + }, + { + "epoch": 1.6588242675199196, + "grad_norm": 0.34500670433044434, + "learning_rate": 1.4855152059259737e-06, + "loss": 0.0809, + "step": 6610 + }, + { + "epoch": 1.6600790513833992, + "grad_norm": 0.1734425127506256, + "learning_rate": 1.474873049249439e-06, + "loss": 0.0819, + "step": 6615 + }, + { + "epoch": 1.6613338352468787, + "grad_norm": 0.16895422339439392, + "learning_rate": 1.4642661139419302e-06, + "loss": 0.0874, + "step": 6620 + }, + { + "epoch": 1.6625886191103583, + "grad_norm": 0.3084584176540375, + "learning_rate": 1.453694443825766e-06, + "loss": 0.0811, + "step": 6625 + }, + { + "epoch": 1.6638434029738378, + "grad_norm": 0.09408409893512726, + "learning_rate": 1.4431580825775604e-06, + "loss": 0.0826, + "step": 6630 + }, + { + "epoch": 1.6650981868373171, + "grad_norm": 0.06825320422649384, + "learning_rate": 1.4326570737280488e-06, + "loss": 0.0747, + "step": 6635 + }, + { + "epoch": 1.666352970700797, + "grad_norm": 0.30554118752479553, + "learning_rate": 1.4221914606619135e-06, + "loss": 0.0773, + "step": 6640 + }, + { + "epoch": 1.6676077545642762, + "grad_norm": 0.3845992982387543, + "learning_rate": 1.4117612866176022e-06, + "loss": 0.0893, + "step": 6645 + }, + { + "epoch": 1.6688625384277558, + "grad_norm": 0.08592917770147324, + "learning_rate": 1.4013665946871347e-06, + "loss": 0.0817, + "step": 6650 + }, + { + "epoch": 1.6701173222912353, + "grad_norm": 0.12979847192764282, + "learning_rate": 1.391007427815949e-06, + "loss": 0.0754, + "step": 6655 + }, + { + "epoch": 1.6713721061547149, + "grad_norm": 0.2696765661239624, + "learning_rate": 1.3806838288027113e-06, + "loss": 0.0782, + "step": 6660 + }, + { + "epoch": 1.6726268900181944, + "grad_norm": 0.20548641681671143, + "learning_rate": 1.3703958402991345e-06, + "loss": 0.0815, + "step": 6665 + }, + { + "epoch": 1.6738816738816737, + "grad_norm": 0.4033859968185425, + "learning_rate": 1.36014350480981e-06, + "loss": 0.0826, + "step": 6670 + }, + { + "epoch": 1.6751364577451535, + "grad_norm": 0.29467886686325073, + "learning_rate": 1.3499268646920317e-06, + "loss": 0.0746, + "step": 6675 + }, + { + "epoch": 1.6763912416086328, + "grad_norm": 0.17417952418327332, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0739, + "step": 6680 + }, + { + "epoch": 1.6776460254721124, + "grad_norm": 0.30542805790901184, + "learning_rate": 1.329600839262728e-06, + "loss": 0.0873, + "step": 6685 + }, + { + "epoch": 1.678900809335592, + "grad_norm": 0.48298323154449463, + "learning_rate": 1.3194915379277195e-06, + "loss": 0.0866, + "step": 6690 + }, + { + "epoch": 1.6801555931990715, + "grad_norm": 0.2085920125246048, + "learning_rate": 1.3094180999169348e-06, + "loss": 0.0801, + "step": 6695 + }, + { + "epoch": 1.681410377062551, + "grad_norm": 0.22730374336242676, + "learning_rate": 1.299380566848557e-06, + "loss": 0.0725, + "step": 6700 + }, + { + "epoch": 1.6826651609260304, + "grad_norm": 0.2080390900373459, + "learning_rate": 1.2893789801924328e-06, + "loss": 0.0912, + "step": 6705 + }, + { + "epoch": 1.6839199447895101, + "grad_norm": 0.2778363525867462, + "learning_rate": 1.2794133812698794e-06, + "loss": 0.0804, + "step": 6710 + }, + { + "epoch": 1.6851747286529895, + "grad_norm": 0.11341708898544312, + "learning_rate": 1.269483811253549e-06, + "loss": 0.0884, + "step": 6715 + }, + { + "epoch": 1.686429512516469, + "grad_norm": 0.11615065485239029, + "learning_rate": 1.259590311167238e-06, + "loss": 0.0817, + "step": 6720 + }, + { + "epoch": 1.6876842963799485, + "grad_norm": 0.13391433656215668, + "learning_rate": 1.2497329218857135e-06, + "loss": 0.0831, + "step": 6725 + }, + { + "epoch": 1.688939080243428, + "grad_norm": 0.36114147305488586, + "learning_rate": 1.2399116841345605e-06, + "loss": 0.0841, + "step": 6730 + }, + { + "epoch": 1.6901938641069076, + "grad_norm": 0.13782304525375366, + "learning_rate": 1.230126638489998e-06, + "loss": 0.0861, + "step": 6735 + }, + { + "epoch": 1.691448647970387, + "grad_norm": 0.7375853061676025, + "learning_rate": 1.2203778253787191e-06, + "loss": 0.0926, + "step": 6740 + }, + { + "epoch": 1.6927034318338667, + "grad_norm": 0.15916559100151062, + "learning_rate": 1.2106652850777257e-06, + "loss": 0.0725, + "step": 6745 + }, + { + "epoch": 1.693958215697346, + "grad_norm": 0.38723546266555786, + "learning_rate": 1.2009890577141625e-06, + "loss": 0.082, + "step": 6750 + }, + { + "epoch": 1.6952129995608256, + "grad_norm": 0.2894856035709381, + "learning_rate": 1.1913491832651359e-06, + "loss": 0.0908, + "step": 6755 + }, + { + "epoch": 1.6964677834243052, + "grad_norm": 0.10918894410133362, + "learning_rate": 1.181745701557574e-06, + "loss": 0.079, + "step": 6760 + }, + { + "epoch": 1.6977225672877847, + "grad_norm": 0.27660104632377625, + "learning_rate": 1.1721786522680445e-06, + "loss": 0.0791, + "step": 6765 + }, + { + "epoch": 1.6989773511512642, + "grad_norm": 0.13653279840946198, + "learning_rate": 1.1626480749225932e-06, + "loss": 0.0804, + "step": 6770 + }, + { + "epoch": 1.7002321350147436, + "grad_norm": 0.29900994896888733, + "learning_rate": 1.1531540088965842e-06, + "loss": 0.0865, + "step": 6775 + }, + { + "epoch": 1.7014869188782233, + "grad_norm": 0.10758146643638611, + "learning_rate": 1.143696493414539e-06, + "loss": 0.0898, + "step": 6780 + }, + { + "epoch": 1.7027417027417027, + "grad_norm": 0.40997231006622314, + "learning_rate": 1.134275567549965e-06, + "loss": 0.0932, + "step": 6785 + }, + { + "epoch": 1.7039964866051822, + "grad_norm": 0.1141999289393425, + "learning_rate": 1.124891270225208e-06, + "loss": 0.0903, + "step": 6790 + }, + { + "epoch": 1.7052512704686618, + "grad_norm": 0.6266759037971497, + "learning_rate": 1.1155436402112785e-06, + "loss": 0.0825, + "step": 6795 + }, + { + "epoch": 1.7065060543321413, + "grad_norm": 0.16094879806041718, + "learning_rate": 1.1062327161276965e-06, + "loss": 0.0833, + "step": 6800 + }, + { + "epoch": 1.7077608381956209, + "grad_norm": 0.2038365751504898, + "learning_rate": 1.0969585364423352e-06, + "loss": 0.091, + "step": 6805 + }, + { + "epoch": 1.7090156220591002, + "grad_norm": 0.1327473223209381, + "learning_rate": 1.0877211394712617e-06, + "loss": 0.0799, + "step": 6810 + }, + { + "epoch": 1.71027040592258, + "grad_norm": 0.3634199798107147, + "learning_rate": 1.0785205633785666e-06, + "loss": 0.0812, + "step": 6815 + }, + { + "epoch": 1.7115251897860593, + "grad_norm": 0.27833735942840576, + "learning_rate": 1.0693568461762238e-06, + "loss": 0.0825, + "step": 6820 + }, + { + "epoch": 1.7127799736495388, + "grad_norm": 0.3433382511138916, + "learning_rate": 1.0602300257239262e-06, + "loss": 0.0731, + "step": 6825 + }, + { + "epoch": 1.7140347575130184, + "grad_norm": 0.5607882738113403, + "learning_rate": 1.0511401397289233e-06, + "loss": 0.0769, + "step": 6830 + }, + { + "epoch": 1.715289541376498, + "grad_norm": 0.3127841353416443, + "learning_rate": 1.0420872257458725e-06, + "loss": 0.0901, + "step": 6835 + }, + { + "epoch": 1.7165443252399775, + "grad_norm": 0.10875175893306732, + "learning_rate": 1.0330713211766864e-06, + "loss": 0.0897, + "step": 6840 + }, + { + "epoch": 1.7177991091034568, + "grad_norm": 0.5205060839653015, + "learning_rate": 1.0240924632703676e-06, + "loss": 0.0774, + "step": 6845 + }, + { + "epoch": 1.7190538929669366, + "grad_norm": 0.15961715579032898, + "learning_rate": 1.0151506891228636e-06, + "loss": 0.0765, + "step": 6850 + }, + { + "epoch": 1.7203086768304159, + "grad_norm": 0.6552174687385559, + "learning_rate": 1.0062460356769189e-06, + "loss": 0.0854, + "step": 6855 + }, + { + "epoch": 1.7215634606938954, + "grad_norm": 0.10555896908044815, + "learning_rate": 9.973785397218982e-07, + "loss": 0.0886, + "step": 6860 + }, + { + "epoch": 1.722818244557375, + "grad_norm": 0.21054576337337494, + "learning_rate": 9.88548237893664e-07, + "loss": 0.0841, + "step": 6865 + }, + { + "epoch": 1.7240730284208545, + "grad_norm": 0.10787925869226456, + "learning_rate": 9.79755166674411e-07, + "loss": 0.076, + "step": 6870 + }, + { + "epoch": 1.725327812284334, + "grad_norm": 0.24701017141342163, + "learning_rate": 9.709993623925118e-07, + "loss": 0.0804, + "step": 6875 + }, + { + "epoch": 1.7265825961478134, + "grad_norm": 0.2187194675207138, + "learning_rate": 9.622808612223722e-07, + "loss": 0.0794, + "step": 6880 + }, + { + "epoch": 1.7278373800112932, + "grad_norm": 0.16389504075050354, + "learning_rate": 9.535996991842855e-07, + "loss": 0.09, + "step": 6885 + }, + { + "epoch": 1.7290921638747725, + "grad_norm": 0.33097338676452637, + "learning_rate": 9.449559121442731e-07, + "loss": 0.0903, + "step": 6890 + }, + { + "epoch": 1.730346947738252, + "grad_norm": 0.1447478085756302, + "learning_rate": 9.363495358139485e-07, + "loss": 0.0778, + "step": 6895 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.24518853425979614, + "learning_rate": 9.277806057503592e-07, + "loss": 0.0972, + "step": 6900 + }, + { + "epoch": 1.7328565154652111, + "grad_norm": 0.09704507142305374, + "learning_rate": 9.192491573558438e-07, + "loss": 0.0884, + "step": 6905 + }, + { + "epoch": 1.7341112993286907, + "grad_norm": 0.1697137951850891, + "learning_rate": 9.107552258778907e-07, + "loss": 0.0799, + "step": 6910 + }, + { + "epoch": 1.73536608319217, + "grad_norm": 0.11926918476819992, + "learning_rate": 9.022988464089888e-07, + "loss": 0.0726, + "step": 6915 + }, + { + "epoch": 1.7366208670556498, + "grad_norm": 0.16987641155719757, + "learning_rate": 8.9388005388647e-07, + "loss": 0.0835, + "step": 6920 + }, + { + "epoch": 1.737875650919129, + "grad_norm": 0.24412445724010468, + "learning_rate": 8.854988830923905e-07, + "loss": 0.0727, + "step": 6925 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.33428534865379333, + "learning_rate": 8.771553686533684e-07, + "loss": 0.0844, + "step": 6930 + }, + { + "epoch": 1.7403852186460882, + "grad_norm": 0.4698134958744049, + "learning_rate": 8.688495450404444e-07, + "loss": 0.0815, + "step": 6935 + }, + { + "epoch": 1.7416400025095677, + "grad_norm": 0.3183957040309906, + "learning_rate": 8.605814465689366e-07, + "loss": 0.0779, + "step": 6940 + }, + { + "epoch": 1.7428947863730473, + "grad_norm": 0.1109415665268898, + "learning_rate": 8.523511073983127e-07, + "loss": 0.0782, + "step": 6945 + }, + { + "epoch": 1.7441495702365266, + "grad_norm": 0.14376573264598846, + "learning_rate": 8.441585615320269e-07, + "loss": 0.0865, + "step": 6950 + }, + { + "epoch": 1.7454043541000064, + "grad_norm": 0.19789855182170868, + "learning_rate": 8.360038428174022e-07, + "loss": 0.0932, + "step": 6955 + }, + { + "epoch": 1.7466591379634857, + "grad_norm": 0.2379181683063507, + "learning_rate": 8.278869849454718e-07, + "loss": 0.0809, + "step": 6960 + }, + { + "epoch": 1.7479139218269653, + "grad_norm": 0.3969587981700897, + "learning_rate": 8.198080214508486e-07, + "loss": 0.0843, + "step": 6965 + }, + { + "epoch": 1.7491687056904448, + "grad_norm": 0.23574315011501312, + "learning_rate": 8.117669857115895e-07, + "loss": 0.0853, + "step": 6970 + }, + { + "epoch": 1.7504234895539243, + "grad_norm": 0.3272697925567627, + "learning_rate": 8.037639109490524e-07, + "loss": 0.0818, + "step": 6975 + }, + { + "epoch": 1.751678273417404, + "grad_norm": 0.2460939586162567, + "learning_rate": 7.957988302277597e-07, + "loss": 0.0811, + "step": 6980 + }, + { + "epoch": 1.7529330572808832, + "grad_norm": 0.3471406102180481, + "learning_rate": 7.87871776455259e-07, + "loss": 0.0651, + "step": 6985 + }, + { + "epoch": 1.754187841144363, + "grad_norm": 0.22290903329849243, + "learning_rate": 7.799827823819972e-07, + "loss": 0.0793, + "step": 6990 + }, + { + "epoch": 1.7554426250078423, + "grad_norm": 0.1208643913269043, + "learning_rate": 7.721318806011713e-07, + "loss": 0.0765, + "step": 6995 + }, + { + "epoch": 1.7566974088713219, + "grad_norm": 0.176291823387146, + "learning_rate": 7.643191035486086e-07, + "loss": 0.0892, + "step": 7000 + }, + { + "epoch": 1.7579521927348014, + "grad_norm": 0.518677294254303, + "learning_rate": 7.56544483502617e-07, + "loss": 0.0762, + "step": 7005 + }, + { + "epoch": 1.759206976598281, + "grad_norm": 0.24120575189590454, + "learning_rate": 7.488080525838636e-07, + "loss": 0.0979, + "step": 7010 + }, + { + "epoch": 1.7604617604617605, + "grad_norm": 0.16398218274116516, + "learning_rate": 7.411098427552377e-07, + "loss": 0.0815, + "step": 7015 + }, + { + "epoch": 1.7617165443252398, + "grad_norm": 0.15948547422885895, + "learning_rate": 7.334498858217231e-07, + "loss": 0.082, + "step": 7020 + }, + { + "epoch": 1.7629713281887196, + "grad_norm": 0.163728266954422, + "learning_rate": 7.258282134302519e-07, + "loss": 0.0888, + "step": 7025 + }, + { + "epoch": 1.764226112052199, + "grad_norm": 0.18428277969360352, + "learning_rate": 7.182448570695944e-07, + "loss": 0.0796, + "step": 7030 + }, + { + "epoch": 1.7654808959156785, + "grad_norm": 0.2015131711959839, + "learning_rate": 7.106998480702165e-07, + "loss": 0.0741, + "step": 7035 + }, + { + "epoch": 1.766735679779158, + "grad_norm": 0.24829888343811035, + "learning_rate": 7.031932176041522e-07, + "loss": 0.0785, + "step": 7040 + }, + { + "epoch": 1.7679904636426376, + "grad_norm": 0.15142977237701416, + "learning_rate": 6.957249966848711e-07, + "loss": 0.0837, + "step": 7045 + }, + { + "epoch": 1.7692452475061171, + "grad_norm": 0.25044482946395874, + "learning_rate": 6.882952161671652e-07, + "loss": 0.083, + "step": 7050 + }, + { + "epoch": 1.7705000313695964, + "grad_norm": 0.27821311354637146, + "learning_rate": 6.809039067469991e-07, + "loss": 0.0851, + "step": 7055 + }, + { + "epoch": 1.7717548152330762, + "grad_norm": 0.21314631402492523, + "learning_rate": 6.735510989614047e-07, + "loss": 0.0891, + "step": 7060 + }, + { + "epoch": 1.7730095990965555, + "grad_norm": 0.24855484068393707, + "learning_rate": 6.662368231883388e-07, + "loss": 0.0828, + "step": 7065 + }, + { + "epoch": 1.774264382960035, + "grad_norm": 0.09157968312501907, + "learning_rate": 6.589611096465642e-07, + "loss": 0.0827, + "step": 7070 + }, + { + "epoch": 1.7755191668235146, + "grad_norm": 0.17919617891311646, + "learning_rate": 6.517239883955295e-07, + "loss": 0.082, + "step": 7075 + }, + { + "epoch": 1.7767739506869942, + "grad_norm": 0.32990872859954834, + "learning_rate": 6.445254893352381e-07, + "loss": 0.0847, + "step": 7080 + }, + { + "epoch": 1.7780287345504737, + "grad_norm": 0.20937122404575348, + "learning_rate": 6.373656422061247e-07, + "loss": 0.0729, + "step": 7085 + }, + { + "epoch": 1.779283518413953, + "grad_norm": 0.34432610869407654, + "learning_rate": 6.302444765889337e-07, + "loss": 0.0836, + "step": 7090 + }, + { + "epoch": 1.7805383022774328, + "grad_norm": 0.23893016576766968, + "learning_rate": 6.23162021904603e-07, + "loss": 0.085, + "step": 7095 + }, + { + "epoch": 1.7817930861409121, + "grad_norm": 0.26055169105529785, + "learning_rate": 6.161183074141319e-07, + "loss": 0.0836, + "step": 7100 + }, + { + "epoch": 1.7830478700043917, + "grad_norm": 0.3493628203868866, + "learning_rate": 6.091133622184664e-07, + "loss": 0.0879, + "step": 7105 + }, + { + "epoch": 1.7843026538678712, + "grad_norm": 0.15807025134563446, + "learning_rate": 6.021472152583818e-07, + "loss": 0.0846, + "step": 7110 + }, + { + "epoch": 1.7855574377313508, + "grad_norm": 0.19779103994369507, + "learning_rate": 5.952198953143539e-07, + "loss": 0.0914, + "step": 7115 + }, + { + "epoch": 1.7868122215948303, + "grad_norm": 0.2763902246952057, + "learning_rate": 5.883314310064492e-07, + "loss": 0.0752, + "step": 7120 + }, + { + "epoch": 1.7880670054583097, + "grad_norm": 0.22704683244228363, + "learning_rate": 5.814818507942055e-07, + "loss": 0.0785, + "step": 7125 + }, + { + "epoch": 1.7893217893217894, + "grad_norm": 0.3757963478565216, + "learning_rate": 5.746711829765017e-07, + "loss": 0.0814, + "step": 7130 + }, + { + "epoch": 1.7905765731852687, + "grad_norm": 0.1545080542564392, + "learning_rate": 5.678994556914618e-07, + "loss": 0.0743, + "step": 7135 + }, + { + "epoch": 1.7918313570487485, + "grad_norm": 0.18506070971488953, + "learning_rate": 5.611666969163243e-07, + "loss": 0.0831, + "step": 7140 + }, + { + "epoch": 1.7930861409122278, + "grad_norm": 0.17606933414936066, + "learning_rate": 5.544729344673294e-07, + "loss": 0.0753, + "step": 7145 + }, + { + "epoch": 1.7943409247757074, + "grad_norm": 0.30049043893814087, + "learning_rate": 5.47818195999602e-07, + "loss": 0.0893, + "step": 7150 + }, + { + "epoch": 1.795595708639187, + "grad_norm": 0.07360345870256424, + "learning_rate": 5.412025090070483e-07, + "loss": 0.0791, + "step": 7155 + }, + { + "epoch": 1.7968504925026663, + "grad_norm": 0.21283096075057983, + "learning_rate": 5.346259008222243e-07, + "loss": 0.0813, + "step": 7160 + }, + { + "epoch": 1.798105276366146, + "grad_norm": 0.38110825419425964, + "learning_rate": 5.280883986162433e-07, + "loss": 0.0791, + "step": 7165 + }, + { + "epoch": 1.7993600602296254, + "grad_norm": 0.12914220988750458, + "learning_rate": 5.215900293986431e-07, + "loss": 0.0758, + "step": 7170 + }, + { + "epoch": 1.8006148440931051, + "grad_norm": 0.5188528299331665, + "learning_rate": 5.151308200172911e-07, + "loss": 0.0823, + "step": 7175 + }, + { + "epoch": 1.8018696279565845, + "grad_norm": 0.23793403804302216, + "learning_rate": 5.087107971582628e-07, + "loss": 0.0786, + "step": 7180 + }, + { + "epoch": 1.803124411820064, + "grad_norm": 0.18523241579532623, + "learning_rate": 5.02329987345741e-07, + "loss": 0.0864, + "step": 7185 + }, + { + "epoch": 1.8043791956835435, + "grad_norm": 0.18162751197814941, + "learning_rate": 4.959884169418949e-07, + "loss": 0.079, + "step": 7190 + }, + { + "epoch": 1.8056339795470229, + "grad_norm": 0.15588027238845825, + "learning_rate": 4.896861121467778e-07, + "loss": 0.0791, + "step": 7195 + }, + { + "epoch": 1.8068887634105026, + "grad_norm": 0.16926927864551544, + "learning_rate": 4.834230989982214e-07, + "loss": 0.0762, + "step": 7200 + }, + { + "epoch": 1.808143547273982, + "grad_norm": 0.29264163970947266, + "learning_rate": 4.77199403371722e-07, + "loss": 0.0805, + "step": 7205 + }, + { + "epoch": 1.8093983311374617, + "grad_norm": 0.15263986587524414, + "learning_rate": 4.7101505098033575e-07, + "loss": 0.0893, + "step": 7210 + }, + { + "epoch": 1.810653115000941, + "grad_norm": 0.12115020304918289, + "learning_rate": 4.6487006737457765e-07, + "loss": 0.0773, + "step": 7215 + }, + { + "epoch": 1.8119078988644206, + "grad_norm": 0.08451762050390244, + "learning_rate": 4.5876447794230504e-07, + "loss": 0.0772, + "step": 7220 + }, + { + "epoch": 1.8131626827279002, + "grad_norm": 0.2606160640716553, + "learning_rate": 4.5269830790862444e-07, + "loss": 0.0787, + "step": 7225 + }, + { + "epoch": 1.8144174665913795, + "grad_norm": 0.18426910042762756, + "learning_rate": 4.4667158233577925e-07, + "loss": 0.0788, + "step": 7230 + }, + { + "epoch": 1.8156722504548592, + "grad_norm": 0.1966276913881302, + "learning_rate": 4.40684326123052e-07, + "loss": 0.0955, + "step": 7235 + }, + { + "epoch": 1.8169270343183386, + "grad_norm": 0.19326919317245483, + "learning_rate": 4.3473656400665256e-07, + "loss": 0.0984, + "step": 7240 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.2253500372171402, + "learning_rate": 4.2882832055962885e-07, + "loss": 0.0747, + "step": 7245 + }, + { + "epoch": 1.8194366020452977, + "grad_norm": 0.28359681367874146, + "learning_rate": 4.22959620191753e-07, + "loss": 0.0751, + "step": 7250 + }, + { + "epoch": 1.8206913859087772, + "grad_norm": 0.12297849357128143, + "learning_rate": 4.171304871494264e-07, + "loss": 0.0708, + "step": 7255 + }, + { + "epoch": 1.8219461697722568, + "grad_norm": 0.049460649490356445, + "learning_rate": 4.113409455155837e-07, + "loss": 0.0758, + "step": 7260 + }, + { + "epoch": 1.823200953635736, + "grad_norm": 0.1316637098789215, + "learning_rate": 4.0559101920958243e-07, + "loss": 0.075, + "step": 7265 + }, + { + "epoch": 1.8244557374992159, + "grad_norm": 0.11034820973873138, + "learning_rate": 3.9988073198711564e-07, + "loss": 0.0816, + "step": 7270 + }, + { + "epoch": 1.8257105213626952, + "grad_norm": 0.5154989957809448, + "learning_rate": 3.942101074401028e-07, + "loss": 0.0787, + "step": 7275 + }, + { + "epoch": 1.826965305226175, + "grad_norm": 0.4702956974506378, + "learning_rate": 3.885791689966023e-07, + "loss": 0.0862, + "step": 7280 + }, + { + "epoch": 1.8282200890896543, + "grad_norm": 0.41101551055908203, + "learning_rate": 3.8298793992070814e-07, + "loss": 0.0888, + "step": 7285 + }, + { + "epoch": 1.8294748729531338, + "grad_norm": 0.16913031041622162, + "learning_rate": 3.774364433124578e-07, + "loss": 0.0761, + "step": 7290 + }, + { + "epoch": 1.8307296568166134, + "grad_norm": 0.1570875644683838, + "learning_rate": 3.7192470210773435e-07, + "loss": 0.0713, + "step": 7295 + }, + { + "epoch": 1.8319844406800927, + "grad_norm": 0.45383045077323914, + "learning_rate": 3.6645273907816805e-07, + "loss": 0.0774, + "step": 7300 + }, + { + "epoch": 1.8332392245435725, + "grad_norm": 0.12889161705970764, + "learning_rate": 3.6102057683105596e-07, + "loss": 0.0824, + "step": 7305 + }, + { + "epoch": 1.8344940084070518, + "grad_norm": 0.34759390354156494, + "learning_rate": 3.5562823780924906e-07, + "loss": 0.0912, + "step": 7310 + }, + { + "epoch": 1.8357487922705316, + "grad_norm": 0.35057494044303894, + "learning_rate": 3.5027574429107536e-07, + "loss": 0.0732, + "step": 7315 + }, + { + "epoch": 1.8370035761340109, + "grad_norm": 0.1586838960647583, + "learning_rate": 3.4496311839024133e-07, + "loss": 0.0896, + "step": 7320 + }, + { + "epoch": 1.8382583599974904, + "grad_norm": 0.13935531675815582, + "learning_rate": 3.396903820557385e-07, + "loss": 0.0796, + "step": 7325 + }, + { + "epoch": 1.83951314386097, + "grad_norm": 0.20520001649856567, + "learning_rate": 3.344575570717612e-07, + "loss": 0.0839, + "step": 7330 + }, + { + "epoch": 1.8407679277244493, + "grad_norm": 0.13597147166728973, + "learning_rate": 3.292646650576037e-07, + "loss": 0.0852, + "step": 7335 + }, + { + "epoch": 1.842022711587929, + "grad_norm": 0.1146201342344284, + "learning_rate": 3.2411172746758424e-07, + "loss": 0.0845, + "step": 7340 + }, + { + "epoch": 1.8432774954514084, + "grad_norm": 0.24954962730407715, + "learning_rate": 3.1899876559094657e-07, + "loss": 0.0705, + "step": 7345 + }, + { + "epoch": 1.8445322793148882, + "grad_norm": 0.43199557065963745, + "learning_rate": 3.1392580055177867e-07, + "loss": 0.0894, + "step": 7350 + }, + { + "epoch": 1.8457870631783675, + "grad_norm": 0.3313222825527191, + "learning_rate": 3.0889285330891973e-07, + "loss": 0.0888, + "step": 7355 + }, + { + "epoch": 1.847041847041847, + "grad_norm": 0.08865600824356079, + "learning_rate": 3.038999446558755e-07, + "loss": 0.086, + "step": 7360 + }, + { + "epoch": 1.8482966309053266, + "grad_norm": 0.21259349584579468, + "learning_rate": 2.989470952207385e-07, + "loss": 0.0911, + "step": 7365 + }, + { + "epoch": 1.849551414768806, + "grad_norm": 0.28370821475982666, + "learning_rate": 2.940343254660905e-07, + "loss": 0.0883, + "step": 7370 + }, + { + "epoch": 1.8508061986322857, + "grad_norm": 0.1723158210515976, + "learning_rate": 2.891616556889321e-07, + "loss": 0.0745, + "step": 7375 + }, + { + "epoch": 1.852060982495765, + "grad_norm": 0.2103600651025772, + "learning_rate": 2.843291060205855e-07, + "loss": 0.0821, + "step": 7380 + }, + { + "epoch": 1.8533157663592448, + "grad_norm": 0.19018976390361786, + "learning_rate": 2.7953669642662107e-07, + "loss": 0.0885, + "step": 7385 + }, + { + "epoch": 1.854570550222724, + "grad_norm": 0.38891276717185974, + "learning_rate": 2.747844467067706e-07, + "loss": 0.0832, + "step": 7390 + }, + { + "epoch": 1.8558253340862036, + "grad_norm": 0.17385374009609222, + "learning_rate": 2.7007237649484763e-07, + "loss": 0.0788, + "step": 7395 + }, + { + "epoch": 1.8570801179496832, + "grad_norm": 0.453713059425354, + "learning_rate": 2.654005052586628e-07, + "loss": 0.0777, + "step": 7400 + }, + { + "epoch": 1.8583349018131625, + "grad_norm": 0.20061570405960083, + "learning_rate": 2.607688522999441e-07, + "loss": 0.0742, + "step": 7405 + }, + { + "epoch": 1.8595896856766423, + "grad_norm": 0.1047121062874794, + "learning_rate": 2.5617743675426354e-07, + "loss": 0.0917, + "step": 7410 + }, + { + "epoch": 1.8608444695401216, + "grad_norm": 0.24845577776432037, + "learning_rate": 2.516262775909506e-07, + "loss": 0.0806, + "step": 7415 + }, + { + "epoch": 1.8620992534036014, + "grad_norm": 0.3548727035522461, + "learning_rate": 2.471153936130133e-07, + "loss": 0.0913, + "step": 7420 + }, + { + "epoch": 1.8633540372670807, + "grad_norm": 0.2546403706073761, + "learning_rate": 2.4264480345707053e-07, + "loss": 0.0855, + "step": 7425 + }, + { + "epoch": 1.8646088211305603, + "grad_norm": 0.2284100353717804, + "learning_rate": 2.3821452559326218e-07, + "loss": 0.0849, + "step": 7430 + }, + { + "epoch": 1.8658636049940398, + "grad_norm": 0.25266894698143005, + "learning_rate": 2.3382457832518134e-07, + "loss": 0.0778, + "step": 7435 + }, + { + "epoch": 1.8671183888575191, + "grad_norm": 0.19911006093025208, + "learning_rate": 2.294749797897955e-07, + "loss": 0.0781, + "step": 7440 + }, + { + "epoch": 1.868373172720999, + "grad_norm": 0.35023069381713867, + "learning_rate": 2.2516574795737323e-07, + "loss": 0.0787, + "step": 7445 + }, + { + "epoch": 1.8696279565844782, + "grad_norm": 0.19626636803150177, + "learning_rate": 2.2089690063140766e-07, + "loss": 0.0833, + "step": 7450 + }, + { + "epoch": 1.870882740447958, + "grad_norm": 0.1876339167356491, + "learning_rate": 2.1666845544854542e-07, + "loss": 0.084, + "step": 7455 + }, + { + "epoch": 1.8721375243114373, + "grad_norm": 0.09253226220607758, + "learning_rate": 2.1248042987851325e-07, + "loss": 0.0875, + "step": 7460 + }, + { + "epoch": 1.8733923081749169, + "grad_norm": 0.31013625860214233, + "learning_rate": 2.083328412240404e-07, + "loss": 0.083, + "step": 7465 + }, + { + "epoch": 1.8746470920383964, + "grad_norm": 0.17299607396125793, + "learning_rate": 2.0422570662079866e-07, + "loss": 0.0887, + "step": 7470 + }, + { + "epoch": 1.8759018759018757, + "grad_norm": 0.1518164724111557, + "learning_rate": 2.0015904303732126e-07, + "loss": 0.0711, + "step": 7475 + }, + { + "epoch": 1.8771566597653555, + "grad_norm": 0.16225028038024902, + "learning_rate": 1.961328672749352e-07, + "loss": 0.0743, + "step": 7480 + }, + { + "epoch": 1.8784114436288348, + "grad_norm": 0.39483585953712463, + "learning_rate": 1.921471959676957e-07, + "loss": 0.0856, + "step": 7485 + }, + { + "epoch": 1.8796662274923146, + "grad_norm": 0.43932509422302246, + "learning_rate": 1.8820204558231415e-07, + "loss": 0.0813, + "step": 7490 + }, + { + "epoch": 1.880921011355794, + "grad_norm": 0.29697397351264954, + "learning_rate": 1.8429743241808795e-07, + "loss": 0.079, + "step": 7495 + }, + { + "epoch": 1.8821757952192735, + "grad_norm": 0.1230466440320015, + "learning_rate": 1.804333726068408e-07, + "loss": 0.0918, + "step": 7500 + }, + { + "epoch": 1.883430579082753, + "grad_norm": 0.18823686242103577, + "learning_rate": 1.766098821128459e-07, + "loss": 0.0755, + "step": 7505 + }, + { + "epoch": 1.8846853629462323, + "grad_norm": 0.3333011865615845, + "learning_rate": 1.7282697673276837e-07, + "loss": 0.0908, + "step": 7510 + }, + { + "epoch": 1.8859401468097121, + "grad_norm": 0.15476582944393158, + "learning_rate": 1.6908467209559853e-07, + "loss": 0.0869, + "step": 7515 + }, + { + "epoch": 1.8871949306731914, + "grad_norm": 0.20987029373645782, + "learning_rate": 1.6538298366257975e-07, + "loss": 0.074, + "step": 7520 + }, + { + "epoch": 1.8884497145366712, + "grad_norm": 0.13596096634864807, + "learning_rate": 1.6172192672715525e-07, + "loss": 0.0886, + "step": 7525 + }, + { + "epoch": 1.8897044984001505, + "grad_norm": 0.22909042239189148, + "learning_rate": 1.5810151641489912e-07, + "loss": 0.0846, + "step": 7530 + }, + { + "epoch": 1.89095928226363, + "grad_norm": 0.24221235513687134, + "learning_rate": 1.545217676834554e-07, + "loss": 0.0815, + "step": 7535 + }, + { + "epoch": 1.8922140661271096, + "grad_norm": 0.14234666526317596, + "learning_rate": 1.5098269532247357e-07, + "loss": 0.0863, + "step": 7540 + }, + { + "epoch": 1.8934688499905892, + "grad_norm": 0.24849990010261536, + "learning_rate": 1.4748431395355088e-07, + "loss": 0.0819, + "step": 7545 + }, + { + "epoch": 1.8947236338540687, + "grad_norm": 0.2965373992919922, + "learning_rate": 1.4402663803017249e-07, + "loss": 0.0795, + "step": 7550 + }, + { + "epoch": 1.895978417717548, + "grad_norm": 0.3867776691913605, + "learning_rate": 1.4060968183764678e-07, + "loss": 0.0904, + "step": 7555 + }, + { + "epoch": 1.8972332015810278, + "grad_norm": 0.31307804584503174, + "learning_rate": 1.3723345949305245e-07, + "loss": 0.0798, + "step": 7560 + }, + { + "epoch": 1.8984879854445071, + "grad_norm": 0.3106546700000763, + "learning_rate": 1.338979849451738e-07, + "loss": 0.0864, + "step": 7565 + }, + { + "epoch": 1.8997427693079867, + "grad_norm": 0.22093643248081207, + "learning_rate": 1.3060327197444767e-07, + "loss": 0.0776, + "step": 7570 + }, + { + "epoch": 1.9009975531714662, + "grad_norm": 0.1310010552406311, + "learning_rate": 1.2734933419290996e-07, + "loss": 0.077, + "step": 7575 + }, + { + "epoch": 1.9022523370349458, + "grad_norm": 0.23409627377986908, + "learning_rate": 1.2413618504412806e-07, + "loss": 0.0777, + "step": 7580 + }, + { + "epoch": 1.9035071208984253, + "grad_norm": 0.2928149402141571, + "learning_rate": 1.2096383780315411e-07, + "loss": 0.0899, + "step": 7585 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.26420190930366516, + "learning_rate": 1.1783230557647075e-07, + "loss": 0.0875, + "step": 7590 + }, + { + "epoch": 1.9060166886253844, + "grad_norm": 0.17748254537582397, + "learning_rate": 1.1474160130193313e-07, + "loss": 0.0878, + "step": 7595 + }, + { + "epoch": 1.9072714724888638, + "grad_norm": 0.41803818941116333, + "learning_rate": 1.1169173774871478e-07, + "loss": 0.0881, + "step": 7600 + }, + { + "epoch": 1.9085262563523433, + "grad_norm": 0.22346888482570648, + "learning_rate": 1.086827275172575e-07, + "loss": 0.0794, + "step": 7605 + }, + { + "epoch": 1.9097810402158228, + "grad_norm": 0.1962645947933197, + "learning_rate": 1.0571458303922033e-07, + "loss": 0.0811, + "step": 7610 + }, + { + "epoch": 1.9110358240793024, + "grad_norm": 0.3406698703765869, + "learning_rate": 1.0278731657742292e-07, + "loss": 0.0816, + "step": 7615 + }, + { + "epoch": 1.912290607942782, + "grad_norm": 0.14449894428253174, + "learning_rate": 9.990094022580332e-08, + "loss": 0.0856, + "step": 7620 + }, + { + "epoch": 1.9135453918062613, + "grad_norm": 0.1097446084022522, + "learning_rate": 9.70554659093581e-08, + "loss": 0.0911, + "step": 7625 + }, + { + "epoch": 1.914800175669741, + "grad_norm": 0.43302279710769653, + "learning_rate": 9.425090538409898e-08, + "loss": 0.0877, + "step": 7630 + }, + { + "epoch": 1.9160549595332204, + "grad_norm": 0.26026082038879395, + "learning_rate": 9.148727023700731e-08, + "loss": 0.0803, + "step": 7635 + }, + { + "epoch": 1.9173097433967, + "grad_norm": 0.2453165352344513, + "learning_rate": 8.876457188597642e-08, + "loss": 0.0843, + "step": 7640 + }, + { + "epoch": 1.9185645272601795, + "grad_norm": 0.4708387851715088, + "learning_rate": 8.608282157977488e-08, + "loss": 0.0788, + "step": 7645 + }, + { + "epoch": 1.919819311123659, + "grad_norm": 0.12131650745868683, + "learning_rate": 8.344203039799214e-08, + "loss": 0.0806, + "step": 7650 + }, + { + "epoch": 1.9210740949871385, + "grad_norm": 0.21256065368652344, + "learning_rate": 8.084220925099751e-08, + "loss": 0.087, + "step": 7655 + }, + { + "epoch": 1.9223288788506179, + "grad_norm": 0.2987827658653259, + "learning_rate": 7.82833688798934e-08, + "loss": 0.0807, + "step": 7660 + }, + { + "epoch": 1.9235836627140976, + "grad_norm": 0.12467091530561447, + "learning_rate": 7.576551985647107e-08, + "loss": 0.0995, + "step": 7665 + }, + { + "epoch": 1.924838446577577, + "grad_norm": 0.15592101216316223, + "learning_rate": 7.328867258316608e-08, + "loss": 0.0881, + "step": 7670 + }, + { + "epoch": 1.9260932304410565, + "grad_norm": 0.1511273980140686, + "learning_rate": 7.085283729301728e-08, + "loss": 0.0847, + "step": 7675 + }, + { + "epoch": 1.927348014304536, + "grad_norm": 0.13429687917232513, + "learning_rate": 6.845802404962243e-08, + "loss": 0.0766, + "step": 7680 + }, + { + "epoch": 1.9286027981680156, + "grad_norm": 0.10453824698925018, + "learning_rate": 6.610424274710037e-08, + "loss": 0.077, + "step": 7685 + }, + { + "epoch": 1.9298575820314952, + "grad_norm": 0.18403221666812897, + "learning_rate": 6.379150311004224e-08, + "loss": 0.0802, + "step": 7690 + }, + { + "epoch": 1.9311123658949745, + "grad_norm": 0.09399021416902542, + "learning_rate": 6.151981469348034e-08, + "loss": 0.0947, + "step": 7695 + }, + { + "epoch": 1.9323671497584543, + "grad_norm": 0.19107823073863983, + "learning_rate": 5.928918688284602e-08, + "loss": 0.0799, + "step": 7700 + }, + { + "epoch": 1.9336219336219336, + "grad_norm": 0.18196965754032135, + "learning_rate": 5.709962889392628e-08, + "loss": 0.0696, + "step": 7705 + }, + { + "epoch": 1.9348767174854131, + "grad_norm": 0.32829707860946655, + "learning_rate": 5.495114977282945e-08, + "loss": 0.0825, + "step": 7710 + }, + { + "epoch": 1.9361315013488927, + "grad_norm": 0.21754246950149536, + "learning_rate": 5.284375839594958e-08, + "loss": 0.0936, + "step": 7715 + }, + { + "epoch": 1.9373862852123722, + "grad_norm": 0.13130666315555573, + "learning_rate": 5.0777463469925406e-08, + "loss": 0.089, + "step": 7720 + }, + { + "epoch": 1.9386410690758518, + "grad_norm": 0.1840493381023407, + "learning_rate": 4.8752273531609276e-08, + "loss": 0.0847, + "step": 7725 + }, + { + "epoch": 1.939895852939331, + "grad_norm": 0.12967444956302643, + "learning_rate": 4.676819694802604e-08, + "loss": 0.0782, + "step": 7730 + }, + { + "epoch": 1.9411506368028109, + "grad_norm": 0.1979934573173523, + "learning_rate": 4.4825241916344184e-08, + "loss": 0.0797, + "step": 7735 + }, + { + "epoch": 1.9424054206662902, + "grad_norm": 0.12666776776313782, + "learning_rate": 4.292341646383813e-08, + "loss": 0.0794, + "step": 7740 + }, + { + "epoch": 1.9436602045297697, + "grad_norm": 0.11725469678640366, + "learning_rate": 4.106272844785486e-08, + "loss": 0.0777, + "step": 7745 + }, + { + "epoch": 1.9449149883932493, + "grad_norm": 0.4441820979118347, + "learning_rate": 3.924318555578843e-08, + "loss": 0.0829, + "step": 7750 + }, + { + "epoch": 1.9461697722567288, + "grad_norm": 0.26033830642700195, + "learning_rate": 3.7464795305036664e-08, + "loss": 0.0806, + "step": 7755 + }, + { + "epoch": 1.9474245561202084, + "grad_norm": 0.19797271490097046, + "learning_rate": 3.572756504297892e-08, + "loss": 0.0681, + "step": 7760 + }, + { + "epoch": 1.9486793399836877, + "grad_norm": 0.15105128288269043, + "learning_rate": 3.4031501946942826e-08, + "loss": 0.0868, + "step": 7765 + }, + { + "epoch": 1.9499341238471675, + "grad_norm": 0.2811802625656128, + "learning_rate": 3.2376613024175384e-08, + "loss": 0.0895, + "step": 7770 + }, + { + "epoch": 1.9511889077106468, + "grad_norm": 0.3569127917289734, + "learning_rate": 3.0762905111811904e-08, + "loss": 0.0927, + "step": 7775 + }, + { + "epoch": 1.9524436915741263, + "grad_norm": 0.5091187953948975, + "learning_rate": 2.9190384876849333e-08, + "loss": 0.0864, + "step": 7780 + }, + { + "epoch": 1.9536984754376059, + "grad_norm": 0.228666290640831, + "learning_rate": 2.7659058816121855e-08, + "loss": 0.0897, + "step": 7785 + }, + { + "epoch": 1.9549532593010854, + "grad_norm": 0.07750795036554337, + "learning_rate": 2.616893325626646e-08, + "loss": 0.0733, + "step": 7790 + }, + { + "epoch": 1.956208043164565, + "grad_norm": 0.2381664514541626, + "learning_rate": 2.472001435370297e-08, + "loss": 0.0729, + "step": 7795 + }, + { + "epoch": 1.9574628270280443, + "grad_norm": 0.27068066596984863, + "learning_rate": 2.3312308094607382e-08, + "loss": 0.0957, + "step": 7800 + }, + { + "epoch": 1.958717610891524, + "grad_norm": 0.37700343132019043, + "learning_rate": 2.1945820294888564e-08, + "loss": 0.0766, + "step": 7805 + }, + { + "epoch": 1.9599723947550034, + "grad_norm": 0.32152286171913147, + "learning_rate": 2.062055660015716e-08, + "loss": 0.0764, + "step": 7810 + }, + { + "epoch": 1.961227178618483, + "grad_norm": 0.164317324757576, + "learning_rate": 1.9336522485710053e-08, + "loss": 0.0825, + "step": 7815 + }, + { + "epoch": 1.9624819624819625, + "grad_norm": 0.1482664942741394, + "learning_rate": 1.8093723256507044e-08, + "loss": 0.0761, + "step": 7820 + }, + { + "epoch": 1.963736746345442, + "grad_norm": 0.15529005229473114, + "learning_rate": 1.689216404714311e-08, + "loss": 0.0879, + "step": 7825 + }, + { + "epoch": 1.9649915302089216, + "grad_norm": 0.2088499665260315, + "learning_rate": 1.5731849821833955e-08, + "loss": 0.0852, + "step": 7830 + }, + { + "epoch": 1.966246314072401, + "grad_norm": 0.2643318176269531, + "learning_rate": 1.4612785374392701e-08, + "loss": 0.0798, + "step": 7835 + }, + { + "epoch": 1.9675010979358807, + "grad_norm": 0.16332080960273743, + "learning_rate": 1.3534975328205468e-08, + "loss": 0.0802, + "step": 7840 + }, + { + "epoch": 1.96875588179936, + "grad_norm": 0.1658352017402649, + "learning_rate": 1.2498424136223597e-08, + "loss": 0.0821, + "step": 7845 + }, + { + "epoch": 1.9700106656628396, + "grad_norm": 0.35251694917678833, + "learning_rate": 1.1503136080932565e-08, + "loss": 0.0822, + "step": 7850 + }, + { + "epoch": 1.971265449526319, + "grad_norm": 0.1039208248257637, + "learning_rate": 1.0549115274344213e-08, + "loss": 0.0847, + "step": 7855 + }, + { + "epoch": 1.9725202333897986, + "grad_norm": 0.3832782208919525, + "learning_rate": 9.636365657971215e-09, + "loss": 0.0732, + "step": 7860 + }, + { + "epoch": 1.9737750172532782, + "grad_norm": 0.35436856746673584, + "learning_rate": 8.764891002821519e-09, + "loss": 0.0796, + "step": 7865 + }, + { + "epoch": 1.9750298011167575, + "grad_norm": 0.3917001187801361, + "learning_rate": 7.93469490936949e-09, + "loss": 0.0747, + "step": 7870 + }, + { + "epoch": 1.9762845849802373, + "grad_norm": 0.20635858178138733, + "learning_rate": 7.145780807553681e-09, + "loss": 0.0804, + "step": 7875 + }, + { + "epoch": 1.9775393688437166, + "grad_norm": 0.12219113856554031, + "learning_rate": 6.398151956754639e-09, + "loss": 0.0854, + "step": 7880 + }, + { + "epoch": 1.9787941527071962, + "grad_norm": 0.08329229801893234, + "learning_rate": 5.69181144578268e-09, + "loss": 0.0863, + "step": 7885 + }, + { + "epoch": 1.9800489365706757, + "grad_norm": 0.2005588710308075, + "learning_rate": 5.026762192870127e-09, + "loss": 0.077, + "step": 7890 + }, + { + "epoch": 1.9813037204341553, + "grad_norm": 0.1955815702676773, + "learning_rate": 4.403006945650212e-09, + "loss": 0.0843, + "step": 7895 + }, + { + "epoch": 1.9825585042976348, + "grad_norm": 0.17901791632175446, + "learning_rate": 3.820548281154857e-09, + "loss": 0.0731, + "step": 7900 + }, + { + "epoch": 1.9838132881611141, + "grad_norm": 0.1416298747062683, + "learning_rate": 3.2793886057991277e-09, + "loss": 0.0729, + "step": 7905 + }, + { + "epoch": 1.985068072024594, + "grad_norm": 0.3370745778083801, + "learning_rate": 2.7795301553712463e-09, + "loss": 0.0794, + "step": 7910 + }, + { + "epoch": 1.9863228558880732, + "grad_norm": 0.16576644778251648, + "learning_rate": 2.3209749950259264e-09, + "loss": 0.075, + "step": 7915 + }, + { + "epoch": 1.9875776397515528, + "grad_norm": 0.2831018567085266, + "learning_rate": 1.9037250192732728e-09, + "loss": 0.0754, + "step": 7920 + }, + { + "epoch": 1.9888324236150323, + "grad_norm": 0.1539755016565323, + "learning_rate": 1.527781951971008e-09, + "loss": 0.0859, + "step": 7925 + }, + { + "epoch": 1.9900872074785119, + "grad_norm": 0.18794752657413483, + "learning_rate": 1.1931473463200339e-09, + "loss": 0.0924, + "step": 7930 + }, + { + "epoch": 1.9913419913419914, + "grad_norm": 0.1152065321803093, + "learning_rate": 8.998225848566577e-10, + "loss": 0.0793, + "step": 7935 + }, + { + "epoch": 1.9925967752054707, + "grad_norm": 0.36054694652557373, + "learning_rate": 6.478088794448223e-10, + "loss": 0.0732, + "step": 7940 + }, + { + "epoch": 1.9938515590689505, + "grad_norm": 0.05416973680257797, + "learning_rate": 4.3710727127277417e-10, + "loss": 0.0812, + "step": 7945 + }, + { + "epoch": 1.9951063429324298, + "grad_norm": 0.19745150208473206, + "learning_rate": 2.677186308497337e-10, + "loss": 0.0805, + "step": 7950 + }, + { + "epoch": 1.9963611267959094, + "grad_norm": 0.11429372429847717, + "learning_rate": 1.3964365800145374e-10, + "loss": 0.0796, + "step": 7955 + }, + { + "epoch": 1.997615910659389, + "grad_norm": 0.16175100207328796, + "learning_rate": 5.288288186688917e-11, + "loss": 0.0791, + "step": 7960 + }, + { + "epoch": 1.9988706945228685, + "grad_norm": 0.3085331618785858, + "learning_rate": 7.436660894866165e-12, + "loss": 0.0807, + "step": 7965 + }, + { + "epoch": 1.9996235648409562, + "step": 7968, + "total_flos": 0.0, + "train_loss": 0.0878100987938962, + "train_runtime": 323838.4314, + "train_samples_per_second": 1.575, + "train_steps_per_second": 0.025 + } + ], + "logging_steps": 5, + "max_steps": 7968, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}