{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 2955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01692047377326565, "grad_norm": 1.2259714603424072, "learning_rate": 6.7567567567567575e-06, "loss": 1.5997, "step": 10 }, { "epoch": 0.0338409475465313, "grad_norm": 1.427184820175171, "learning_rate": 1.3513513513513515e-05, "loss": 1.5776, "step": 20 }, { "epoch": 0.050761421319796954, "grad_norm": 1.1350350379943848, "learning_rate": 2.0270270270270273e-05, "loss": 1.4915, "step": 30 }, { "epoch": 0.0676818950930626, "grad_norm": 0.7067303657531738, "learning_rate": 2.702702702702703e-05, "loss": 1.3124, "step": 40 }, { "epoch": 0.08460236886632826, "grad_norm": 0.6259365677833557, "learning_rate": 3.3783783783783784e-05, "loss": 1.199, "step": 50 }, { "epoch": 0.10152284263959391, "grad_norm": 0.4322277009487152, "learning_rate": 4.0540540540540545e-05, "loss": 1.0944, "step": 60 }, { "epoch": 0.11844331641285956, "grad_norm": 0.5546955466270447, "learning_rate": 4.72972972972973e-05, "loss": 1.1023, "step": 70 }, { "epoch": 0.1353637901861252, "grad_norm": 0.5008642673492432, "learning_rate": 5.405405405405406e-05, "loss": 1.1058, "step": 80 }, { "epoch": 0.15228426395939088, "grad_norm": 0.43932950496673584, "learning_rate": 6.0810810810810814e-05, "loss": 1.0681, "step": 90 }, { "epoch": 0.1692047377326565, "grad_norm": 0.48154979944229126, "learning_rate": 6.756756756756757e-05, "loss": 1.0741, "step": 100 }, { "epoch": 0.18612521150592218, "grad_norm": 0.6456084251403809, "learning_rate": 7.432432432432433e-05, "loss": 1.088, "step": 110 }, { "epoch": 0.20304568527918782, "grad_norm": 0.4756552577018738, "learning_rate": 8.108108108108109e-05, "loss": 1.0637, "step": 120 }, { "epoch": 0.21996615905245348, "grad_norm": 0.6161950826644897, "learning_rate": 8.783783783783784e-05, "loss": 1.0807, "step": 130 }, { "epoch": 0.23688663282571912, "grad_norm": 0.5582019686698914, "learning_rate": 9.45945945945946e-05, "loss": 1.0755, "step": 140 }, { "epoch": 0.25380710659898476, "grad_norm": 0.5398248434066772, "learning_rate": 0.00010135135135135136, "loss": 1.0935, "step": 150 }, { "epoch": 0.2707275803722504, "grad_norm": 0.4599798023700714, "learning_rate": 0.00010810810810810812, "loss": 1.0533, "step": 160 }, { "epoch": 0.2876480541455161, "grad_norm": 0.5725008249282837, "learning_rate": 0.00011486486486486487, "loss": 1.0489, "step": 170 }, { "epoch": 0.30456852791878175, "grad_norm": 0.6546281576156616, "learning_rate": 0.00012162162162162163, "loss": 1.0251, "step": 180 }, { "epoch": 0.32148900169204736, "grad_norm": 0.6009621024131775, "learning_rate": 0.0001283783783783784, "loss": 1.0235, "step": 190 }, { "epoch": 0.338409475465313, "grad_norm": 0.5415365695953369, "learning_rate": 0.00013513513513513514, "loss": 1.061, "step": 200 }, { "epoch": 0.3553299492385787, "grad_norm": 0.5579381585121155, "learning_rate": 0.00014189189189189188, "loss": 1.0309, "step": 210 }, { "epoch": 0.37225042301184436, "grad_norm": 0.4756716191768646, "learning_rate": 0.00014864864864864866, "loss": 1.0496, "step": 220 }, { "epoch": 0.38917089678510997, "grad_norm": 0.5466011166572571, "learning_rate": 0.0001554054054054054, "loss": 1.0235, "step": 230 }, { "epoch": 0.40609137055837563, "grad_norm": 0.49792057275772095, "learning_rate": 0.00016216216216216218, "loss": 1.044, "step": 240 }, { "epoch": 0.4230118443316413, "grad_norm": 0.5117172598838806, "learning_rate": 0.00016891891891891893, "loss": 1.0268, "step": 250 }, { "epoch": 0.43993231810490696, "grad_norm": 0.44799599051475525, "learning_rate": 0.00017567567567567568, "loss": 1.0259, "step": 260 }, { "epoch": 0.45685279187817257, "grad_norm": 0.6032038927078247, "learning_rate": 0.00018243243243243245, "loss": 1.0602, "step": 270 }, { "epoch": 0.47377326565143824, "grad_norm": 0.5627055764198303, "learning_rate": 0.0001891891891891892, "loss": 1.0632, "step": 280 }, { "epoch": 0.4906937394247039, "grad_norm": 0.5613240599632263, "learning_rate": 0.00019594594594594594, "loss": 1.0256, "step": 290 }, { "epoch": 0.5076142131979695, "grad_norm": 1.3261655569076538, "learning_rate": 0.00019999888325954442, "loss": 1.0541, "step": 300 }, { "epoch": 0.5245346869712352, "grad_norm": 0.5617411136627197, "learning_rate": 0.0001999863202158626, "loss": 1.0307, "step": 310 }, { "epoch": 0.5414551607445008, "grad_norm": 0.5320747494697571, "learning_rate": 0.00019995979996246553, "loss": 1.0644, "step": 320 }, { "epoch": 0.5583756345177665, "grad_norm": 0.5442279577255249, "learning_rate": 0.00019991932620134706, "loss": 1.0447, "step": 330 }, { "epoch": 0.5752961082910322, "grad_norm": 0.4834861159324646, "learning_rate": 0.00019986490458228775, "loss": 1.0272, "step": 340 }, { "epoch": 0.5922165820642978, "grad_norm": 0.48604169487953186, "learning_rate": 0.00019979654270206636, "loss": 1.0696, "step": 350 }, { "epoch": 0.6091370558375635, "grad_norm": 0.4888402223587036, "learning_rate": 0.00019971425010339923, "loss": 1.0821, "step": 360 }, { "epoch": 0.626057529610829, "grad_norm": 0.700931191444397, "learning_rate": 0.00019961803827360847, "loss": 1.0182, "step": 370 }, { "epoch": 0.6429780033840947, "grad_norm": 0.5287917256355286, "learning_rate": 0.00019950792064301812, "loss": 1.0469, "step": 380 }, { "epoch": 0.6598984771573604, "grad_norm": 0.4720386266708374, "learning_rate": 0.0001993839125830796, "loss": 1.0235, "step": 390 }, { "epoch": 0.676818950930626, "grad_norm": 0.5388404726982117, "learning_rate": 0.00019924603140422596, "loss": 1.0548, "step": 400 }, { "epoch": 0.6937394247038917, "grad_norm": 0.34490785002708435, "learning_rate": 0.0001990942963534554, "loss": 1.0723, "step": 410 }, { "epoch": 0.7106598984771574, "grad_norm": 0.4453361928462982, "learning_rate": 0.00019892872861164467, "loss": 1.0667, "step": 420 }, { "epoch": 0.727580372250423, "grad_norm": 0.4319419264793396, "learning_rate": 0.0001987493512905924, "loss": 1.0545, "step": 430 }, { "epoch": 0.7445008460236887, "grad_norm": 0.43239086866378784, "learning_rate": 0.00019855618942979272, "loss": 0.9922, "step": 440 }, { "epoch": 0.7614213197969543, "grad_norm": 0.5506799817085266, "learning_rate": 0.0001983492699929403, "loss": 1.0453, "step": 450 }, { "epoch": 0.7783417935702199, "grad_norm": 0.41813233494758606, "learning_rate": 0.0001981286218641662, "loss": 1.0118, "step": 460 }, { "epoch": 0.7952622673434856, "grad_norm": 0.4467175304889679, "learning_rate": 0.00019789427584400584, "loss": 0.9767, "step": 470 }, { "epoch": 0.8121827411167513, "grad_norm": 0.4833202660083771, "learning_rate": 0.00019764626464509978, "loss": 0.9934, "step": 480 }, { "epoch": 0.8291032148900169, "grad_norm": 0.6021937727928162, "learning_rate": 0.0001973846228876271, "loss": 1.0241, "step": 490 }, { "epoch": 0.8460236886632826, "grad_norm": 0.6700023412704468, "learning_rate": 0.00019710938709447288, "loss": 1.0248, "step": 500 }, { "epoch": 0.8629441624365483, "grad_norm": 0.6895177960395813, "learning_rate": 0.00019682059568612982, "loss": 1.0287, "step": 510 }, { "epoch": 0.8798646362098139, "grad_norm": 0.41973355412483215, "learning_rate": 0.0001965182889753351, "loss": 1.0231, "step": 520 }, { "epoch": 0.8967851099830795, "grad_norm": 0.4463100731372833, "learning_rate": 0.00019620250916144313, "loss": 1.0407, "step": 530 }, { "epoch": 0.9137055837563451, "grad_norm": 0.506348192691803, "learning_rate": 0.00019587330032453483, "loss": 1.0269, "step": 540 }, { "epoch": 0.9306260575296108, "grad_norm": 0.38442033529281616, "learning_rate": 0.00019553070841926443, "loss": 1.0483, "step": 550 }, { "epoch": 0.9475465313028765, "grad_norm": 0.4524074196815491, "learning_rate": 0.0001951747812684447, "loss": 1.0466, "step": 560 }, { "epoch": 0.9644670050761421, "grad_norm": 0.38016098737716675, "learning_rate": 0.00019480556855637127, "loss": 0.9867, "step": 570 }, { "epoch": 0.9813874788494078, "grad_norm": 0.47468554973602295, "learning_rate": 0.00019442312182188696, "loss": 1.0135, "step": 580 }, { "epoch": 0.9983079526226735, "grad_norm": 0.453453928232193, "learning_rate": 0.00019402749445118772, "loss": 1.029, "step": 590 }, { "epoch": 1.015228426395939, "grad_norm": 0.5637357831001282, "learning_rate": 0.0001936187416703702, "loss": 0.9363, "step": 600 }, { "epoch": 1.0321489001692048, "grad_norm": 0.45976507663726807, "learning_rate": 0.00019319692053772265, "loss": 0.9843, "step": 610 }, { "epoch": 1.0490693739424704, "grad_norm": 0.4763481914997101, "learning_rate": 0.0001927620899357602, "loss": 0.9561, "step": 620 }, { "epoch": 1.0659898477157361, "grad_norm": 0.5057137608528137, "learning_rate": 0.0001923143105630053, "loss": 0.9741, "step": 630 }, { "epoch": 1.0829103214890017, "grad_norm": 0.48306360840797424, "learning_rate": 0.0001918536449255147, "loss": 0.9392, "step": 640 }, { "epoch": 1.0998307952622675, "grad_norm": 0.4458140432834625, "learning_rate": 0.00019138015732815438, "loss": 0.9463, "step": 650 }, { "epoch": 1.116751269035533, "grad_norm": 0.6166822910308838, "learning_rate": 0.00019089391386562283, "loss": 0.9376, "step": 660 }, { "epoch": 1.1336717428087986, "grad_norm": 0.4895755350589752, "learning_rate": 0.00019039498241322505, "loss": 0.9752, "step": 670 }, { "epoch": 1.1505922165820643, "grad_norm": 0.4752449095249176, "learning_rate": 0.00018988343261739767, "loss": 0.9088, "step": 680 }, { "epoch": 1.16751269035533, "grad_norm": 0.5145062208175659, "learning_rate": 0.0001893593358859869, "loss": 0.9918, "step": 690 }, { "epoch": 1.1844331641285957, "grad_norm": 0.5580092668533325, "learning_rate": 0.00018882276537828072, "loss": 0.9644, "step": 700 }, { "epoch": 1.2013536379018612, "grad_norm": 0.46470576524734497, "learning_rate": 0.0001882737959947964, "loss": 0.9361, "step": 710 }, { "epoch": 1.218274111675127, "grad_norm": 0.4674146771430969, "learning_rate": 0.00018771250436682503, "loss": 0.9318, "step": 720 }, { "epoch": 1.2351945854483926, "grad_norm": 0.5492033958435059, "learning_rate": 0.00018713896884573457, "loss": 0.9439, "step": 730 }, { "epoch": 1.252115059221658, "grad_norm": 0.5646327137947083, "learning_rate": 0.00018655326949203259, "loss": 0.9292, "step": 740 }, { "epoch": 1.2690355329949239, "grad_norm": 0.5472354888916016, "learning_rate": 0.0001859554880641905, "loss": 0.9083, "step": 750 }, { "epoch": 1.2859560067681894, "grad_norm": 0.45085784792900085, "learning_rate": 0.000185345708007231, "loss": 0.98, "step": 760 }, { "epoch": 1.3028764805414552, "grad_norm": 0.6279247999191284, "learning_rate": 0.00018472401444107964, "loss": 0.9245, "step": 770 }, { "epoch": 1.3197969543147208, "grad_norm": 0.5216350555419922, "learning_rate": 0.00018409049414868297, "loss": 0.9901, "step": 780 }, { "epoch": 1.3367174280879865, "grad_norm": 0.5439789295196533, "learning_rate": 0.00018344523556389433, "loss": 0.9325, "step": 790 }, { "epoch": 1.353637901861252, "grad_norm": 0.691190242767334, "learning_rate": 0.0001827883287591293, "loss": 0.9836, "step": 800 }, { "epoch": 1.3705583756345177, "grad_norm": 0.6296900510787964, "learning_rate": 0.00018211986543279244, "loss": 0.9223, "step": 810 }, { "epoch": 1.3874788494077834, "grad_norm": 0.5122337341308594, "learning_rate": 0.00018143993889647688, "loss": 0.9759, "step": 820 }, { "epoch": 1.404399323181049, "grad_norm": 0.5923859477043152, "learning_rate": 0.0001807486440619389, "loss": 0.9678, "step": 830 }, { "epoch": 1.4213197969543148, "grad_norm": 0.5583952069282532, "learning_rate": 0.00018004607742784916, "loss": 0.9936, "step": 840 }, { "epoch": 1.4382402707275803, "grad_norm": 0.5816633105278015, "learning_rate": 0.0001793323370663222, "loss": 0.9496, "step": 850 }, { "epoch": 1.455160744500846, "grad_norm": 0.4728400707244873, "learning_rate": 0.00017860752260922652, "loss": 0.9335, "step": 860 }, { "epoch": 1.4720812182741116, "grad_norm": 0.6433700919151306, "learning_rate": 0.00017787173523427688, "loss": 0.9713, "step": 870 }, { "epoch": 1.4890016920473772, "grad_norm": 0.4833492934703827, "learning_rate": 0.0001771250776509106, "loss": 0.9575, "step": 880 }, { "epoch": 1.505922165820643, "grad_norm": 0.5448775887489319, "learning_rate": 0.00017636765408595055, "loss": 0.9599, "step": 890 }, { "epoch": 1.5228426395939088, "grad_norm": 0.6575478911399841, "learning_rate": 0.00017559957026905563, "loss": 0.963, "step": 900 }, { "epoch": 1.5397631133671743, "grad_norm": 0.49584710597991943, "learning_rate": 0.00017482093341796218, "loss": 0.936, "step": 910 }, { "epoch": 1.5566835871404399, "grad_norm": 0.538006603717804, "learning_rate": 0.00017403185222351704, "loss": 0.9529, "step": 920 }, { "epoch": 1.5736040609137056, "grad_norm": 0.5086967945098877, "learning_rate": 0.00017323243683450552, "loss": 0.9807, "step": 930 }, { "epoch": 1.5905245346869712, "grad_norm": 0.6087446808815002, "learning_rate": 0.00017242279884227535, "loss": 0.957, "step": 940 }, { "epoch": 1.6074450084602367, "grad_norm": 0.6025837063789368, "learning_rate": 0.00017160305126515972, "loss": 0.9622, "step": 950 }, { "epoch": 1.6243654822335025, "grad_norm": 0.49924030900001526, "learning_rate": 0.00017077330853270087, "loss": 0.9724, "step": 960 }, { "epoch": 1.6412859560067683, "grad_norm": 0.5102123022079468, "learning_rate": 0.00016993368646967658, "loss": 0.966, "step": 970 }, { "epoch": 1.6582064297800339, "grad_norm": 0.6529399752616882, "learning_rate": 0.00016908430227993227, "loss": 0.9161, "step": 980 }, { "epoch": 1.6751269035532994, "grad_norm": 0.5354600548744202, "learning_rate": 0.00016822527453002023, "loss": 0.972, "step": 990 }, { "epoch": 1.6920473773265652, "grad_norm": 0.6311420202255249, "learning_rate": 0.00016735672313264883, "loss": 0.9322, "step": 1000 }, { "epoch": 1.6920473773265652, "eval_loss": 1.0022894144058228, "eval_runtime": 9.3099, "eval_samples_per_second": 106.983, "eval_steps_per_second": 13.427, "step": 1000 }, { "epoch": 1.708967851099831, "grad_norm": 0.5729072690010071, "learning_rate": 0.00016647876932994373, "loss": 0.9525, "step": 1010 }, { "epoch": 1.7258883248730963, "grad_norm": 0.8473700284957886, "learning_rate": 0.00016559153567652363, "loss": 0.9479, "step": 1020 }, { "epoch": 1.742808798646362, "grad_norm": 0.6889095306396484, "learning_rate": 0.00016469514602239252, "loss": 0.9369, "step": 1030 }, { "epoch": 1.7597292724196278, "grad_norm": 0.4434736967086792, "learning_rate": 0.0001637897254956517, "loss": 0.947, "step": 1040 }, { "epoch": 1.7766497461928934, "grad_norm": 0.925630509853363, "learning_rate": 0.00016287540048503244, "loss": 0.9714, "step": 1050 }, { "epoch": 1.793570219966159, "grad_norm": 0.7930585741996765, "learning_rate": 0.00016195229862225378, "loss": 0.9665, "step": 1060 }, { "epoch": 1.8104906937394247, "grad_norm": 0.5998470187187195, "learning_rate": 0.00016102054876420592, "loss": 0.9476, "step": 1070 }, { "epoch": 1.8274111675126905, "grad_norm": 0.6296101212501526, "learning_rate": 0.00016008028097496308, "loss": 0.997, "step": 1080 }, { "epoch": 1.844331641285956, "grad_norm": 0.6930395364761353, "learning_rate": 0.0001591316265076276, "loss": 0.9438, "step": 1090 }, { "epoch": 1.8612521150592216, "grad_norm": 0.6003424525260925, "learning_rate": 0.0001581747177860082, "loss": 0.9711, "step": 1100 }, { "epoch": 1.8781725888324874, "grad_norm": 0.48814520239830017, "learning_rate": 0.00015720968838613497, "loss": 0.9653, "step": 1110 }, { "epoch": 1.895093062605753, "grad_norm": 0.5773620009422302, "learning_rate": 0.00015623667301761294, "loss": 0.9782, "step": 1120 }, { "epoch": 1.9120135363790185, "grad_norm": 0.6531692147254944, "learning_rate": 0.0001552558075048182, "loss": 0.9546, "step": 1130 }, { "epoch": 1.9289340101522843, "grad_norm": 0.5369117259979248, "learning_rate": 0.00015426722876793779, "loss": 0.9598, "step": 1140 }, { "epoch": 1.94585448392555, "grad_norm": 0.635637640953064, "learning_rate": 0.0001532710748038568, "loss": 0.9992, "step": 1150 }, { "epoch": 1.9627749576988156, "grad_norm": 0.7120236754417419, "learning_rate": 0.00015226748466689552, "loss": 0.98, "step": 1160 }, { "epoch": 1.9796954314720812, "grad_norm": 0.49258938431739807, "learning_rate": 0.00015125659844939833, "loss": 0.9703, "step": 1170 }, { "epoch": 1.996615905245347, "grad_norm": 0.6356224417686462, "learning_rate": 0.0001502385572621783, "loss": 1.0121, "step": 1180 }, { "epoch": 2.0135363790186127, "grad_norm": 0.5743930339813232, "learning_rate": 0.00014921350321481905, "loss": 0.8652, "step": 1190 }, { "epoch": 2.030456852791878, "grad_norm": 0.9692409634590149, "learning_rate": 0.00014818157939583803, "loss": 0.796, "step": 1200 }, { "epoch": 2.047377326565144, "grad_norm": 0.7314022779464722, "learning_rate": 0.00014714292985271206, "loss": 0.808, "step": 1210 }, { "epoch": 2.0642978003384096, "grad_norm": 0.7976405024528503, "learning_rate": 0.00014609769957176993, "loss": 0.8067, "step": 1220 }, { "epoch": 2.081218274111675, "grad_norm": 0.54390549659729, "learning_rate": 0.0001450460344579534, "loss": 0.8239, "step": 1230 }, { "epoch": 2.0981387478849407, "grad_norm": 0.7505801916122437, "learning_rate": 0.00014398808131445032, "loss": 0.8165, "step": 1240 }, { "epoch": 2.1150592216582065, "grad_norm": 0.6022424697875977, "learning_rate": 0.00014292398782220203, "loss": 0.8244, "step": 1250 }, { "epoch": 2.1319796954314723, "grad_norm": 0.6155591607093811, "learning_rate": 0.00014185390251928844, "loss": 0.8392, "step": 1260 }, { "epoch": 2.1489001692047376, "grad_norm": 0.7514758110046387, "learning_rate": 0.0001407779747801936, "loss": 0.8271, "step": 1270 }, { "epoch": 2.1658206429780034, "grad_norm": 0.6205560564994812, "learning_rate": 0.00013969635479495408, "loss": 0.8004, "step": 1280 }, { "epoch": 2.182741116751269, "grad_norm": 0.6635140180587769, "learning_rate": 0.0001386091935481939, "loss": 0.8035, "step": 1290 }, { "epoch": 2.199661590524535, "grad_norm": 0.5528069138526917, "learning_rate": 0.00013751664279804842, "loss": 0.8381, "step": 1300 }, { "epoch": 2.2165820642978002, "grad_norm": 0.5508107542991638, "learning_rate": 0.00013641885505498016, "loss": 0.8232, "step": 1310 }, { "epoch": 2.233502538071066, "grad_norm": 0.7910854816436768, "learning_rate": 0.0001353159835604898, "loss": 0.802, "step": 1320 }, { "epoch": 2.250423011844332, "grad_norm": 0.8323054909706116, "learning_rate": 0.0001342081822657248, "loss": 0.8026, "step": 1330 }, { "epoch": 2.267343485617597, "grad_norm": 0.622173547744751, "learning_rate": 0.00013309560580998956, "loss": 0.8385, "step": 1340 }, { "epoch": 2.284263959390863, "grad_norm": 0.8506642580032349, "learning_rate": 0.00013197840949915867, "loss": 0.8216, "step": 1350 }, { "epoch": 2.3011844331641287, "grad_norm": 0.9256905317306519, "learning_rate": 0.0001308567492839979, "loss": 0.814, "step": 1360 }, { "epoch": 2.3181049069373945, "grad_norm": 0.9968581795692444, "learning_rate": 0.00012973078173839477, "loss": 0.8147, "step": 1370 }, { "epoch": 2.33502538071066, "grad_norm": 0.6434370279312134, "learning_rate": 0.00012860066403750213, "loss": 0.7992, "step": 1380 }, { "epoch": 2.3519458544839256, "grad_norm": 1.0632697343826294, "learning_rate": 0.00012746655393579802, "loss": 0.8443, "step": 1390 }, { "epoch": 2.3688663282571913, "grad_norm": 0.6085894107818604, "learning_rate": 0.00012632860974506443, "loss": 0.8297, "step": 1400 }, { "epoch": 2.3857868020304567, "grad_norm": 0.7611764073371887, "learning_rate": 0.00012518699031228848, "loss": 0.8222, "step": 1410 }, { "epoch": 2.4027072758037225, "grad_norm": 1.16623854637146, "learning_rate": 0.00012404185499748858, "loss": 0.8006, "step": 1420 }, { "epoch": 2.4196277495769882, "grad_norm": 0.7282375693321228, "learning_rate": 0.00012289336365146943, "loss": 0.7958, "step": 1430 }, { "epoch": 2.436548223350254, "grad_norm": 0.9697285890579224, "learning_rate": 0.00012174167659350805, "loss": 0.858, "step": 1440 }, { "epoch": 2.4534686971235193, "grad_norm": 0.6591439247131348, "learning_rate": 0.0001205869545889748, "loss": 0.8013, "step": 1450 }, { "epoch": 2.470389170896785, "grad_norm": 0.7019118666648865, "learning_rate": 0.00011942935882689177, "loss": 0.8002, "step": 1460 }, { "epoch": 2.487309644670051, "grad_norm": 1.2326512336730957, "learning_rate": 0.00011826905089743228, "loss": 0.8113, "step": 1470 }, { "epoch": 2.504230118443316, "grad_norm": 0.9669883251190186, "learning_rate": 0.00011710619276936441, "loss": 0.8567, "step": 1480 }, { "epoch": 2.521150592216582, "grad_norm": 0.6777768135070801, "learning_rate": 0.0001159409467674414, "loss": 0.8158, "step": 1490 }, { "epoch": 2.5380710659898478, "grad_norm": 0.6380109190940857, "learning_rate": 0.00011477347554974278, "loss": 0.8402, "step": 1500 }, { "epoch": 2.5549915397631136, "grad_norm": 1.0947014093399048, "learning_rate": 0.0001136039420849685, "loss": 0.7928, "step": 1510 }, { "epoch": 2.571912013536379, "grad_norm": 0.6936770081520081, "learning_rate": 0.00011243250962969008, "loss": 0.8269, "step": 1520 }, { "epoch": 2.5888324873096447, "grad_norm": 0.9474750757217407, "learning_rate": 0.0001112593417055614, "loss": 0.8305, "step": 1530 }, { "epoch": 2.6057529610829104, "grad_norm": 0.710216224193573, "learning_rate": 0.00011008460207649242, "loss": 0.8548, "step": 1540 }, { "epoch": 2.6226734348561758, "grad_norm": 0.6556210517883301, "learning_rate": 0.00010890845472578947, "loss": 0.8315, "step": 1550 }, { "epoch": 2.6395939086294415, "grad_norm": 0.6288842558860779, "learning_rate": 0.00010773106383326417, "loss": 0.8224, "step": 1560 }, { "epoch": 2.6565143824027073, "grad_norm": 0.8483306169509888, "learning_rate": 0.00010655259375231583, "loss": 0.8345, "step": 1570 }, { "epoch": 2.673434856175973, "grad_norm": 0.7669579386711121, "learning_rate": 0.00010537320898698882, "loss": 0.8165, "step": 1580 }, { "epoch": 2.6903553299492384, "grad_norm": 0.7949408292770386, "learning_rate": 0.00010419307416900947, "loss": 0.7951, "step": 1590 }, { "epoch": 2.707275803722504, "grad_norm": 0.8271426558494568, "learning_rate": 0.00010301235403480487, "loss": 0.8385, "step": 1600 }, { "epoch": 2.72419627749577, "grad_norm": 0.6712045073509216, "learning_rate": 0.00010183121340250699, "loss": 0.832, "step": 1610 }, { "epoch": 2.7411167512690353, "grad_norm": 0.768624484539032, "learning_rate": 0.00010064981714894582, "loss": 0.8365, "step": 1620 }, { "epoch": 2.758037225042301, "grad_norm": 0.8181334733963013, "learning_rate": 9.946833018663359e-05, "loss": 0.8448, "step": 1630 }, { "epoch": 2.774957698815567, "grad_norm": 0.7336423993110657, "learning_rate": 9.828691744074483e-05, "loss": 0.8259, "step": 1640 }, { "epoch": 2.7918781725888326, "grad_norm": 0.5810146927833557, "learning_rate": 9.710574382609416e-05, "loss": 0.8443, "step": 1650 }, { "epoch": 2.808798646362098, "grad_norm": 0.7470970153808594, "learning_rate": 9.59249742241154e-05, "loss": 0.8428, "step": 1660 }, { "epoch": 2.8257191201353637, "grad_norm": 0.6533593535423279, "learning_rate": 9.474477345984592e-05, "loss": 0.8078, "step": 1670 }, { "epoch": 2.8426395939086295, "grad_norm": 0.8344042897224426, "learning_rate": 9.356530627891827e-05, "loss": 0.8132, "step": 1680 }, { "epoch": 2.859560067681895, "grad_norm": 0.6438416242599487, "learning_rate": 9.238673732456323e-05, "loss": 0.8553, "step": 1690 }, { "epoch": 2.8764805414551606, "grad_norm": 0.9393163323402405, "learning_rate": 9.120923111462715e-05, "loss": 0.8263, "step": 1700 }, { "epoch": 2.8934010152284264, "grad_norm": 0.8571431040763855, "learning_rate": 9.003295201860652e-05, "loss": 0.8161, "step": 1710 }, { "epoch": 2.910321489001692, "grad_norm": 0.7677854895591736, "learning_rate": 8.885806423470356e-05, "loss": 0.8384, "step": 1720 }, { "epoch": 2.927241962774958, "grad_norm": 0.7890878319740295, "learning_rate": 8.76847317669056e-05, "loss": 0.7891, "step": 1730 }, { "epoch": 2.9441624365482233, "grad_norm": 0.6829676628112793, "learning_rate": 8.651311840209145e-05, "loss": 0.7662, "step": 1740 }, { "epoch": 2.961082910321489, "grad_norm": 0.6441112756729126, "learning_rate": 8.534338768716845e-05, "loss": 0.8046, "step": 1750 }, { "epoch": 2.9780033840947544, "grad_norm": 0.7504361271858215, "learning_rate": 8.417570290624246e-05, "loss": 0.8523, "step": 1760 }, { "epoch": 2.99492385786802, "grad_norm": 0.7779954075813293, "learning_rate": 8.301022705782498e-05, "loss": 0.7861, "step": 1770 }, { "epoch": 3.011844331641286, "grad_norm": 0.6992425322532654, "learning_rate": 8.184712283208004e-05, "loss": 0.72, "step": 1780 }, { "epoch": 3.0287648054145517, "grad_norm": 0.8021610379219055, "learning_rate": 8.068655258811404e-05, "loss": 0.7146, "step": 1790 }, { "epoch": 3.045685279187817, "grad_norm": 0.763023316860199, "learning_rate": 7.952867833131176e-05, "loss": 0.6373, "step": 1800 }, { "epoch": 3.062605752961083, "grad_norm": 0.9050746560096741, "learning_rate": 7.837366169072202e-05, "loss": 0.6911, "step": 1810 }, { "epoch": 3.0795262267343486, "grad_norm": 0.9052395820617676, "learning_rate": 7.722166389649548e-05, "loss": 0.6528, "step": 1820 }, { "epoch": 3.0964467005076144, "grad_norm": 0.7168397903442383, "learning_rate": 7.607284575737848e-05, "loss": 0.691, "step": 1830 }, { "epoch": 3.1133671742808797, "grad_norm": 0.8127416372299194, "learning_rate": 7.492736763826553e-05, "loss": 0.6903, "step": 1840 }, { "epoch": 3.1302876480541455, "grad_norm": 0.9594517350196838, "learning_rate": 7.378538943781381e-05, "loss": 0.6682, "step": 1850 }, { "epoch": 3.1472081218274113, "grad_norm": 1.0214790105819702, "learning_rate": 7.264707056612252e-05, "loss": 0.6508, "step": 1860 }, { "epoch": 3.164128595600677, "grad_norm": 1.0813875198364258, "learning_rate": 7.151256992248097e-05, "loss": 0.6635, "step": 1870 }, { "epoch": 3.1810490693739424, "grad_norm": 0.8683750629425049, "learning_rate": 7.038204587318728e-05, "loss": 0.6959, "step": 1880 }, { "epoch": 3.197969543147208, "grad_norm": 0.8788993954658508, "learning_rate": 6.92556562294422e-05, "loss": 0.6487, "step": 1890 }, { "epoch": 3.214890016920474, "grad_norm": 0.8151355981826782, "learning_rate": 6.813355822531984e-05, "loss": 0.6615, "step": 1900 }, { "epoch": 3.2318104906937393, "grad_norm": 0.875066876411438, "learning_rate": 6.701590849581907e-05, "loss": 0.6716, "step": 1910 }, { "epoch": 3.248730964467005, "grad_norm": 0.6827503442764282, "learning_rate": 6.590286305499895e-05, "loss": 0.6533, "step": 1920 }, { "epoch": 3.265651438240271, "grad_norm": 1.032798171043396, "learning_rate": 6.479457727420038e-05, "loss": 0.6691, "step": 1930 }, { "epoch": 3.2825719120135366, "grad_norm": 0.7288352251052856, "learning_rate": 6.369120586035757e-05, "loss": 0.7198, "step": 1940 }, { "epoch": 3.299492385786802, "grad_norm": 1.0082377195358276, "learning_rate": 6.259290283440243e-05, "loss": 0.68, "step": 1950 }, { "epoch": 3.3164128595600677, "grad_norm": 0.7989615797996521, "learning_rate": 6.149982150976453e-05, "loss": 0.6623, "step": 1960 }, { "epoch": 3.3333333333333335, "grad_norm": 0.8861303925514221, "learning_rate": 6.0412114470969925e-05, "loss": 0.7331, "step": 1970 }, { "epoch": 3.350253807106599, "grad_norm": 0.9690698981285095, "learning_rate": 5.932993355234177e-05, "loss": 0.661, "step": 1980 }, { "epoch": 3.3671742808798646, "grad_norm": 0.9235168099403381, "learning_rate": 5.825342981680544e-05, "loss": 0.6648, "step": 1990 }, { "epoch": 3.3840947546531304, "grad_norm": 1.0237386226654053, "learning_rate": 5.718275353480155e-05, "loss": 0.7121, "step": 2000 }, { "epoch": 3.3840947546531304, "eval_loss": 1.1582704782485962, "eval_runtime": 9.4671, "eval_samples_per_second": 105.206, "eval_steps_per_second": 13.204, "step": 2000 }, { "epoch": 3.401015228426396, "grad_norm": 1.019852876663208, "learning_rate": 5.611805416330955e-05, "loss": 0.6862, "step": 2010 }, { "epoch": 3.4179357021996615, "grad_norm": 0.8144916296005249, "learning_rate": 5.505948032498481e-05, "loss": 0.6798, "step": 2020 }, { "epoch": 3.4348561759729273, "grad_norm": 0.7801826596260071, "learning_rate": 5.400717978741223e-05, "loss": 0.6704, "step": 2030 }, { "epoch": 3.451776649746193, "grad_norm": 0.8852076530456543, "learning_rate": 5.296129944247917e-05, "loss": 0.6604, "step": 2040 }, { "epoch": 3.4686971235194584, "grad_norm": 1.0277949571609497, "learning_rate": 5.1921985285870666e-05, "loss": 0.6787, "step": 2050 }, { "epoch": 3.485617597292724, "grad_norm": 0.8751595616340637, "learning_rate": 5.088938239668957e-05, "loss": 0.6816, "step": 2060 }, { "epoch": 3.50253807106599, "grad_norm": 0.8326672315597534, "learning_rate": 4.986363491720508e-05, "loss": 0.6683, "step": 2070 }, { "epoch": 3.5194585448392557, "grad_norm": 0.9706553220748901, "learning_rate": 4.884488603273153e-05, "loss": 0.6619, "step": 2080 }, { "epoch": 3.536379018612521, "grad_norm": 0.9801552891731262, "learning_rate": 4.78332779516409e-05, "loss": 0.6801, "step": 2090 }, { "epoch": 3.553299492385787, "grad_norm": 0.9298548102378845, "learning_rate": 4.682895188551205e-05, "loss": 0.6684, "step": 2100 }, { "epoch": 3.5702199661590526, "grad_norm": 0.9834477305412292, "learning_rate": 4.583204802941861e-05, "loss": 0.6934, "step": 2110 }, { "epoch": 3.587140439932318, "grad_norm": 0.8049277067184448, "learning_rate": 4.4842705542359164e-05, "loss": 0.7124, "step": 2120 }, { "epoch": 3.6040609137055837, "grad_norm": 0.7810288071632385, "learning_rate": 4.386106252783162e-05, "loss": 0.6825, "step": 2130 }, { "epoch": 3.6209813874788495, "grad_norm": 0.8989554047584534, "learning_rate": 4.288725601455543e-05, "loss": 0.6328, "step": 2140 }, { "epoch": 3.6379018612521152, "grad_norm": 1.3646354675292969, "learning_rate": 4.192142193734344e-05, "loss": 0.6594, "step": 2150 }, { "epoch": 3.6548223350253806, "grad_norm": 1.177277684211731, "learning_rate": 4.096369511812669e-05, "loss": 0.6728, "step": 2160 }, { "epoch": 3.6717428087986463, "grad_norm": 1.1561075448989868, "learning_rate": 4.001420924713435e-05, "loss": 0.6511, "step": 2170 }, { "epoch": 3.688663282571912, "grad_norm": 0.9678720831871033, "learning_rate": 3.9073096864231815e-05, "loss": 0.6789, "step": 2180 }, { "epoch": 3.7055837563451774, "grad_norm": 0.9476550817489624, "learning_rate": 3.814048934041934e-05, "loss": 0.6378, "step": 2190 }, { "epoch": 3.7225042301184432, "grad_norm": 0.9431784749031067, "learning_rate": 3.72165168594936e-05, "loss": 0.6823, "step": 2200 }, { "epoch": 3.739424703891709, "grad_norm": 0.92593914270401, "learning_rate": 3.630130839987553e-05, "loss": 0.6861, "step": 2210 }, { "epoch": 3.7563451776649748, "grad_norm": 0.9857144951820374, "learning_rate": 3.539499171660581e-05, "loss": 0.6585, "step": 2220 }, { "epoch": 3.77326565143824, "grad_norm": 0.8839899301528931, "learning_rate": 3.4497693323511326e-05, "loss": 0.6802, "step": 2230 }, { "epoch": 3.790186125211506, "grad_norm": 0.9294420480728149, "learning_rate": 3.3609538475545196e-05, "loss": 0.6914, "step": 2240 }, { "epoch": 3.8071065989847717, "grad_norm": 1.0007585287094116, "learning_rate": 3.273065115130223e-05, "loss": 0.6954, "step": 2250 }, { "epoch": 3.824027072758037, "grad_norm": 0.9174544215202332, "learning_rate": 3.186115403571245e-05, "loss": 0.6564, "step": 2260 }, { "epoch": 3.8409475465313028, "grad_norm": 0.9077910780906677, "learning_rate": 3.10011685029154e-05, "loss": 0.682, "step": 2270 }, { "epoch": 3.8578680203045685, "grad_norm": 1.0111888647079468, "learning_rate": 3.0150814599317556e-05, "loss": 0.7105, "step": 2280 }, { "epoch": 3.8747884940778343, "grad_norm": 1.0113632678985596, "learning_rate": 2.93102110268347e-05, "loss": 0.6659, "step": 2290 }, { "epoch": 3.8917089678511, "grad_norm": 0.9199107885360718, "learning_rate": 2.847947512632232e-05, "loss": 0.7125, "step": 2300 }, { "epoch": 3.9086294416243654, "grad_norm": 0.7901623845100403, "learning_rate": 2.765872286119575e-05, "loss": 0.6582, "step": 2310 }, { "epoch": 3.925549915397631, "grad_norm": 0.873816967010498, "learning_rate": 2.6848068801242797e-05, "loss": 0.71, "step": 2320 }, { "epoch": 3.9424703891708965, "grad_norm": 0.8795329332351685, "learning_rate": 2.6047626106630764e-05, "loss": 0.6864, "step": 2330 }, { "epoch": 3.9593908629441623, "grad_norm": 1.2641881704330444, "learning_rate": 2.5257506512110173e-05, "loss": 0.7093, "step": 2340 }, { "epoch": 3.976311336717428, "grad_norm": 0.9996365904808044, "learning_rate": 2.4477820311417866e-05, "loss": 0.6928, "step": 2350 }, { "epoch": 3.993231810490694, "grad_norm": 1.2021087408065796, "learning_rate": 2.3708676341880665e-05, "loss": 0.6513, "step": 2360 }, { "epoch": 4.01015228426396, "grad_norm": 0.7619489431381226, "learning_rate": 2.295018196922285e-05, "loss": 0.5933, "step": 2370 }, { "epoch": 4.027072758037225, "grad_norm": 0.9575318098068237, "learning_rate": 2.220244307257865e-05, "loss": 0.5914, "step": 2380 }, { "epoch": 4.04399323181049, "grad_norm": 1.0752573013305664, "learning_rate": 2.1465564029712704e-05, "loss": 0.6044, "step": 2390 }, { "epoch": 4.060913705583756, "grad_norm": 0.8898993730545044, "learning_rate": 2.073964770244967e-05, "loss": 0.5523, "step": 2400 }, { "epoch": 4.077834179357022, "grad_norm": 1.2780104875564575, "learning_rate": 2.002479542231558e-05, "loss": 0.5536, "step": 2410 }, { "epoch": 4.094754653130288, "grad_norm": 1.157820463180542, "learning_rate": 1.9321106976392998e-05, "loss": 0.6153, "step": 2420 }, { "epoch": 4.111675126903553, "grad_norm": 1.119541049003601, "learning_rate": 1.8628680593391556e-05, "loss": 0.5425, "step": 2430 }, { "epoch": 4.128595600676819, "grad_norm": 0.9344776272773743, "learning_rate": 1.7947612929936053e-05, "loss": 0.5635, "step": 2440 }, { "epoch": 4.145516074450085, "grad_norm": 0.9114211201667786, "learning_rate": 1.72779990570741e-05, "loss": 0.5909, "step": 2450 }, { "epoch": 4.16243654822335, "grad_norm": 0.8573546409606934, "learning_rate": 1.6619932447005003e-05, "loss": 0.597, "step": 2460 }, { "epoch": 4.179357021996616, "grad_norm": 0.9491050839424133, "learning_rate": 1.5973504960031936e-05, "loss": 0.5778, "step": 2470 }, { "epoch": 4.196277495769881, "grad_norm": 1.0331389904022217, "learning_rate": 1.533880683173885e-05, "loss": 0.6187, "step": 2480 }, { "epoch": 4.213197969543147, "grad_norm": 1.0236334800720215, "learning_rate": 1.4715926660394696e-05, "loss": 0.5816, "step": 2490 }, { "epoch": 4.230118443316413, "grad_norm": 0.8633939027786255, "learning_rate": 1.410495139458563e-05, "loss": 0.584, "step": 2500 }, { "epoch": 4.247038917089679, "grad_norm": 1.0404596328735352, "learning_rate": 1.3505966321077857e-05, "loss": 0.5848, "step": 2510 }, { "epoch": 4.2639593908629445, "grad_norm": 1.1281291246414185, "learning_rate": 1.2919055052912288e-05, "loss": 0.5899, "step": 2520 }, { "epoch": 4.280879864636209, "grad_norm": 0.9481174945831299, "learning_rate": 1.2344299517733048e-05, "loss": 0.5875, "step": 2530 }, { "epoch": 4.297800338409475, "grad_norm": 0.9576058387756348, "learning_rate": 1.1781779946350924e-05, "loss": 0.5772, "step": 2540 }, { "epoch": 4.314720812182741, "grad_norm": 1.1529393196105957, "learning_rate": 1.1231574861543892e-05, "loss": 0.5495, "step": 2550 }, { "epoch": 4.331641285956007, "grad_norm": 1.1568267345428467, "learning_rate": 1.069376106709612e-05, "loss": 0.5644, "step": 2560 }, { "epoch": 4.3485617597292725, "grad_norm": 1.0102249383926392, "learning_rate": 1.0168413637076735e-05, "loss": 0.5893, "step": 2570 }, { "epoch": 4.365482233502538, "grad_norm": 1.1391773223876953, "learning_rate": 9.65560590536021e-06, "loss": 0.6075, "step": 2580 }, { "epoch": 4.382402707275804, "grad_norm": 1.0958099365234375, "learning_rate": 9.155409455389553e-06, "loss": 0.6103, "step": 2590 }, { "epoch": 4.39932318104907, "grad_norm": 1.1859424114227295, "learning_rate": 8.667894110183895e-06, "loss": 0.5932, "step": 2600 }, { "epoch": 4.416243654822335, "grad_norm": 1.0985631942749023, "learning_rate": 8.19312792259187e-06, "loss": 0.6136, "step": 2610 }, { "epoch": 4.4331641285956005, "grad_norm": 1.019136667251587, "learning_rate": 7.731177165791948e-06, "loss": 0.6161, "step": 2620 }, { "epoch": 4.450084602368866, "grad_norm": 0.9153673052787781, "learning_rate": 7.282106324041349e-06, "loss": 0.6149, "step": 2630 }, { "epoch": 4.467005076142132, "grad_norm": 1.2232962846755981, "learning_rate": 6.845978083674587e-06, "loss": 0.5584, "step": 2640 }, { "epoch": 4.483925549915398, "grad_norm": 1.3178131580352783, "learning_rate": 6.4228533243530065e-06, "loss": 0.6045, "step": 2650 }, { "epoch": 4.500846023688664, "grad_norm": 1.2076665163040161, "learning_rate": 6.012791110566473e-06, "loss": 0.613, "step": 2660 }, { "epoch": 4.517766497461929, "grad_norm": 1.0967868566513062, "learning_rate": 5.615848683388636e-06, "loss": 0.6126, "step": 2670 }, { "epoch": 4.534686971235194, "grad_norm": 1.1182656288146973, "learning_rate": 5.232081452486437e-06, "loss": 0.5872, "step": 2680 }, { "epoch": 4.55160744500846, "grad_norm": 1.2936855554580688, "learning_rate": 4.861542988385393e-06, "loss": 0.5872, "step": 2690 }, { "epoch": 4.568527918781726, "grad_norm": 1.2152096033096313, "learning_rate": 4.504285014991761e-06, "loss": 0.5829, "step": 2700 }, { "epoch": 4.585448392554992, "grad_norm": 0.859682559967041, "learning_rate": 4.160357402372217e-06, "loss": 0.6164, "step": 2710 }, { "epoch": 4.602368866328257, "grad_norm": 1.1588406562805176, "learning_rate": 3.8298081597925025e-06, "loss": 0.5858, "step": 2720 }, { "epoch": 4.619289340101523, "grad_norm": 0.8907061815261841, "learning_rate": 3.5126834290157063e-06, "loss": 0.583, "step": 2730 }, { "epoch": 4.636209813874789, "grad_norm": 1.0281578302383423, "learning_rate": 3.209027477861293e-06, "loss": 0.6147, "step": 2740 }, { "epoch": 4.653130287648054, "grad_norm": 1.67826247215271, "learning_rate": 2.9188826940257373e-06, "loss": 0.5593, "step": 2750 }, { "epoch": 4.67005076142132, "grad_norm": 0.8971337676048279, "learning_rate": 2.6422895791655243e-06, "loss": 0.5717, "step": 2760 }, { "epoch": 4.686971235194585, "grad_norm": 0.9563831090927124, "learning_rate": 2.379286743243514e-06, "loss": 0.5616, "step": 2770 }, { "epoch": 4.703891708967851, "grad_norm": 0.8459508419036865, "learning_rate": 2.1299108991393314e-06, "loss": 0.6003, "step": 2780 }, { "epoch": 4.720812182741117, "grad_norm": 1.0900635719299316, "learning_rate": 1.8941968575245327e-06, "loss": 0.5673, "step": 2790 }, { "epoch": 4.737732656514383, "grad_norm": 0.8653163313865662, "learning_rate": 1.6721775220033598e-06, "loss": 0.5735, "step": 2800 }, { "epoch": 4.7546531302876485, "grad_norm": 1.0450564622879028, "learning_rate": 1.4638838845197344e-06, "loss": 0.5527, "step": 2810 }, { "epoch": 4.771573604060913, "grad_norm": 1.1020820140838623, "learning_rate": 1.2693450210309877e-06, "loss": 0.5623, "step": 2820 }, { "epoch": 4.788494077834179, "grad_norm": 0.9869380593299866, "learning_rate": 1.0885880874491273e-06, "loss": 0.5677, "step": 2830 }, { "epoch": 4.805414551607445, "grad_norm": 1.2315829992294312, "learning_rate": 9.216383158501596e-07, "loss": 0.5805, "step": 2840 }, { "epoch": 4.822335025380711, "grad_norm": 0.9307577610015869, "learning_rate": 7.685190109518514e-07, "loss": 0.5778, "step": 2850 }, { "epoch": 4.8392554991539765, "grad_norm": 1.1593735218048096, "learning_rate": 6.29251546860643e-07, "loss": 0.5772, "step": 2860 }, { "epoch": 4.856175972927242, "grad_norm": 1.1079590320587158, "learning_rate": 5.038553640879684e-07, "loss": 0.6036, "step": 2870 }, { "epoch": 4.873096446700508, "grad_norm": 1.0658416748046875, "learning_rate": 3.923479668365815e-07, "loss": 0.5779, "step": 2880 }, { "epoch": 4.890016920473773, "grad_norm": 0.9755645990371704, "learning_rate": 2.9474492055708845e-07, "loss": 0.6032, "step": 2890 }, { "epoch": 4.906937394247039, "grad_norm": 1.306435465812683, "learning_rate": 2.1105984977513038e-07, "loss": 0.6159, "step": 2900 }, { "epoch": 4.9238578680203045, "grad_norm": 1.0507274866104126, "learning_rate": 1.413044361895932e-07, "loss": 0.5738, "step": 2910 }, { "epoch": 4.94077834179357, "grad_norm": 0.9364911913871765, "learning_rate": 8.548841704185684e-08, "loss": 0.5774, "step": 2920 }, { "epoch": 4.957698815566836, "grad_norm": 0.8698614239692688, "learning_rate": 4.361958375662667e-08, "loss": 0.5491, "step": 2930 }, { "epoch": 4.974619289340102, "grad_norm": 1.1547551155090332, "learning_rate": 1.570378085428148e-08, "loss": 0.5937, "step": 2940 }, { "epoch": 4.991539763113368, "grad_norm": 1.1454553604125977, "learning_rate": 1.7449051350482137e-09, "loss": 0.5837, "step": 2950 }, { "epoch": 5.0, "step": 2955, "total_flos": 3.752202925842104e+17, "train_loss": 0.8236714023422267, "train_runtime": 1258.0596, "train_samples_per_second": 75.155, "train_steps_per_second": 2.349 } ], "logging_steps": 10, "max_steps": 2955, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.752202925842104e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }