{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1257, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02386634844868735, "grad_norm": 2.8815478378979726, "learning_rate": 5e-06, "loss": 0.8847, "step": 10 }, { "epoch": 0.0477326968973747, "grad_norm": 2.1111681660055823, "learning_rate": 5e-06, "loss": 0.7795, "step": 20 }, { "epoch": 0.07159904534606205, "grad_norm": 0.7661989040642819, "learning_rate": 5e-06, "loss": 0.7503, "step": 30 }, { "epoch": 0.0954653937947494, "grad_norm": 0.9394058763103804, "learning_rate": 5e-06, "loss": 0.7339, "step": 40 }, { "epoch": 0.11933174224343675, "grad_norm": 0.8225402898620612, "learning_rate": 5e-06, "loss": 0.7135, "step": 50 }, { "epoch": 0.1431980906921241, "grad_norm": 0.7799933213085141, "learning_rate": 5e-06, "loss": 0.7122, "step": 60 }, { "epoch": 0.16706443914081145, "grad_norm": 0.5800160564456466, "learning_rate": 5e-06, "loss": 0.6973, "step": 70 }, { "epoch": 0.1909307875894988, "grad_norm": 0.597063104491002, "learning_rate": 5e-06, "loss": 0.6903, "step": 80 }, { "epoch": 0.21479713603818615, "grad_norm": 0.5997913835782436, "learning_rate": 5e-06, "loss": 0.6808, "step": 90 }, { "epoch": 0.2386634844868735, "grad_norm": 0.9361461454403766, "learning_rate": 5e-06, "loss": 0.6882, "step": 100 }, { "epoch": 0.26252983293556087, "grad_norm": 0.6783993996639943, "learning_rate": 5e-06, "loss": 0.6836, "step": 110 }, { "epoch": 0.2863961813842482, "grad_norm": 0.5795802549448508, "learning_rate": 5e-06, "loss": 0.6806, "step": 120 }, { "epoch": 0.31026252983293556, "grad_norm": 0.5386116555684645, "learning_rate": 5e-06, "loss": 0.6786, "step": 130 }, { "epoch": 0.3341288782816229, "grad_norm": 1.1955667749232783, "learning_rate": 5e-06, "loss": 0.673, "step": 140 }, { "epoch": 0.35799522673031026, "grad_norm": 0.963473662374355, "learning_rate": 5e-06, "loss": 0.6791, "step": 150 }, { "epoch": 0.3818615751789976, "grad_norm": 0.5375818632492324, "learning_rate": 5e-06, "loss": 0.6715, "step": 160 }, { "epoch": 0.40572792362768495, "grad_norm": 0.5122467752567826, "learning_rate": 5e-06, "loss": 0.6672, "step": 170 }, { "epoch": 0.4295942720763723, "grad_norm": 0.5438018622021655, "learning_rate": 5e-06, "loss": 0.6696, "step": 180 }, { "epoch": 0.45346062052505964, "grad_norm": 0.5443450875717797, "learning_rate": 5e-06, "loss": 0.6707, "step": 190 }, { "epoch": 0.477326968973747, "grad_norm": 0.5591666330075007, "learning_rate": 5e-06, "loss": 0.6629, "step": 200 }, { "epoch": 0.5011933174224343, "grad_norm": 0.7316386056094906, "learning_rate": 5e-06, "loss": 0.6601, "step": 210 }, { "epoch": 0.5250596658711217, "grad_norm": 0.8169771743047101, "learning_rate": 5e-06, "loss": 0.666, "step": 220 }, { "epoch": 0.548926014319809, "grad_norm": 0.5023436573258486, "learning_rate": 5e-06, "loss": 0.6655, "step": 230 }, { "epoch": 0.5727923627684964, "grad_norm": 0.5715922888425466, "learning_rate": 5e-06, "loss": 0.6621, "step": 240 }, { "epoch": 0.5966587112171837, "grad_norm": 0.5978492051245125, "learning_rate": 5e-06, "loss": 0.6606, "step": 250 }, { "epoch": 0.6205250596658711, "grad_norm": 0.5562863722589444, "learning_rate": 5e-06, "loss": 0.6646, "step": 260 }, { "epoch": 0.6443914081145584, "grad_norm": 0.5933691995834427, "learning_rate": 5e-06, "loss": 0.6583, "step": 270 }, { "epoch": 0.6682577565632458, "grad_norm": 0.5981641076306046, "learning_rate": 5e-06, "loss": 0.6557, "step": 280 }, { "epoch": 0.6921241050119332, "grad_norm": 0.8109530838139422, "learning_rate": 5e-06, "loss": 0.6582, "step": 290 }, { "epoch": 0.7159904534606205, "grad_norm": 0.5965206875329182, "learning_rate": 5e-06, "loss": 0.6621, "step": 300 }, { "epoch": 0.7398568019093079, "grad_norm": 0.486222749934066, "learning_rate": 5e-06, "loss": 0.6549, "step": 310 }, { "epoch": 0.7637231503579952, "grad_norm": 0.5522832083975265, "learning_rate": 5e-06, "loss": 0.6496, "step": 320 }, { "epoch": 0.7875894988066826, "grad_norm": 0.5396338578678825, "learning_rate": 5e-06, "loss": 0.6431, "step": 330 }, { "epoch": 0.8114558472553699, "grad_norm": 0.509360104131435, "learning_rate": 5e-06, "loss": 0.6458, "step": 340 }, { "epoch": 0.8353221957040573, "grad_norm": 0.5620324892726529, "learning_rate": 5e-06, "loss": 0.6564, "step": 350 }, { "epoch": 0.8591885441527446, "grad_norm": 0.6119146933240237, "learning_rate": 5e-06, "loss": 0.6534, "step": 360 }, { "epoch": 0.883054892601432, "grad_norm": 0.5839021797409776, "learning_rate": 5e-06, "loss": 0.6565, "step": 370 }, { "epoch": 0.9069212410501193, "grad_norm": 0.43401159052073285, "learning_rate": 5e-06, "loss": 0.6535, "step": 380 }, { "epoch": 0.9307875894988067, "grad_norm": 0.5668890229094246, "learning_rate": 5e-06, "loss": 0.6462, "step": 390 }, { "epoch": 0.954653937947494, "grad_norm": 0.5782226624956547, "learning_rate": 5e-06, "loss": 0.6546, "step": 400 }, { "epoch": 0.9785202863961814, "grad_norm": 0.6579659643852935, "learning_rate": 5e-06, "loss": 0.641, "step": 410 }, { "epoch": 1.0, "eval_loss": 0.6449207663536072, "eval_runtime": 41.1778, "eval_samples_per_second": 273.813, "eval_steps_per_second": 1.093, "step": 419 }, { "epoch": 1.0023866348448687, "grad_norm": 0.7827774952972171, "learning_rate": 5e-06, "loss": 0.6431, "step": 420 }, { "epoch": 1.026252983293556, "grad_norm": 0.6329747383963555, "learning_rate": 5e-06, "loss": 0.6134, "step": 430 }, { "epoch": 1.0501193317422435, "grad_norm": 0.5281801985221292, "learning_rate": 5e-06, "loss": 0.6083, "step": 440 }, { "epoch": 1.0739856801909309, "grad_norm": 0.7577758408944637, "learning_rate": 5e-06, "loss": 0.6061, "step": 450 }, { "epoch": 1.097852028639618, "grad_norm": 0.5351677292156073, "learning_rate": 5e-06, "loss": 0.6031, "step": 460 }, { "epoch": 1.1217183770883055, "grad_norm": 0.5015989972497082, "learning_rate": 5e-06, "loss": 0.6059, "step": 470 }, { "epoch": 1.1455847255369929, "grad_norm": 0.5967761959033508, "learning_rate": 5e-06, "loss": 0.6144, "step": 480 }, { "epoch": 1.1694510739856803, "grad_norm": 0.6199769517789647, "learning_rate": 5e-06, "loss": 0.6089, "step": 490 }, { "epoch": 1.1933174224343674, "grad_norm": 0.5989234321280023, "learning_rate": 5e-06, "loss": 0.6079, "step": 500 }, { "epoch": 1.2171837708830548, "grad_norm": 0.48214122296698664, "learning_rate": 5e-06, "loss": 0.6106, "step": 510 }, { "epoch": 1.2410501193317423, "grad_norm": 0.500906885639557, "learning_rate": 5e-06, "loss": 0.6114, "step": 520 }, { "epoch": 1.2649164677804297, "grad_norm": 0.5055182485221988, "learning_rate": 5e-06, "loss": 0.6073, "step": 530 }, { "epoch": 1.288782816229117, "grad_norm": 0.5890740590556416, "learning_rate": 5e-06, "loss": 0.6074, "step": 540 }, { "epoch": 1.3126491646778042, "grad_norm": 0.48236044063151085, "learning_rate": 5e-06, "loss": 0.6128, "step": 550 }, { "epoch": 1.3365155131264916, "grad_norm": 0.5202514925506149, "learning_rate": 5e-06, "loss": 0.6058, "step": 560 }, { "epoch": 1.360381861575179, "grad_norm": 0.5228316664959745, "learning_rate": 5e-06, "loss": 0.6091, "step": 570 }, { "epoch": 1.3842482100238662, "grad_norm": 0.51530770994292, "learning_rate": 5e-06, "loss": 0.6087, "step": 580 }, { "epoch": 1.4081145584725536, "grad_norm": 0.6386559379894787, "learning_rate": 5e-06, "loss": 0.6116, "step": 590 }, { "epoch": 1.431980906921241, "grad_norm": 0.4779744600855222, "learning_rate": 5e-06, "loss": 0.6033, "step": 600 }, { "epoch": 1.4558472553699284, "grad_norm": 0.4819600928038827, "learning_rate": 5e-06, "loss": 0.6077, "step": 610 }, { "epoch": 1.4797136038186158, "grad_norm": 0.45917275315096606, "learning_rate": 5e-06, "loss": 0.6094, "step": 620 }, { "epoch": 1.503579952267303, "grad_norm": 0.5010113270578477, "learning_rate": 5e-06, "loss": 0.6055, "step": 630 }, { "epoch": 1.5274463007159904, "grad_norm": 0.7579117243752399, "learning_rate": 5e-06, "loss": 0.6159, "step": 640 }, { "epoch": 1.5513126491646778, "grad_norm": 0.6420792800924288, "learning_rate": 5e-06, "loss": 0.6058, "step": 650 }, { "epoch": 1.575178997613365, "grad_norm": 0.55940882824889, "learning_rate": 5e-06, "loss": 0.5961, "step": 660 }, { "epoch": 1.5990453460620526, "grad_norm": 0.4983792149426302, "learning_rate": 5e-06, "loss": 0.6083, "step": 670 }, { "epoch": 1.6229116945107398, "grad_norm": 0.5263299595036224, "learning_rate": 5e-06, "loss": 0.6053, "step": 680 }, { "epoch": 1.6467780429594272, "grad_norm": 0.5149484039474402, "learning_rate": 5e-06, "loss": 0.6079, "step": 690 }, { "epoch": 1.6706443914081146, "grad_norm": 0.4685802940879146, "learning_rate": 5e-06, "loss": 0.6041, "step": 700 }, { "epoch": 1.6945107398568018, "grad_norm": 0.4639317334767733, "learning_rate": 5e-06, "loss": 0.6012, "step": 710 }, { "epoch": 1.7183770883054894, "grad_norm": 0.48774835965978913, "learning_rate": 5e-06, "loss": 0.6117, "step": 720 }, { "epoch": 1.7422434367541766, "grad_norm": 0.4895883070209168, "learning_rate": 5e-06, "loss": 0.6066, "step": 730 }, { "epoch": 1.766109785202864, "grad_norm": 0.45180400146140737, "learning_rate": 5e-06, "loss": 0.6031, "step": 740 }, { "epoch": 1.7899761336515514, "grad_norm": 0.5197512123710193, "learning_rate": 5e-06, "loss": 0.6018, "step": 750 }, { "epoch": 1.8138424821002386, "grad_norm": 0.45816712660411146, "learning_rate": 5e-06, "loss": 0.6024, "step": 760 }, { "epoch": 1.837708830548926, "grad_norm": 0.45318755220959944, "learning_rate": 5e-06, "loss": 0.6101, "step": 770 }, { "epoch": 1.8615751789976134, "grad_norm": 0.48227944198410183, "learning_rate": 5e-06, "loss": 0.6084, "step": 780 }, { "epoch": 1.8854415274463006, "grad_norm": 0.5498211459609608, "learning_rate": 5e-06, "loss": 0.6014, "step": 790 }, { "epoch": 1.9093078758949882, "grad_norm": 0.5272792947894827, "learning_rate": 5e-06, "loss": 0.6035, "step": 800 }, { "epoch": 1.9331742243436754, "grad_norm": 0.4520775749202011, "learning_rate": 5e-06, "loss": 0.6023, "step": 810 }, { "epoch": 1.9570405727923628, "grad_norm": 0.48624457536560756, "learning_rate": 5e-06, "loss": 0.6022, "step": 820 }, { "epoch": 1.9809069212410502, "grad_norm": 0.5488566572359053, "learning_rate": 5e-06, "loss": 0.6068, "step": 830 }, { "epoch": 2.0, "eval_loss": 0.6355295777320862, "eval_runtime": 40.6265, "eval_samples_per_second": 277.529, "eval_steps_per_second": 1.108, "step": 838 }, { "epoch": 2.0047732696897373, "grad_norm": 0.6658281603460572, "learning_rate": 5e-06, "loss": 0.5955, "step": 840 }, { "epoch": 2.028639618138425, "grad_norm": 0.7098614483793992, "learning_rate": 5e-06, "loss": 0.5701, "step": 850 }, { "epoch": 2.052505966587112, "grad_norm": 0.6533616500153973, "learning_rate": 5e-06, "loss": 0.5634, "step": 860 }, { "epoch": 2.0763723150357993, "grad_norm": 0.6126417325021997, "learning_rate": 5e-06, "loss": 0.5639, "step": 870 }, { "epoch": 2.100238663484487, "grad_norm": 0.44612672152357774, "learning_rate": 5e-06, "loss": 0.5671, "step": 880 }, { "epoch": 2.124105011933174, "grad_norm": 0.5379973514690706, "learning_rate": 5e-06, "loss": 0.562, "step": 890 }, { "epoch": 2.1479713603818618, "grad_norm": 0.442404391317877, "learning_rate": 5e-06, "loss": 0.5728, "step": 900 }, { "epoch": 2.171837708830549, "grad_norm": 0.5754078775127957, "learning_rate": 5e-06, "loss": 0.5613, "step": 910 }, { "epoch": 2.195704057279236, "grad_norm": 0.6586173018673331, "learning_rate": 5e-06, "loss": 0.5654, "step": 920 }, { "epoch": 2.2195704057279237, "grad_norm": 0.5439510862576353, "learning_rate": 5e-06, "loss": 0.5614, "step": 930 }, { "epoch": 2.243436754176611, "grad_norm": 0.5740630674331443, "learning_rate": 5e-06, "loss": 0.565, "step": 940 }, { "epoch": 2.2673031026252985, "grad_norm": 0.48777976586303107, "learning_rate": 5e-06, "loss": 0.561, "step": 950 }, { "epoch": 2.2911694510739857, "grad_norm": 0.4461443532364416, "learning_rate": 5e-06, "loss": 0.5582, "step": 960 }, { "epoch": 2.315035799522673, "grad_norm": 0.487505257972905, "learning_rate": 5e-06, "loss": 0.5692, "step": 970 }, { "epoch": 2.3389021479713605, "grad_norm": 0.48305958679617367, "learning_rate": 5e-06, "loss": 0.5694, "step": 980 }, { "epoch": 2.3627684964200477, "grad_norm": 0.503359166301003, "learning_rate": 5e-06, "loss": 0.5609, "step": 990 }, { "epoch": 2.386634844868735, "grad_norm": 0.5256679194745039, "learning_rate": 5e-06, "loss": 0.5632, "step": 1000 }, { "epoch": 2.4105011933174225, "grad_norm": 0.5732669619119689, "learning_rate": 5e-06, "loss": 0.5666, "step": 1010 }, { "epoch": 2.4343675417661097, "grad_norm": 0.4506151186524362, "learning_rate": 5e-06, "loss": 0.5762, "step": 1020 }, { "epoch": 2.4582338902147973, "grad_norm": 0.4746241123968773, "learning_rate": 5e-06, "loss": 0.5652, "step": 1030 }, { "epoch": 2.4821002386634845, "grad_norm": 0.46349902513638275, "learning_rate": 5e-06, "loss": 0.5644, "step": 1040 }, { "epoch": 2.5059665871121717, "grad_norm": 0.45976906873116374, "learning_rate": 5e-06, "loss": 0.5686, "step": 1050 }, { "epoch": 2.5298329355608593, "grad_norm": 0.4662220401853357, "learning_rate": 5e-06, "loss": 0.5686, "step": 1060 }, { "epoch": 2.5536992840095465, "grad_norm": 0.4951436398512421, "learning_rate": 5e-06, "loss": 0.5702, "step": 1070 }, { "epoch": 2.577565632458234, "grad_norm": 0.4502698747379483, "learning_rate": 5e-06, "loss": 0.5719, "step": 1080 }, { "epoch": 2.6014319809069213, "grad_norm": 0.5079705723721918, "learning_rate": 5e-06, "loss": 0.5671, "step": 1090 }, { "epoch": 2.6252983293556085, "grad_norm": 0.5140346872907439, "learning_rate": 5e-06, "loss": 0.5671, "step": 1100 }, { "epoch": 2.649164677804296, "grad_norm": 0.5358331120197253, "learning_rate": 5e-06, "loss": 0.5633, "step": 1110 }, { "epoch": 2.6730310262529833, "grad_norm": 0.4907309631164768, "learning_rate": 5e-06, "loss": 0.5644, "step": 1120 }, { "epoch": 2.6968973747016705, "grad_norm": 0.5049970936550133, "learning_rate": 5e-06, "loss": 0.5692, "step": 1130 }, { "epoch": 2.720763723150358, "grad_norm": 0.5553567091902175, "learning_rate": 5e-06, "loss": 0.5658, "step": 1140 }, { "epoch": 2.7446300715990453, "grad_norm": 0.49261752142359677, "learning_rate": 5e-06, "loss": 0.5742, "step": 1150 }, { "epoch": 2.7684964200477324, "grad_norm": 0.5018759977401656, "learning_rate": 5e-06, "loss": 0.5645, "step": 1160 }, { "epoch": 2.79236276849642, "grad_norm": 0.45826106825699625, "learning_rate": 5e-06, "loss": 0.5641, "step": 1170 }, { "epoch": 2.8162291169451072, "grad_norm": 0.5072976091618316, "learning_rate": 5e-06, "loss": 0.5676, "step": 1180 }, { "epoch": 2.840095465393795, "grad_norm": 0.651235919618626, "learning_rate": 5e-06, "loss": 0.5681, "step": 1190 }, { "epoch": 2.863961813842482, "grad_norm": 0.5113781250941779, "learning_rate": 5e-06, "loss": 0.5669, "step": 1200 }, { "epoch": 2.8878281622911697, "grad_norm": 0.4949601222660754, "learning_rate": 5e-06, "loss": 0.5581, "step": 1210 }, { "epoch": 2.911694510739857, "grad_norm": 0.5748654481884351, "learning_rate": 5e-06, "loss": 0.5678, "step": 1220 }, { "epoch": 2.935560859188544, "grad_norm": 0.5793617800123868, "learning_rate": 5e-06, "loss": 0.5649, "step": 1230 }, { "epoch": 2.9594272076372317, "grad_norm": 0.4514242707013011, "learning_rate": 5e-06, "loss": 0.5652, "step": 1240 }, { "epoch": 2.983293556085919, "grad_norm": 0.4744906861925991, "learning_rate": 5e-06, "loss": 0.5638, "step": 1250 }, { "epoch": 3.0, "eval_loss": 0.6389971375465393, "eval_runtime": 41.4704, "eval_samples_per_second": 271.881, "eval_steps_per_second": 1.085, "step": 1257 }, { "epoch": 3.0, "step": 1257, "total_flos": 2105521817518080.0, "train_loss": 0.6169646524112189, "train_runtime": 7810.9265, "train_samples_per_second": 82.273, "train_steps_per_second": 0.161 } ], "logging_steps": 10, "max_steps": 1257, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2105521817518080.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }