{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994871794871795, "eval_steps": 500, "global_step": 657, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045584045584045586, "grad_norm": 0.9747199141382675, "learning_rate": 5e-06, "loss": 0.7529, "step": 10 }, { "epoch": 0.09116809116809117, "grad_norm": 0.8738603014316104, "learning_rate": 5e-06, "loss": 0.6963, "step": 20 }, { "epoch": 0.13675213675213677, "grad_norm": 0.8150690175536751, "learning_rate": 5e-06, "loss": 0.6838, "step": 30 }, { "epoch": 0.18233618233618235, "grad_norm": 0.8040590579100562, "learning_rate": 5e-06, "loss": 0.6731, "step": 40 }, { "epoch": 0.22792022792022792, "grad_norm": 0.8097580687205038, "learning_rate": 5e-06, "loss": 0.6568, "step": 50 }, { "epoch": 0.27350427350427353, "grad_norm": 0.7017512733530653, "learning_rate": 5e-06, "loss": 0.6482, "step": 60 }, { "epoch": 0.3190883190883191, "grad_norm": 0.46658768732638123, "learning_rate": 5e-06, "loss": 0.6508, "step": 70 }, { "epoch": 0.3646723646723647, "grad_norm": 0.27624642671580113, "learning_rate": 5e-06, "loss": 0.6449, "step": 80 }, { "epoch": 0.41025641025641024, "grad_norm": 0.24917438010760184, "learning_rate": 5e-06, "loss": 0.6411, "step": 90 }, { "epoch": 0.45584045584045585, "grad_norm": 0.2581331671168845, "learning_rate": 5e-06, "loss": 0.6402, "step": 100 }, { "epoch": 0.5014245014245015, "grad_norm": 0.24037652957624636, "learning_rate": 5e-06, "loss": 0.6422, "step": 110 }, { "epoch": 0.5470085470085471, "grad_norm": 0.23269369333404952, "learning_rate": 5e-06, "loss": 0.6374, "step": 120 }, { "epoch": 0.5925925925925926, "grad_norm": 0.23915104514085617, "learning_rate": 5e-06, "loss": 0.6395, "step": 130 }, { "epoch": 0.6381766381766382, "grad_norm": 0.2609780887042042, "learning_rate": 5e-06, "loss": 0.6412, "step": 140 }, { "epoch": 0.6837606837606838, "grad_norm": 0.23404048980327943, "learning_rate": 5e-06, "loss": 0.6337, "step": 150 }, { "epoch": 0.7293447293447294, "grad_norm": 0.21838704952274085, "learning_rate": 5e-06, "loss": 0.6333, "step": 160 }, { "epoch": 0.7749287749287749, "grad_norm": 0.2382035774873146, "learning_rate": 5e-06, "loss": 0.633, "step": 170 }, { "epoch": 0.8205128205128205, "grad_norm": 0.2438000385852018, "learning_rate": 5e-06, "loss": 0.6386, "step": 180 }, { "epoch": 0.8660968660968661, "grad_norm": 0.22530554762484006, "learning_rate": 5e-06, "loss": 0.6345, "step": 190 }, { "epoch": 0.9116809116809117, "grad_norm": 0.2349886748386411, "learning_rate": 5e-06, "loss": 0.6408, "step": 200 }, { "epoch": 0.9572649572649573, "grad_norm": 0.25085516929265755, "learning_rate": 5e-06, "loss": 0.6322, "step": 210 }, { "epoch": 0.9982905982905983, "eval_loss": 0.6290858387947083, "eval_runtime": 223.0866, "eval_samples_per_second": 52.997, "eval_steps_per_second": 0.417, "step": 219 }, { "epoch": 1.002849002849003, "grad_norm": 0.4128874703264031, "learning_rate": 5e-06, "loss": 0.6672, "step": 220 }, { "epoch": 1.0484330484330484, "grad_norm": 0.2741974395463933, "learning_rate": 5e-06, "loss": 0.6126, "step": 230 }, { "epoch": 1.0940170940170941, "grad_norm": 0.24996957336799014, "learning_rate": 5e-06, "loss": 0.6122, "step": 240 }, { "epoch": 1.1396011396011396, "grad_norm": 0.2308654521968029, "learning_rate": 5e-06, "loss": 0.6147, "step": 250 }, { "epoch": 1.1851851851851851, "grad_norm": 0.21800620424831227, "learning_rate": 5e-06, "loss": 0.6104, "step": 260 }, { "epoch": 1.2307692307692308, "grad_norm": 
0.22878445296983144, "learning_rate": 5e-06, "loss": 0.6109, "step": 270 }, { "epoch": 1.2763532763532763, "grad_norm": 0.22048121695978434, "learning_rate": 5e-06, "loss": 0.6137, "step": 280 }, { "epoch": 1.3219373219373218, "grad_norm": 0.22321985375553052, "learning_rate": 5e-06, "loss": 0.6102, "step": 290 }, { "epoch": 1.3675213675213675, "grad_norm": 0.23340158642086364, "learning_rate": 5e-06, "loss": 0.609, "step": 300 }, { "epoch": 1.413105413105413, "grad_norm": 0.2160885724116477, "learning_rate": 5e-06, "loss": 0.613, "step": 310 }, { "epoch": 1.4586894586894588, "grad_norm": 0.21579764874251084, "learning_rate": 5e-06, "loss": 0.6105, "step": 320 }, { "epoch": 1.5042735042735043, "grad_norm": 0.24191612198460108, "learning_rate": 5e-06, "loss": 0.6049, "step": 330 }, { "epoch": 1.54985754985755, "grad_norm": 0.2218278655422596, "learning_rate": 5e-06, "loss": 0.6081, "step": 340 }, { "epoch": 1.5954415954415955, "grad_norm": 0.2322145105092986, "learning_rate": 5e-06, "loss": 0.6129, "step": 350 }, { "epoch": 1.641025641025641, "grad_norm": 0.21658541308539972, "learning_rate": 5e-06, "loss": 0.6092, "step": 360 }, { "epoch": 1.6866096866096867, "grad_norm": 0.24054412349179422, "learning_rate": 5e-06, "loss": 0.6057, "step": 370 }, { "epoch": 1.7321937321937322, "grad_norm": 0.2725818691336268, "learning_rate": 5e-06, "loss": 0.6065, "step": 380 }, { "epoch": 1.7777777777777777, "grad_norm": 0.23261322507876125, "learning_rate": 5e-06, "loss": 0.6075, "step": 390 }, { "epoch": 1.8233618233618234, "grad_norm": 0.21931910039916533, "learning_rate": 5e-06, "loss": 0.6028, "step": 400 }, { "epoch": 1.868945868945869, "grad_norm": 0.2102187671028171, "learning_rate": 5e-06, "loss": 0.6003, "step": 410 }, { "epoch": 1.9145299145299144, "grad_norm": 0.22631213435723302, "learning_rate": 5e-06, "loss": 0.6067, "step": 420 }, { "epoch": 1.96011396011396, "grad_norm": 0.23696933054944608, "learning_rate": 5e-06, "loss": 0.6083, "step": 430 }, { "epoch": 1.9965811965811966, "eval_loss": 0.6215006709098816, "eval_runtime": 224.1659, "eval_samples_per_second": 52.742, "eval_steps_per_second": 0.415, "step": 438 }, { "epoch": 2.005698005698006, "grad_norm": 0.2323893228533647, "learning_rate": 5e-06, "loss": 0.6421, "step": 440 }, { "epoch": 2.051282051282051, "grad_norm": 0.2248336007687922, "learning_rate": 5e-06, "loss": 0.5803, "step": 450 }, { "epoch": 2.096866096866097, "grad_norm": 0.225417442071853, "learning_rate": 5e-06, "loss": 0.5869, "step": 460 }, { "epoch": 2.1424501424501425, "grad_norm": 0.2204104722476421, "learning_rate": 5e-06, "loss": 0.5845, "step": 470 }, { "epoch": 2.1880341880341883, "grad_norm": 0.2565084390638512, "learning_rate": 5e-06, "loss": 0.5837, "step": 480 }, { "epoch": 2.2336182336182335, "grad_norm": 0.2245257556826023, "learning_rate": 5e-06, "loss": 0.5828, "step": 490 }, { "epoch": 2.2792022792022792, "grad_norm": 0.22001308034422606, "learning_rate": 5e-06, "loss": 0.5831, "step": 500 }, { "epoch": 2.324786324786325, "grad_norm": 0.2270454664497775, "learning_rate": 5e-06, "loss": 0.5838, "step": 510 }, { "epoch": 2.3703703703703702, "grad_norm": 0.22740470236209073, "learning_rate": 5e-06, "loss": 0.586, "step": 520 }, { "epoch": 2.415954415954416, "grad_norm": 0.2302979493200276, "learning_rate": 5e-06, "loss": 0.5883, "step": 530 }, { "epoch": 2.4615384615384617, "grad_norm": 0.25292997039450504, "learning_rate": 5e-06, "loss": 0.5927, "step": 540 }, { "epoch": 2.5071225071225074, "grad_norm": 0.2223763921390108, "learning_rate": 5e-06, "loss": 
0.5914, "step": 550 }, { "epoch": 2.5527065527065527, "grad_norm": 0.23815767371523935, "learning_rate": 5e-06, "loss": 0.5859, "step": 560 }, { "epoch": 2.5982905982905984, "grad_norm": 0.2593266359272012, "learning_rate": 5e-06, "loss": 0.5854, "step": 570 }, { "epoch": 2.6438746438746437, "grad_norm": 0.23931559412564932, "learning_rate": 5e-06, "loss": 0.5854, "step": 580 }, { "epoch": 2.6894586894586894, "grad_norm": 0.21456969497967046, "learning_rate": 5e-06, "loss": 0.5865, "step": 590 }, { "epoch": 2.735042735042735, "grad_norm": 0.23061617250879088, "learning_rate": 5e-06, "loss": 0.5862, "step": 600 }, { "epoch": 2.780626780626781, "grad_norm": 0.2088223573865155, "learning_rate": 5e-06, "loss": 0.5856, "step": 610 }, { "epoch": 2.826210826210826, "grad_norm": 0.22810405657658617, "learning_rate": 5e-06, "loss": 0.5874, "step": 620 }, { "epoch": 2.871794871794872, "grad_norm": 0.268468678439456, "learning_rate": 5e-06, "loss": 0.5889, "step": 630 }, { "epoch": 2.9173789173789175, "grad_norm": 0.23190583340922707, "learning_rate": 5e-06, "loss": 0.585, "step": 640 }, { "epoch": 2.962962962962963, "grad_norm": 0.2202766065355913, "learning_rate": 5e-06, "loss": 0.58, "step": 650 }, { "epoch": 2.994871794871795, "eval_loss": 0.6190248131752014, "eval_runtime": 222.4874, "eval_samples_per_second": 53.14, "eval_steps_per_second": 0.418, "step": 657 }, { "epoch": 2.994871794871795, "step": 657, "total_flos": 2754957111459840.0, "train_loss": 0.6167326612196739, "train_runtime": 35776.9525, "train_samples_per_second": 18.835, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2754957111459840.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }