{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030959752321981424, "grad_norm": 9.26792445696984, "learning_rate": 5e-06, "loss": 0.9296, "step": 10 }, { "epoch": 0.06191950464396285, "grad_norm": 1.9224039875777081, "learning_rate": 5e-06, "loss": 0.826, "step": 20 }, { "epoch": 0.09287925696594428, "grad_norm": 4.074240624021427, "learning_rate": 5e-06, "loss": 0.7818, "step": 30 }, { "epoch": 0.1238390092879257, "grad_norm": 1.5813292349658419, "learning_rate": 5e-06, "loss": 0.7624, "step": 40 }, { "epoch": 0.15479876160990713, "grad_norm": 0.8536213234663932, "learning_rate": 5e-06, "loss": 0.7313, "step": 50 }, { "epoch": 0.18575851393188855, "grad_norm": 1.087523888330213, "learning_rate": 5e-06, "loss": 0.7307, "step": 60 }, { "epoch": 0.21671826625386997, "grad_norm": 0.8453603539631231, "learning_rate": 5e-06, "loss": 0.7126, "step": 70 }, { "epoch": 0.2476780185758514, "grad_norm": 0.9824688751534248, "learning_rate": 5e-06, "loss": 0.7113, "step": 80 }, { "epoch": 0.2786377708978328, "grad_norm": 0.9326537359571924, "learning_rate": 5e-06, "loss": 0.7037, "step": 90 }, { "epoch": 0.30959752321981426, "grad_norm": 0.548855695013045, "learning_rate": 5e-06, "loss": 0.6988, "step": 100 }, { "epoch": 0.34055727554179566, "grad_norm": 0.5127345076323682, "learning_rate": 5e-06, "loss": 0.6969, "step": 110 }, { "epoch": 0.3715170278637771, "grad_norm": 0.5346034637159197, "learning_rate": 5e-06, "loss": 0.6932, "step": 120 }, { "epoch": 0.4024767801857585, "grad_norm": 0.6366003373202664, "learning_rate": 5e-06, "loss": 0.6862, "step": 130 }, { "epoch": 0.43343653250773995, "grad_norm": 0.659232587562787, "learning_rate": 5e-06, "loss": 0.6861, "step": 140 }, { "epoch": 0.46439628482972134, "grad_norm": 0.7626299453869156, "learning_rate": 5e-06, "loss": 0.6867, "step": 150 }, { "epoch": 0.4953560371517028, "grad_norm": 0.600108061207952, "learning_rate": 5e-06, "loss": 0.6761, "step": 160 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6562920403508146, "learning_rate": 5e-06, "loss": 0.6822, "step": 170 }, { "epoch": 0.5572755417956656, "grad_norm": 0.554500335402875, "learning_rate": 5e-06, "loss": 0.6771, "step": 180 }, { "epoch": 0.5882352941176471, "grad_norm": 0.7448258243980852, "learning_rate": 5e-06, "loss": 0.6725, "step": 190 }, { "epoch": 0.6191950464396285, "grad_norm": 0.7212261495226227, "learning_rate": 5e-06, "loss": 0.6704, "step": 200 }, { "epoch": 0.6501547987616099, "grad_norm": 0.650918538222657, "learning_rate": 5e-06, "loss": 0.6747, "step": 210 }, { "epoch": 0.6811145510835913, "grad_norm": 0.5979672431899267, "learning_rate": 5e-06, "loss": 0.6721, "step": 220 }, { "epoch": 0.7120743034055728, "grad_norm": 0.5587950559181737, "learning_rate": 5e-06, "loss": 0.6775, "step": 230 }, { "epoch": 0.7430340557275542, "grad_norm": 0.7504748626653679, "learning_rate": 5e-06, "loss": 0.6772, "step": 240 }, { "epoch": 0.7739938080495357, "grad_norm": 0.5672124597726503, "learning_rate": 5e-06, "loss": 0.6649, "step": 250 }, { "epoch": 0.804953560371517, "grad_norm": 0.6667059518291187, "learning_rate": 5e-06, "loss": 0.6628, "step": 260 }, { "epoch": 0.8359133126934984, "grad_norm": 0.5557591111686754, "learning_rate": 5e-06, "loss": 0.666, "step": 270 }, { "epoch": 0.8668730650154799, "grad_norm": 0.7400262670967761, "learning_rate": 5e-06, "loss": 0.6608, "step": 280 }, { "epoch": 0.8978328173374613, "grad_norm": 0.5053622688793449, "learning_rate": 5e-06, "loss": 0.6657, "step": 290 }, { "epoch": 0.9287925696594427, "grad_norm": 0.611108366609724, "learning_rate": 5e-06, "loss": 0.6612, "step": 300 }, { "epoch": 0.9597523219814241, "grad_norm": 0.5223978557328981, "learning_rate": 5e-06, "loss": 0.6585, "step": 310 }, { "epoch": 0.9907120743034056, "grad_norm": 0.5084066168996301, "learning_rate": 5e-06, "loss": 0.6582, "step": 320 }, { "epoch": 1.0, "eval_loss": 0.6614387631416321, "eval_runtime": 31.0197, "eval_samples_per_second": 280.015, "eval_steps_per_second": 1.096, "step": 323 }, { "epoch": 1.021671826625387, "grad_norm": 0.8185084818131129, "learning_rate": 5e-06, "loss": 0.6318, "step": 330 }, { "epoch": 1.0526315789473684, "grad_norm": 0.7247142810746562, "learning_rate": 5e-06, "loss": 0.6153, "step": 340 }, { "epoch": 1.08359133126935, "grad_norm": 0.574027398851242, "learning_rate": 5e-06, "loss": 0.6182, "step": 350 }, { "epoch": 1.1145510835913313, "grad_norm": 0.5449783832911268, "learning_rate": 5e-06, "loss": 0.6155, "step": 360 }, { "epoch": 1.1455108359133126, "grad_norm": 0.5025194777117987, "learning_rate": 5e-06, "loss": 0.6181, "step": 370 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5884698444538693, "learning_rate": 5e-06, "loss": 0.6145, "step": 380 }, { "epoch": 1.2074303405572755, "grad_norm": 0.6402773293709152, "learning_rate": 5e-06, "loss": 0.6182, "step": 390 }, { "epoch": 1.238390092879257, "grad_norm": 0.7376501065513348, "learning_rate": 5e-06, "loss": 0.6078, "step": 400 }, { "epoch": 1.2693498452012384, "grad_norm": 0.5604731743792086, "learning_rate": 5e-06, "loss": 0.6178, "step": 410 }, { "epoch": 1.3003095975232197, "grad_norm": 0.5374880157799365, "learning_rate": 5e-06, "loss": 0.6118, "step": 420 }, { "epoch": 1.3312693498452013, "grad_norm": 0.5183594439426741, "learning_rate": 5e-06, "loss": 0.6152, "step": 430 }, { "epoch": 1.3622291021671826, "grad_norm": 0.7429617462855133, "learning_rate": 5e-06, "loss": 0.6189, "step": 440 }, { "epoch": 1.3931888544891642, "grad_norm": 0.5637315725076051, "learning_rate": 5e-06, "loss": 0.6143, "step": 450 }, { "epoch": 1.4241486068111455, "grad_norm": 0.47806274705798535, "learning_rate": 5e-06, "loss": 0.6184, "step": 460 }, { "epoch": 1.4551083591331269, "grad_norm": 0.7013859457502354, "learning_rate": 5e-06, "loss": 0.6146, "step": 470 }, { "epoch": 1.4860681114551084, "grad_norm": 0.7728057436850783, "learning_rate": 5e-06, "loss": 0.6235, "step": 480 }, { "epoch": 1.5170278637770898, "grad_norm": 0.5540033221172351, "learning_rate": 5e-06, "loss": 0.6146, "step": 490 }, { "epoch": 1.5479876160990713, "grad_norm": 0.4989101370327553, "learning_rate": 5e-06, "loss": 0.6109, "step": 500 }, { "epoch": 1.5789473684210527, "grad_norm": 0.6201415740790441, "learning_rate": 5e-06, "loss": 0.6127, "step": 510 }, { "epoch": 1.609907120743034, "grad_norm": 0.6182247247832351, "learning_rate": 5e-06, "loss": 0.6158, "step": 520 }, { "epoch": 1.6408668730650153, "grad_norm": 0.5273934388255005, "learning_rate": 5e-06, "loss": 0.6152, "step": 530 }, { "epoch": 1.671826625386997, "grad_norm": 0.5657886835824731, "learning_rate": 5e-06, "loss": 0.6135, "step": 540 }, { "epoch": 1.7027863777089784, "grad_norm": 0.6873412871431165, "learning_rate": 5e-06, "loss": 0.615, "step": 550 }, { "epoch": 1.7337461300309598, "grad_norm": 0.5877043924887657, "learning_rate": 5e-06, "loss": 0.6125, "step": 560 }, { "epoch": 1.7647058823529411, "grad_norm": 0.6037122620569243, "learning_rate": 5e-06, "loss": 0.6118, "step": 570 }, { "epoch": 1.7956656346749225, "grad_norm": 0.4753962140527953, "learning_rate": 5e-06, "loss": 0.6128, "step": 580 }, { "epoch": 1.826625386996904, "grad_norm": 0.5268653628592879, "learning_rate": 5e-06, "loss": 0.6172, "step": 590 }, { "epoch": 1.8575851393188856, "grad_norm": 1.002687895644436, "learning_rate": 5e-06, "loss": 0.615, "step": 600 }, { "epoch": 1.888544891640867, "grad_norm": 0.5100009376935558, "learning_rate": 5e-06, "loss": 0.6182, "step": 610 }, { "epoch": 1.9195046439628483, "grad_norm": 0.691746404073842, "learning_rate": 5e-06, "loss": 0.6131, "step": 620 }, { "epoch": 1.9504643962848296, "grad_norm": 0.5303891186235742, "learning_rate": 5e-06, "loss": 0.6152, "step": 630 }, { "epoch": 1.9814241486068112, "grad_norm": 0.5949259061449034, "learning_rate": 5e-06, "loss": 0.6073, "step": 640 }, { "epoch": 2.0, "eval_loss": 0.6524380445480347, "eval_runtime": 31.1666, "eval_samples_per_second": 278.696, "eval_steps_per_second": 1.091, "step": 646 }, { "epoch": 2.0123839009287927, "grad_norm": 0.9446095499864162, "learning_rate": 5e-06, "loss": 0.591, "step": 650 }, { "epoch": 2.043343653250774, "grad_norm": 0.6258477650873515, "learning_rate": 5e-06, "loss": 0.5646, "step": 660 }, { "epoch": 2.0743034055727554, "grad_norm": 0.604477009985834, "learning_rate": 5e-06, "loss": 0.5693, "step": 670 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5940857975430127, "learning_rate": 5e-06, "loss": 0.5621, "step": 680 }, { "epoch": 2.136222910216718, "grad_norm": 0.6310773073567004, "learning_rate": 5e-06, "loss": 0.5664, "step": 690 }, { "epoch": 2.1671826625387, "grad_norm": 0.6735784589983905, "learning_rate": 5e-06, "loss": 0.5659, "step": 700 }, { "epoch": 2.198142414860681, "grad_norm": 0.6737551186617048, "learning_rate": 5e-06, "loss": 0.5744, "step": 710 }, { "epoch": 2.2291021671826625, "grad_norm": 0.538363932283787, "learning_rate": 5e-06, "loss": 0.5634, "step": 720 }, { "epoch": 2.260061919504644, "grad_norm": 0.6696764784705668, "learning_rate": 5e-06, "loss": 0.5633, "step": 730 }, { "epoch": 2.291021671826625, "grad_norm": 0.5541664073886028, "learning_rate": 5e-06, "loss": 0.572, "step": 740 }, { "epoch": 2.321981424148607, "grad_norm": 0.7344897597565605, "learning_rate": 5e-06, "loss": 0.5686, "step": 750 }, { "epoch": 2.3529411764705883, "grad_norm": 0.5507126279939225, "learning_rate": 5e-06, "loss": 0.5729, "step": 760 }, { "epoch": 2.3839009287925697, "grad_norm": 0.6344649027391833, "learning_rate": 5e-06, "loss": 0.5665, "step": 770 }, { "epoch": 2.414860681114551, "grad_norm": 0.6106920196828644, "learning_rate": 5e-06, "loss": 0.5702, "step": 780 }, { "epoch": 2.4458204334365323, "grad_norm": 0.6238727356689014, "learning_rate": 5e-06, "loss": 0.5706, "step": 790 }, { "epoch": 2.476780185758514, "grad_norm": 0.6326270787372308, "learning_rate": 5e-06, "loss": 0.5739, "step": 800 }, { "epoch": 2.5077399380804954, "grad_norm": 0.6392551743906308, "learning_rate": 5e-06, "loss": 0.5707, "step": 810 }, { "epoch": 2.538699690402477, "grad_norm": 0.5293919880888138, "learning_rate": 5e-06, "loss": 0.5694, "step": 820 }, { "epoch": 2.569659442724458, "grad_norm": 0.5315004573871298, "learning_rate": 5e-06, "loss": 0.5715, "step": 830 }, { "epoch": 2.6006191950464395, "grad_norm": 0.6021834459451176, "learning_rate": 5e-06, "loss": 0.5686, "step": 840 }, { "epoch": 2.6315789473684212, "grad_norm": 0.7264827387006807, "learning_rate": 5e-06, "loss": 0.5711, "step": 850 }, { "epoch": 2.6625386996904026, "grad_norm": 0.5432440378468479, "learning_rate": 5e-06, "loss": 0.5682, "step": 860 }, { "epoch": 2.693498452012384, "grad_norm": 0.4887763215481647, "learning_rate": 5e-06, "loss": 0.5727, "step": 870 }, { "epoch": 2.7244582043343653, "grad_norm": 0.6219072904402111, "learning_rate": 5e-06, "loss": 0.5676, "step": 880 }, { "epoch": 2.7554179566563466, "grad_norm": 0.5596606256063349, "learning_rate": 5e-06, "loss": 0.5748, "step": 890 }, { "epoch": 2.7863777089783284, "grad_norm": 0.6091604993895532, "learning_rate": 5e-06, "loss": 0.5639, "step": 900 }, { "epoch": 2.8173374613003097, "grad_norm": 0.5920477606163769, "learning_rate": 5e-06, "loss": 0.5717, "step": 910 }, { "epoch": 2.848297213622291, "grad_norm": 0.6807900156580523, "learning_rate": 5e-06, "loss": 0.5721, "step": 920 }, { "epoch": 2.8792569659442724, "grad_norm": 0.6050955818661157, "learning_rate": 5e-06, "loss": 0.5748, "step": 930 }, { "epoch": 2.9102167182662537, "grad_norm": 0.6165235050612774, "learning_rate": 5e-06, "loss": 0.5732, "step": 940 }, { "epoch": 2.9411764705882355, "grad_norm": 0.6346237356986199, "learning_rate": 5e-06, "loss": 0.5743, "step": 950 }, { "epoch": 2.972136222910217, "grad_norm": 0.6125475070291018, "learning_rate": 5e-06, "loss": 0.5733, "step": 960 }, { "epoch": 3.0, "eval_loss": 0.6580318212509155, "eval_runtime": 31.0741, "eval_samples_per_second": 279.526, "eval_steps_per_second": 1.094, "step": 969 }, { "epoch": 3.0, "step": 969, "total_flos": 1623111090831360.0, "train_loss": 0.628350810984955, "train_runtime": 6073.5579, "train_samples_per_second": 81.516, "train_steps_per_second": 0.16 } ], "logging_steps": 10, "max_steps": 969, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1623111090831360.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }