{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02564102564102564, "grad_norm": 3.406068801879883, "learning_rate": 5e-06, "loss": 0.9168, "step": 10 }, { "epoch": 0.05128205128205128, "grad_norm": 3.5034339427948, "learning_rate": 5e-06, "loss": 0.7688, "step": 20 }, { "epoch": 0.07692307692307693, "grad_norm": 1.0085859298706055, "learning_rate": 5e-06, "loss": 0.729, "step": 30 }, { "epoch": 0.10256410256410256, "grad_norm": 0.8697875142097473, "learning_rate": 5e-06, "loss": 0.7004, "step": 40 }, { "epoch": 0.1282051282051282, "grad_norm": 0.9098314642906189, "learning_rate": 5e-06, "loss": 0.6853, "step": 50 }, { "epoch": 0.15384615384615385, "grad_norm": 1.240673542022705, "learning_rate": 5e-06, "loss": 0.6668, "step": 60 }, { "epoch": 0.1794871794871795, "grad_norm": 1.3245731592178345, "learning_rate": 5e-06, "loss": 0.654, "step": 70 }, { "epoch": 0.20512820512820512, "grad_norm": 0.6942853331565857, "learning_rate": 5e-06, "loss": 0.6427, "step": 80 }, { "epoch": 0.23076923076923078, "grad_norm": 0.7867761254310608, "learning_rate": 5e-06, "loss": 0.6425, "step": 90 }, { "epoch": 0.2564102564102564, "grad_norm": 0.7657337784767151, "learning_rate": 5e-06, "loss": 0.6305, "step": 100 }, { "epoch": 0.28205128205128205, "grad_norm": 0.8531173467636108, "learning_rate": 5e-06, "loss": 0.6308, "step": 110 }, { "epoch": 0.3076923076923077, "grad_norm": 0.6474210023880005, "learning_rate": 5e-06, "loss": 0.6227, "step": 120 }, { "epoch": 0.3333333333333333, "grad_norm": 0.6232497692108154, "learning_rate": 5e-06, "loss": 0.6293, "step": 130 }, { "epoch": 0.358974358974359, "grad_norm": 0.6542862057685852, "learning_rate": 5e-06, "loss": 0.6217, "step": 140 }, { "epoch": 0.38461538461538464, "grad_norm": 0.9984610080718994, "learning_rate": 5e-06, "loss": 0.6213, "step": 150 }, { "epoch": 0.41025641025641024, "grad_norm": 0.7436656355857849, "learning_rate": 5e-06, "loss": 0.614, "step": 160 }, { "epoch": 0.4358974358974359, "grad_norm": 0.7107124328613281, "learning_rate": 5e-06, "loss": 0.6057, "step": 170 }, { "epoch": 0.46153846153846156, "grad_norm": 0.8069647550582886, "learning_rate": 5e-06, "loss": 0.6134, "step": 180 }, { "epoch": 0.48717948717948717, "grad_norm": 0.5554560422897339, "learning_rate": 5e-06, "loss": 0.6069, "step": 190 }, { "epoch": 0.5128205128205128, "grad_norm": 0.5725631713867188, "learning_rate": 5e-06, "loss": 0.6081, "step": 200 }, { "epoch": 0.5384615384615384, "grad_norm": 0.7718241214752197, "learning_rate": 5e-06, "loss": 0.6003, "step": 210 }, { "epoch": 0.5641025641025641, "grad_norm": 0.659430742263794, "learning_rate": 5e-06, "loss": 0.6044, "step": 220 }, { "epoch": 0.5897435897435898, "grad_norm": 0.5641574859619141, "learning_rate": 5e-06, "loss": 0.5988, "step": 230 }, { "epoch": 0.6153846153846154, "grad_norm": 0.613983154296875, "learning_rate": 5e-06, "loss": 0.6079, "step": 240 }, { "epoch": 0.6410256410256411, "grad_norm": 0.7252273559570312, "learning_rate": 5e-06, "loss": 0.5981, "step": 250 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6930380463600159, "learning_rate": 5e-06, "loss": 0.5927, "step": 260 }, { "epoch": 0.6923076923076923, "grad_norm": 0.6013543605804443, "learning_rate": 5e-06, "loss": 0.5942, "step": 270 }, { "epoch": 0.717948717948718, "grad_norm": 0.5443430542945862, "learning_rate": 5e-06, "loss": 0.594, "step": 280 }, { "epoch": 0.7435897435897436, "grad_norm": 0.6630271077156067, "learning_rate": 5e-06, "loss": 0.5926, "step": 290 }, { "epoch": 0.7692307692307693, "grad_norm": 0.6013346314430237, "learning_rate": 5e-06, "loss": 0.5874, "step": 300 }, { "epoch": 0.7948717948717948, "grad_norm": 0.5420301556587219, "learning_rate": 5e-06, "loss": 0.5913, "step": 310 }, { "epoch": 0.8205128205128205, "grad_norm": 0.809611976146698, "learning_rate": 5e-06, "loss": 0.5917, "step": 320 }, { "epoch": 0.8461538461538461, "grad_norm": 0.7773135304450989, "learning_rate": 5e-06, "loss": 0.5836, "step": 330 }, { "epoch": 0.8717948717948718, "grad_norm": 0.5458276271820068, "learning_rate": 5e-06, "loss": 0.5794, "step": 340 }, { "epoch": 0.8974358974358975, "grad_norm": 0.6851891875267029, "learning_rate": 5e-06, "loss": 0.5902, "step": 350 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5757733583450317, "learning_rate": 5e-06, "loss": 0.5826, "step": 360 }, { "epoch": 0.9487179487179487, "grad_norm": 0.5545377135276794, "learning_rate": 5e-06, "loss": 0.5887, "step": 370 }, { "epoch": 0.9743589743589743, "grad_norm": 0.5440164804458618, "learning_rate": 5e-06, "loss": 0.584, "step": 380 }, { "epoch": 1.0, "grad_norm": 0.5356547236442566, "learning_rate": 5e-06, "loss": 0.5779, "step": 390 }, { "epoch": 1.0, "eval_loss": 0.5792508721351624, "eval_runtime": 37.2797, "eval_samples_per_second": 281.762, "eval_steps_per_second": 1.127, "step": 390 }, { "epoch": 1.0256410256410255, "grad_norm": 0.6288334131240845, "learning_rate": 5e-06, "loss": 0.5453, "step": 400 }, { "epoch": 1.0512820512820513, "grad_norm": 0.6558266282081604, "learning_rate": 5e-06, "loss": 0.5475, "step": 410 }, { "epoch": 1.0769230769230769, "grad_norm": 0.7556455135345459, "learning_rate": 5e-06, "loss": 0.5465, "step": 420 }, { "epoch": 1.1025641025641026, "grad_norm": 0.586370050907135, "learning_rate": 5e-06, "loss": 0.5395, "step": 430 }, { "epoch": 1.1282051282051282, "grad_norm": 0.5581791996955872, "learning_rate": 5e-06, "loss": 0.5366, "step": 440 }, { "epoch": 1.1538461538461537, "grad_norm": 0.5495139956474304, "learning_rate": 5e-06, "loss": 0.5487, "step": 450 }, { "epoch": 1.1794871794871795, "grad_norm": 0.6099910736083984, "learning_rate": 5e-06, "loss": 0.5397, "step": 460 }, { "epoch": 1.205128205128205, "grad_norm": 0.5565390586853027, "learning_rate": 5e-06, "loss": 0.5376, "step": 470 }, { "epoch": 1.2307692307692308, "grad_norm": 0.6461251378059387, "learning_rate": 5e-06, "loss": 0.534, "step": 480 }, { "epoch": 1.2564102564102564, "grad_norm": 0.6213774681091309, "learning_rate": 5e-06, "loss": 0.5366, "step": 490 }, { "epoch": 1.282051282051282, "grad_norm": 0.5515546798706055, "learning_rate": 5e-06, "loss": 0.5381, "step": 500 }, { "epoch": 1.3076923076923077, "grad_norm": 0.6342268586158752, "learning_rate": 5e-06, "loss": 0.536, "step": 510 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5595047473907471, "learning_rate": 5e-06, "loss": 0.5396, "step": 520 }, { "epoch": 1.358974358974359, "grad_norm": 0.5788885951042175, "learning_rate": 5e-06, "loss": 0.5371, "step": 530 }, { "epoch": 1.3846153846153846, "grad_norm": 0.5231652855873108, "learning_rate": 5e-06, "loss": 0.5357, "step": 540 }, { "epoch": 1.4102564102564101, "grad_norm": 0.7670683860778809, "learning_rate": 5e-06, "loss": 0.5376, "step": 550 }, { "epoch": 1.435897435897436, "grad_norm": 0.5181464552879333, "learning_rate": 5e-06, "loss": 0.5321, "step": 560 }, { "epoch": 1.4615384615384617, "grad_norm": 0.5286003351211548, "learning_rate": 5e-06, "loss": 0.5361, "step": 570 }, { "epoch": 1.4871794871794872, "grad_norm": 0.5364812016487122, "learning_rate": 5e-06, "loss": 0.5391, "step": 580 }, { "epoch": 1.5128205128205128, "grad_norm": 0.6188532710075378, "learning_rate": 5e-06, "loss": 0.5336, "step": 590 }, { "epoch": 1.5384615384615383, "grad_norm": 0.4892689883708954, "learning_rate": 5e-06, "loss": 0.5352, "step": 600 }, { "epoch": 1.564102564102564, "grad_norm": 0.5404101014137268, "learning_rate": 5e-06, "loss": 0.5321, "step": 610 }, { "epoch": 1.5897435897435899, "grad_norm": 0.5570105314254761, "learning_rate": 5e-06, "loss": 0.5341, "step": 620 }, { "epoch": 1.6153846153846154, "grad_norm": 0.6178380846977234, "learning_rate": 5e-06, "loss": 0.5321, "step": 630 }, { "epoch": 1.641025641025641, "grad_norm": 0.8673184514045715, "learning_rate": 5e-06, "loss": 0.5216, "step": 640 }, { "epoch": 1.6666666666666665, "grad_norm": 0.6193057298660278, "learning_rate": 5e-06, "loss": 0.5312, "step": 650 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5737960934638977, "learning_rate": 5e-06, "loss": 0.5317, "step": 660 }, { "epoch": 1.717948717948718, "grad_norm": 0.6953139305114746, "learning_rate": 5e-06, "loss": 0.5297, "step": 670 }, { "epoch": 1.7435897435897436, "grad_norm": 0.5698896050453186, "learning_rate": 5e-06, "loss": 0.5248, "step": 680 }, { "epoch": 1.7692307692307692, "grad_norm": 0.4578630328178406, "learning_rate": 5e-06, "loss": 0.5251, "step": 690 }, { "epoch": 1.7948717948717947, "grad_norm": 0.4654785692691803, "learning_rate": 5e-06, "loss": 0.5308, "step": 700 }, { "epoch": 1.8205128205128205, "grad_norm": 0.500807523727417, "learning_rate": 5e-06, "loss": 0.5259, "step": 710 }, { "epoch": 1.8461538461538463, "grad_norm": 0.5273367762565613, "learning_rate": 5e-06, "loss": 0.5286, "step": 720 }, { "epoch": 1.8717948717948718, "grad_norm": 0.49950945377349854, "learning_rate": 5e-06, "loss": 0.5292, "step": 730 }, { "epoch": 1.8974358974358974, "grad_norm": 0.46231916546821594, "learning_rate": 5e-06, "loss": 0.5243, "step": 740 }, { "epoch": 1.9230769230769231, "grad_norm": 0.5422210693359375, "learning_rate": 5e-06, "loss": 0.5241, "step": 750 }, { "epoch": 1.9487179487179487, "grad_norm": 0.569016695022583, "learning_rate": 5e-06, "loss": 0.5237, "step": 760 }, { "epoch": 1.9743589743589745, "grad_norm": 0.5401661396026611, "learning_rate": 5e-06, "loss": 0.5289, "step": 770 }, { "epoch": 2.0, "grad_norm": 0.6024136543273926, "learning_rate": 5e-06, "loss": 0.5262, "step": 780 }, { "epoch": 2.0, "eval_loss": 0.5533561706542969, "eval_runtime": 36.5672, "eval_samples_per_second": 287.252, "eval_steps_per_second": 1.149, "step": 780 }, { "epoch": 2.0256410256410255, "grad_norm": 0.5916836857795715, "learning_rate": 5e-06, "loss": 0.4847, "step": 790 }, { "epoch": 2.051282051282051, "grad_norm": 0.6934983730316162, "learning_rate": 5e-06, "loss": 0.4906, "step": 800 }, { "epoch": 2.076923076923077, "grad_norm": 0.546357274055481, "learning_rate": 5e-06, "loss": 0.4857, "step": 810 }, { "epoch": 2.1025641025641026, "grad_norm": 0.7491461634635925, "learning_rate": 5e-06, "loss": 0.4861, "step": 820 }, { "epoch": 2.128205128205128, "grad_norm": 0.5870935320854187, "learning_rate": 5e-06, "loss": 0.4847, "step": 830 }, { "epoch": 2.1538461538461537, "grad_norm": 0.6286914348602295, "learning_rate": 5e-06, "loss": 0.4895, "step": 840 }, { "epoch": 2.1794871794871793, "grad_norm": 0.515387773513794, "learning_rate": 5e-06, "loss": 0.4897, "step": 850 }, { "epoch": 2.2051282051282053, "grad_norm": 0.5456680655479431, "learning_rate": 5e-06, "loss": 0.4861, "step": 860 }, { "epoch": 2.230769230769231, "grad_norm": 0.49457597732543945, "learning_rate": 5e-06, "loss": 0.484, "step": 870 }, { "epoch": 2.2564102564102564, "grad_norm": 0.5255215167999268, "learning_rate": 5e-06, "loss": 0.4936, "step": 880 }, { "epoch": 2.282051282051282, "grad_norm": 0.5122373700141907, "learning_rate": 5e-06, "loss": 0.4915, "step": 890 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5085675120353699, "learning_rate": 5e-06, "loss": 0.4894, "step": 900 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5925104022026062, "learning_rate": 5e-06, "loss": 0.4873, "step": 910 }, { "epoch": 2.358974358974359, "grad_norm": 0.519869327545166, "learning_rate": 5e-06, "loss": 0.4882, "step": 920 }, { "epoch": 2.3846153846153846, "grad_norm": 0.550400972366333, "learning_rate": 5e-06, "loss": 0.486, "step": 930 }, { "epoch": 2.41025641025641, "grad_norm": 0.6560305953025818, "learning_rate": 5e-06, "loss": 0.4963, "step": 940 }, { "epoch": 2.435897435897436, "grad_norm": 0.5110371112823486, "learning_rate": 5e-06, "loss": 0.4855, "step": 950 }, { "epoch": 2.4615384615384617, "grad_norm": 0.48929715156555176, "learning_rate": 5e-06, "loss": 0.4859, "step": 960 }, { "epoch": 2.4871794871794872, "grad_norm": 0.5958628058433533, "learning_rate": 5e-06, "loss": 0.4884, "step": 970 }, { "epoch": 2.5128205128205128, "grad_norm": 0.604121208190918, "learning_rate": 5e-06, "loss": 0.4858, "step": 980 }, { "epoch": 2.5384615384615383, "grad_norm": 0.5479825735092163, "learning_rate": 5e-06, "loss": 0.4864, "step": 990 }, { "epoch": 2.564102564102564, "grad_norm": 0.5345059633255005, "learning_rate": 5e-06, "loss": 0.4859, "step": 1000 }, { "epoch": 2.58974358974359, "grad_norm": 0.519960880279541, "learning_rate": 5e-06, "loss": 0.4867, "step": 1010 }, { "epoch": 2.6153846153846154, "grad_norm": 0.5771586298942566, "learning_rate": 5e-06, "loss": 0.4921, "step": 1020 }, { "epoch": 2.641025641025641, "grad_norm": 0.5097467303276062, "learning_rate": 5e-06, "loss": 0.4898, "step": 1030 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5330426096916199, "learning_rate": 5e-06, "loss": 0.487, "step": 1040 }, { "epoch": 2.6923076923076925, "grad_norm": 0.6157095432281494, "learning_rate": 5e-06, "loss": 0.4896, "step": 1050 }, { "epoch": 2.717948717948718, "grad_norm": 0.5218502283096313, "learning_rate": 5e-06, "loss": 0.4883, "step": 1060 }, { "epoch": 2.7435897435897436, "grad_norm": 0.620424747467041, "learning_rate": 5e-06, "loss": 0.4895, "step": 1070 }, { "epoch": 2.769230769230769, "grad_norm": 0.5703057646751404, "learning_rate": 5e-06, "loss": 0.4854, "step": 1080 }, { "epoch": 2.7948717948717947, "grad_norm": 0.49105432629585266, "learning_rate": 5e-06, "loss": 0.4929, "step": 1090 }, { "epoch": 2.8205128205128203, "grad_norm": 0.5956032872200012, "learning_rate": 5e-06, "loss": 0.4847, "step": 1100 }, { "epoch": 2.8461538461538463, "grad_norm": 0.6310620307922363, "learning_rate": 5e-06, "loss": 0.4824, "step": 1110 }, { "epoch": 2.871794871794872, "grad_norm": 0.5469298958778381, "learning_rate": 5e-06, "loss": 0.4843, "step": 1120 }, { "epoch": 2.8974358974358974, "grad_norm": 0.6324126720428467, "learning_rate": 5e-06, "loss": 0.4822, "step": 1130 }, { "epoch": 2.9230769230769234, "grad_norm": 0.5963416695594788, "learning_rate": 5e-06, "loss": 0.4856, "step": 1140 }, { "epoch": 2.948717948717949, "grad_norm": 0.5227168202400208, "learning_rate": 5e-06, "loss": 0.4888, "step": 1150 }, { "epoch": 2.9743589743589745, "grad_norm": 0.4714514911174774, "learning_rate": 5e-06, "loss": 0.4913, "step": 1160 }, { "epoch": 3.0, "grad_norm": 0.5580222010612488, "learning_rate": 5e-06, "loss": 0.4851, "step": 1170 }, { "epoch": 3.0, "eval_loss": 0.5471954345703125, "eval_runtime": 35.7738, "eval_samples_per_second": 293.622, "eval_steps_per_second": 1.174, "step": 1170 }, { "epoch": 3.0, "step": 1170, "total_flos": 5.524377893797154e+19, "train_loss": 0.5494362928928473, "train_runtime": 8709.0296, "train_samples_per_second": 68.748, "train_steps_per_second": 0.134 } ], "logging_steps": 10, "max_steps": 1170, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.524377893797154e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }