{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.048128, "eval_steps": 100, "global_step": 1024, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01024, "grad_norm": 269.9034729003906, "learning_rate": 0.0001999995200527669, "loss": 624.6766, "step": 10 }, { "epoch": 0.02048, "grad_norm": 73.23262786865234, "learning_rate": 0.000199941931959037, "loss": 544.9223, "step": 20 }, { "epoch": 0.03072, "grad_norm": 95.47492218017578, "learning_rate": 0.00019978841775475367, "loss": 511.6501, "step": 30 }, { "epoch": 0.04096, "grad_norm": 145.1549530029297, "learning_rate": 0.00019953912478568305, "loss": 497.5438, "step": 40 }, { "epoch": 0.0512, "grad_norm": 150.53515625, "learning_rate": 0.00019919429232781712, "loss": 497.208, "step": 50 }, { "epoch": 0.06144, "grad_norm": 151.51736450195312, "learning_rate": 0.0001987542513577122, "loss": 496.9036, "step": 60 }, { "epoch": 0.07168, "grad_norm": 159.39337158203125, "learning_rate": 0.0001982771584048096, "loss": 489.2719, "step": 70 }, { "epoch": 0.08192, "grad_norm": 148.92176818847656, "learning_rate": 0.00019765746006440455, "loss": 482.9451, "step": 80 }, { "epoch": 0.09216, "grad_norm": 139.00486755371094, "learning_rate": 0.0001970195706599109, "loss": 476.9848, "step": 90 }, { "epoch": 0.1024, "grad_norm": 157.25439453125, "learning_rate": 0.00019622236172137374, "loss": 471.4595, "step": 100 }, { "epoch": 0.1024, "eval_loss": 7.320011615753174, "eval_runtime": 3.4958, "eval_samples_per_second": 143.029, "eval_steps_per_second": 9.154, "step": 100 }, { "epoch": 0.11264, "grad_norm": 141.9667510986328, "learning_rate": 0.0001953327967844356, "loss": 468.9138, "step": 110 }, { "epoch": 0.12288, "grad_norm": 99.36428833007812, "learning_rate": 0.0001943517296699384, "loss": 468.5555, "step": 120 }, { "epoch": 0.13312, "grad_norm": 98.90169525146484, "learning_rate": 0.00019328010202420258, "loss": 463.3139, "step": 130 }, { "epoch": 0.14336, "grad_norm": 77.31971740722656, "learning_rate": 0.00019211894241521758, "loss": 458.0901, "step": 140 }, { "epoch": 0.1536, "grad_norm": 134.48403930664062, "learning_rate": 0.0001908693653454033, "loss": 454.8131, "step": 150 }, { "epoch": 0.16384, "grad_norm": 95.27815246582031, "learning_rate": 0.00018953257018189024, "loss": 454.0167, "step": 160 }, { "epoch": 0.17408, "grad_norm": 87.9055404663086, "learning_rate": 0.00018810984000534458, "loss": 449.3531, "step": 170 }, { "epoch": 0.18432, "grad_norm": 116.1905288696289, "learning_rate": 0.00018660254037844388, "loss": 447.6146, "step": 180 }, { "epoch": 0.19456, "grad_norm": 81.48722076416016, "learning_rate": 0.00018501211803518468, "loss": 450.6066, "step": 190 }, { "epoch": 0.2048, "grad_norm": 75.57894897460938, "learning_rate": 0.00018334009949228061, "loss": 448.9498, "step": 200 }, { "epoch": 0.2048, "eval_loss": 6.992556571960449, "eval_runtime": 3.4911, "eval_samples_per_second": 143.219, "eval_steps_per_second": 9.166, "step": 200 }, { "epoch": 0.21504, "grad_norm": 160.8426513671875, "learning_rate": 0.00018158808958398338, "loss": 449.3857, "step": 210 }, { "epoch": 0.22528, "grad_norm": 86.6707992553711, "learning_rate": 0.00017975776992173344, "loss": 449.0133, "step": 220 }, { "epoch": 0.23552, "grad_norm": 65.60407257080078, "learning_rate": 0.00017785089728011798, "loss": 446.8142, "step": 230 }, { "epoch": 0.24576, "grad_norm": 125.04796600341797, "learning_rate": 0.00017586930191068655, "loss": 446.0437, "step": 240 }, { "epoch": 0.256, "grad_norm": 89.98313903808594, "learning_rate": 0.00017381488578524173, "loss": 445.3744, "step": 250 }, { "epoch": 0.26624, "grad_norm": 269.5317077636719, "learning_rate": 0.00017168962077029147, "loss": 446.719, "step": 260 }, { "epoch": 0.27648, "grad_norm": 73.03639221191406, "learning_rate": 0.00016949554673441534, "loss": 448.0971, "step": 270 }, { "epoch": 0.28672, "grad_norm": 122.57962799072266, "learning_rate": 0.00016723476959036083, "loss": 448.991, "step": 280 }, { "epoch": 0.29696, "grad_norm": 81.79669189453125, "learning_rate": 0.0001649094592737497, "loss": 444.1866, "step": 290 }, { "epoch": 0.3072, "grad_norm": 74.33326721191406, "learning_rate": 0.00016252184766033342, "loss": 436.623, "step": 300 }, { "epoch": 0.3072, "eval_loss": 6.777511119842529, "eval_runtime": 3.4227, "eval_samples_per_second": 146.083, "eval_steps_per_second": 9.349, "step": 300 }, { "epoch": 0.31744, "grad_norm": 113.46410369873047, "learning_rate": 0.0001600742264237979, "loss": 435.7422, "step": 310 }, { "epoch": 0.32768, "grad_norm": 99.42645263671875, "learning_rate": 0.00015756894483617267, "loss": 439.4858, "step": 320 }, { "epoch": 0.33792, "grad_norm": 328.0025634765625, "learning_rate": 0.0001550084075129563, "loss": 447.5792, "step": 330 }, { "epoch": 0.34816, "grad_norm": 82.54906463623047, "learning_rate": 0.00015239507210512194, "loss": 446.5024, "step": 340 }, { "epoch": 0.3584, "grad_norm": 62.32942581176758, "learning_rate": 0.00014973144694021876, "loss": 437.9146, "step": 350 }, { "epoch": 0.36864, "grad_norm": 64.61022186279297, "learning_rate": 0.00014702008861483266, "loss": 430.4142, "step": 360 }, { "epoch": 0.37888, "grad_norm": 133.777587890625, "learning_rate": 0.00014426359954071796, "loss": 428.6971, "step": 370 }, { "epoch": 0.38912, "grad_norm": 218.512939453125, "learning_rate": 0.00014146462544695426, "loss": 435.1475, "step": 380 }, { "epoch": 0.39936, "grad_norm": 125.56941986083984, "learning_rate": 0.00013862585284052714, "loss": 445.5835, "step": 390 }, { "epoch": 0.4096, "grad_norm": 109.06041717529297, "learning_rate": 0.00013575000642776893, "loss": 446.3095, "step": 400 }, { "epoch": 0.4096, "eval_loss": 6.905622959136963, "eval_runtime": 3.4269, "eval_samples_per_second": 145.903, "eval_steps_per_second": 9.338, "step": 400 }, { "epoch": 0.41984, "grad_norm": 69.12989044189453, "learning_rate": 0.0001328398464991355, "loss": 438.9709, "step": 410 }, { "epoch": 0.43008, "grad_norm": 68.11474609375, "learning_rate": 0.00012989816627982848, "loss": 432.2964, "step": 420 }, { "epoch": 0.44032, "grad_norm": 65.17674255371094, "learning_rate": 0.00012692778924880603, "loss": 428.2125, "step": 430 }, { "epoch": 0.45056, "grad_norm": 114.95523834228516, "learning_rate": 0.0001239315664287558, "loss": 426.8882, "step": 440 }, { "epoch": 0.4608, "grad_norm": 185.0157470703125, "learning_rate": 0.00012091237364963071, "loss": 435.8043, "step": 450 }, { "epoch": 0.47104, "grad_norm": 92.59754180908203, "learning_rate": 0.00011787310878837422, "loss": 440.9751, "step": 460 }, { "epoch": 0.48128, "grad_norm": 75.24162292480469, "learning_rate": 0.00011481668898748475, "loss": 439.3276, "step": 470 }, { "epoch": 0.49152, "grad_norm": 55.42325210571289, "learning_rate": 0.00011174604785508813, "loss": 432.4603, "step": 480 }, { "epoch": 0.50176, "grad_norm": 62.27671813964844, "learning_rate": 0.00010866413264920678, "loss": 427.5299, "step": 490 }, { "epoch": 0.512, "grad_norm": 65.43367767333984, "learning_rate": 0.00010557390144892684, "loss": 425.4595, "step": 500 }, { "epoch": 0.512, "eval_loss": 6.613161087036133, "eval_runtime": 3.4182, "eval_samples_per_second": 146.277, "eval_steps_per_second": 9.362, "step": 500 }, { "epoch": 0.52224, "grad_norm": 175.58470153808594, "learning_rate": 0.0001024783203151793, "loss": 425.5378, "step": 510 }, { "epoch": 0.53248, "grad_norm": 199.92291259765625, "learning_rate": 9.938036044386005e-05, "loss": 431.3893, "step": 520 }, { "epoch": 0.54272, "grad_norm": 212.65650939941406, "learning_rate": 9.628299531402117e-05, "loss": 443.9659, "step": 530 }, { "epoch": 0.55296, "grad_norm": 93.11270141601562, "learning_rate": 9.318919783387094e-05, "loss": 443.3476, "step": 540 }, { "epoch": 0.5632, "grad_norm": 93.02433013916016, "learning_rate": 9.010193748732155e-05, "loss": 438.1048, "step": 550 }, { "epoch": 0.57344, "grad_norm": 72.0661849975586, "learning_rate": 8.702417748382385e-05, "loss": 431.1463, "step": 560 }, { "epoch": 0.58368, "grad_norm": 67.0578842163086, "learning_rate": 8.395887191422397e-05, "loss": 427.2931, "step": 570 }, { "epoch": 0.59392, "grad_norm": 85.532958984375, "learning_rate": 8.090896291537273e-05, "loss": 424.9293, "step": 580 }, { "epoch": 0.60416, "grad_norm": 72.48572540283203, "learning_rate": 7.787737784620803e-05, "loss": 424.9051, "step": 590 }, { "epoch": 0.6144, "grad_norm": 237.96592712402344, "learning_rate": 7.486702647802213e-05, "loss": 425.6438, "step": 600 }, { "epoch": 0.6144, "eval_loss": 6.683709621429443, "eval_runtime": 3.436, "eval_samples_per_second": 145.519, "eval_steps_per_second": 9.313, "step": 600 }, { "epoch": 0.62464, "grad_norm": 178.17239379882812, "learning_rate": 7.188079820160904e-05, "loss": 432.3896, "step": 610 }, { "epoch": 0.63488, "grad_norm": 84.38874053955078, "learning_rate": 6.892155925397436e-05, "loss": 434.9848, "step": 620 }, { "epoch": 0.64512, "grad_norm": 66.67383575439453, "learning_rate": 6.59921499672677e-05, "loss": 433.8923, "step": 630 }, { "epoch": 0.65536, "grad_norm": 74.11187744140625, "learning_rate": 6.309538204257977e-05, "loss": 430.2817, "step": 640 }, { "epoch": 0.6656, "grad_norm": 95.32003784179688, "learning_rate": 6.02340358512196e-05, "loss": 427.1533, "step": 650 }, { "epoch": 0.67584, "grad_norm": 71.91348266601562, "learning_rate": 5.7410857766062966e-05, "loss": 425.3034, "step": 660 }, { "epoch": 0.68608, "grad_norm": 95.72642517089844, "learning_rate": 5.4628557525532976e-05, "loss": 425.3343, "step": 670 }, { "epoch": 0.69632, "grad_norm": 161.08612060546875, "learning_rate": 5.188980563274315e-05, "loss": 426.5362, "step": 680 }, { "epoch": 0.70656, "grad_norm": 130.4775848388672, "learning_rate": 4.9197230792299195e-05, "loss": 431.4921, "step": 690 }, { "epoch": 0.7168, "grad_norm": 102.47798919677734, "learning_rate": 4.6553417387219886e-05, "loss": 432.9831, "step": 700 }, { "epoch": 0.7168, "eval_loss": 6.725553512573242, "eval_runtime": 3.4338, "eval_samples_per_second": 145.61, "eval_steps_per_second": 9.319, "step": 700 }, { "epoch": 0.72704, "grad_norm": 73.72420501708984, "learning_rate": 4.421777466693434e-05, "loss": 431.4859, "step": 710 }, { "epoch": 0.73728, "grad_norm": 83.63558197021484, "learning_rate": 4.167355837898584e-05, "loss": 428.698, "step": 720 }, { "epoch": 0.74752, "grad_norm": 69.27027893066406, "learning_rate": 3.918532488602094e-05, "loss": 428.0623, "step": 730 }, { "epoch": 0.75776, "grad_norm": 107.68405151367188, "learning_rate": 3.675546244046228e-05, "loss": 425.6424, "step": 740 }, { "epoch": 0.768, "grad_norm": 96.42312622070312, "learning_rate": 3.438630326912414e-05, "loss": 425.8188, "step": 750 }, { "epoch": 0.77824, "grad_norm": 116.1615982055664, "learning_rate": 3.208012133469799e-05, "loss": 425.9528, "step": 760 }, { "epoch": 0.78848, "grad_norm": 91.75414276123047, "learning_rate": 2.9839130153161154e-05, "loss": 426.8583, "step": 770 }, { "epoch": 0.79872, "grad_norm": 79.31800079345703, "learning_rate": 2.766548066920338e-05, "loss": 425.4576, "step": 780 }, { "epoch": 0.80896, "grad_norm": 108.06861114501953, "learning_rate": 2.5561259191710407e-05, "loss": 425.0249, "step": 790 }, { "epoch": 0.8192, "grad_norm": 86.25403594970703, "learning_rate": 2.3528485391286147e-05, "loss": 426.0778, "step": 800 }, { "epoch": 0.8192, "eval_loss": 6.630011558532715, "eval_runtime": 3.4108, "eval_samples_per_second": 146.591, "eval_steps_per_second": 9.382, "step": 800 }, { "epoch": 0.82944, "grad_norm": 80.1161117553711, "learning_rate": 2.1569110361735677e-05, "loss": 426.9529, "step": 810 }, { "epoch": 0.83968, "grad_norm": 92.57079315185547, "learning_rate": 2e-05, "loss": 425.7674, "step": 820 }, { "epoch": 0.84992, "grad_norm": 76.28582000732422, "learning_rate": 2e-05, "loss": 425.3365, "step": 830 }, { "epoch": 0.86016, "grad_norm": 111.58943176269531, "learning_rate": 2e-05, "loss": 424.6131, "step": 840 }, { "epoch": 0.8704, "grad_norm": 158.44044494628906, "learning_rate": 2e-05, "loss": 425.0354, "step": 850 }, { "epoch": 0.88064, "grad_norm": 101.99372100830078, "learning_rate": 2e-05, "loss": 424.9413, "step": 860 }, { "epoch": 0.89088, "grad_norm": 140.2552490234375, "learning_rate": 2e-05, "loss": 426.5773, "step": 870 }, { "epoch": 0.90112, "grad_norm": 117.06301879882812, "learning_rate": 2e-05, "loss": 427.2063, "step": 880 }, { "epoch": 0.91136, "grad_norm": 147.27670288085938, "learning_rate": 2e-05, "loss": 427.2577, "step": 890 }, { "epoch": 0.9216, "grad_norm": 109.34888458251953, "learning_rate": 2e-05, "loss": 428.4192, "step": 900 }, { "epoch": 0.9216, "eval_loss": 6.663826942443848, "eval_runtime": 3.429, "eval_samples_per_second": 145.817, "eval_steps_per_second": 9.332, "step": 900 }, { "epoch": 0.93184, "grad_norm": 145.62522888183594, "learning_rate": 2e-05, "loss": 428.9891, "step": 910 }, { "epoch": 0.94208, "grad_norm": 90.70750427246094, "learning_rate": 2e-05, "loss": 429.0984, "step": 920 }, { "epoch": 0.95232, "grad_norm": 92.83578491210938, "learning_rate": 2e-05, "loss": 429.0002, "step": 930 }, { "epoch": 0.96256, "grad_norm": 125.1180648803711, "learning_rate": 2e-05, "loss": 428.4883, "step": 940 }, { "epoch": 0.9728, "grad_norm": 132.3828125, "learning_rate": 2e-05, "loss": 428.6993, "step": 950 }, { "epoch": 0.98304, "grad_norm": 100.2248306274414, "learning_rate": 2e-05, "loss": 429.164, "step": 960 }, { "epoch": 0.99328, "grad_norm": 112.38407897949219, "learning_rate": 2e-05, "loss": 430.1383, "step": 970 }, { "epoch": 1.003072, "grad_norm": 104.0753173828125, "learning_rate": 2e-05, "loss": 410.847, "step": 980 }, { "epoch": 1.013312, "grad_norm": 137.40553283691406, "learning_rate": 2e-05, "loss": 431.1839, "step": 990 }, { "epoch": 1.023552, "grad_norm": 191.53709411621094, "learning_rate": 2e-05, "loss": 431.9959, "step": 1000 }, { "epoch": 1.023552, "eval_loss": 6.718299865722656, "eval_runtime": 3.4132, "eval_samples_per_second": 146.49, "eval_steps_per_second": 9.375, "step": 1000 }, { "epoch": 1.033792, "grad_norm": 188.12156677246094, "learning_rate": 2e-05, "loss": 432.0317, "step": 1010 }, { "epoch": 1.044032, "grad_norm": 119.64492797851562, "learning_rate": 2e-05, "loss": 432.8214, "step": 1020 } ], "logging_steps": 10, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.134583528711782e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }