{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9943289224952743, "eval_steps": 500, "global_step": 1188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02520478890989288, "grad_norm": 9.633888402845015, "learning_rate": 5e-06, "loss": 0.9444, "step": 10 }, { "epoch": 0.05040957781978576, "grad_norm": 1.6254228664238897, "learning_rate": 5e-06, "loss": 0.8095, "step": 20 }, { "epoch": 0.07561436672967864, "grad_norm": 0.8936028314003451, "learning_rate": 5e-06, "loss": 0.7596, "step": 30 }, { "epoch": 0.10081915563957151, "grad_norm": 0.9708205238462188, "learning_rate": 5e-06, "loss": 0.7334, "step": 40 }, { "epoch": 0.1260239445494644, "grad_norm": 0.8882056031134592, "learning_rate": 5e-06, "loss": 0.7166, "step": 50 }, { "epoch": 0.15122873345935728, "grad_norm": 0.8542020955227222, "learning_rate": 5e-06, "loss": 0.7004, "step": 60 }, { "epoch": 0.17643352236925017, "grad_norm": 0.719450185402131, "learning_rate": 5e-06, "loss": 0.6877, "step": 70 }, { "epoch": 0.20163831127914303, "grad_norm": 0.6838086991132964, "learning_rate": 5e-06, "loss": 0.676, "step": 80 }, { "epoch": 0.22684310018903592, "grad_norm": 0.6199036833773371, "learning_rate": 5e-06, "loss": 0.6754, "step": 90 }, { "epoch": 0.2520478890989288, "grad_norm": 0.5787703367375647, "learning_rate": 5e-06, "loss": 0.6562, "step": 100 }, { "epoch": 0.2772526780088217, "grad_norm": 0.633249705700733, "learning_rate": 5e-06, "loss": 0.665, "step": 110 }, { "epoch": 0.30245746691871456, "grad_norm": 0.6304627866586173, "learning_rate": 5e-06, "loss": 0.6634, "step": 120 }, { "epoch": 0.3276622558286074, "grad_norm": 0.5506728721157861, "learning_rate": 5e-06, "loss": 0.6549, "step": 130 }, { "epoch": 0.35286704473850034, "grad_norm": 0.5151002955744762, "learning_rate": 5e-06, "loss": 0.6522, "step": 140 }, { "epoch": 0.3780718336483932, "grad_norm": 0.5992082289679086, "learning_rate": 5e-06, "loss": 0.6528, "step": 150 }, { "epoch": 0.40327662255828606, "grad_norm": 0.4663777253829188, "learning_rate": 5e-06, "loss": 0.642, "step": 160 }, { "epoch": 0.428481411468179, "grad_norm": 0.5785034175448459, "learning_rate": 5e-06, "loss": 0.6469, "step": 170 }, { "epoch": 0.45368620037807184, "grad_norm": 0.6585860693219876, "learning_rate": 5e-06, "loss": 0.646, "step": 180 }, { "epoch": 0.4788909892879647, "grad_norm": 0.6722487645620258, "learning_rate": 5e-06, "loss": 0.6403, "step": 190 }, { "epoch": 0.5040957781978576, "grad_norm": 0.5254154176523609, "learning_rate": 5e-06, "loss": 0.6387, "step": 200 }, { "epoch": 0.5293005671077504, "grad_norm": 0.5128942747923588, "learning_rate": 5e-06, "loss": 0.6379, "step": 210 }, { "epoch": 0.5545053560176434, "grad_norm": 0.5642956436475494, "learning_rate": 5e-06, "loss": 0.6393, "step": 220 }, { "epoch": 0.5797101449275363, "grad_norm": 0.6837039476741191, "learning_rate": 5e-06, "loss": 0.6359, "step": 230 }, { "epoch": 0.6049149338374291, "grad_norm": 0.6667359902635842, "learning_rate": 5e-06, "loss": 0.6351, "step": 240 }, { "epoch": 0.630119722747322, "grad_norm": 0.6627848973706361, "learning_rate": 5e-06, "loss": 0.6343, "step": 250 }, { "epoch": 0.6553245116572148, "grad_norm": 0.5241036221432155, "learning_rate": 5e-06, "loss": 0.6279, "step": 260 }, { "epoch": 0.6805293005671077, "grad_norm": 0.576211583526909, "learning_rate": 5e-06, "loss": 0.6258, "step": 270 }, { "epoch": 0.7057340894770007, "grad_norm": 0.5136516604177849, "learning_rate": 5e-06, "loss": 0.6322, "step": 280 }, { "epoch": 0.7309388783868935, "grad_norm": 0.5270426246043673, "learning_rate": 5e-06, "loss": 0.6243, "step": 290 }, { "epoch": 0.7561436672967864, "grad_norm": 0.5453025715363531, "learning_rate": 5e-06, "loss": 0.6224, "step": 300 }, { "epoch": 0.7813484562066793, "grad_norm": 0.5116409906154713, "learning_rate": 5e-06, "loss": 0.6155, "step": 310 }, { "epoch": 0.8065532451165721, "grad_norm": 0.5849562966179552, "learning_rate": 5e-06, "loss": 0.6212, "step": 320 }, { "epoch": 0.831758034026465, "grad_norm": 0.6135737091827808, "learning_rate": 5e-06, "loss": 0.6258, "step": 330 }, { "epoch": 0.856962822936358, "grad_norm": 0.5181177544801595, "learning_rate": 5e-06, "loss": 0.6238, "step": 340 }, { "epoch": 0.8821676118462508, "grad_norm": 0.627205460376999, "learning_rate": 5e-06, "loss": 0.6167, "step": 350 }, { "epoch": 0.9073724007561437, "grad_norm": 0.5261678739160346, "learning_rate": 5e-06, "loss": 0.6117, "step": 360 }, { "epoch": 0.9325771896660365, "grad_norm": 0.6244352371075516, "learning_rate": 5e-06, "loss": 0.6153, "step": 370 }, { "epoch": 0.9577819785759294, "grad_norm": 0.48330689120915904, "learning_rate": 5e-06, "loss": 0.6157, "step": 380 }, { "epoch": 0.9829867674858223, "grad_norm": 0.5423735222942331, "learning_rate": 5e-06, "loss": 0.6132, "step": 390 }, { "epoch": 0.998109640831758, "eval_loss": 0.6147013306617737, "eval_runtime": 212.3997, "eval_samples_per_second": 50.334, "eval_steps_per_second": 0.395, "step": 396 }, { "epoch": 1.0081915563957151, "grad_norm": 0.5070359483563761, "learning_rate": 5e-06, "loss": 0.6024, "step": 400 }, { "epoch": 1.033396345305608, "grad_norm": 0.5765247476192352, "learning_rate": 5e-06, "loss": 0.5805, "step": 410 }, { "epoch": 1.0586011342155008, "grad_norm": 0.6110502474250971, "learning_rate": 5e-06, "loss": 0.5715, "step": 420 }, { "epoch": 1.0838059231253938, "grad_norm": 0.7545087966794868, "learning_rate": 5e-06, "loss": 0.5797, "step": 430 }, { "epoch": 1.1090107120352868, "grad_norm": 0.6365159456598485, "learning_rate": 5e-06, "loss": 0.5751, "step": 440 }, { "epoch": 1.1342155009451795, "grad_norm": 0.6056198671024684, "learning_rate": 5e-06, "loss": 0.5742, "step": 450 }, { "epoch": 1.1594202898550725, "grad_norm": 0.5291507949595291, "learning_rate": 5e-06, "loss": 0.5761, "step": 460 }, { "epoch": 1.1846250787649653, "grad_norm": 0.6153628132427752, "learning_rate": 5e-06, "loss": 0.5779, "step": 470 }, { "epoch": 1.2098298676748582, "grad_norm": 0.5060871095129219, "learning_rate": 5e-06, "loss": 0.574, "step": 480 }, { "epoch": 1.2350346565847512, "grad_norm": 0.6456813781855114, "learning_rate": 5e-06, "loss": 0.5759, "step": 490 }, { "epoch": 1.260239445494644, "grad_norm": 0.5052335738280048, "learning_rate": 5e-06, "loss": 0.5756, "step": 500 }, { "epoch": 1.285444234404537, "grad_norm": 0.5653893528313595, "learning_rate": 5e-06, "loss": 0.5754, "step": 510 }, { "epoch": 1.3106490233144297, "grad_norm": 0.5278634789290781, "learning_rate": 5e-06, "loss": 0.5733, "step": 520 }, { "epoch": 1.3358538122243226, "grad_norm": 0.5245168937373562, "learning_rate": 5e-06, "loss": 0.5721, "step": 530 }, { "epoch": 1.3610586011342156, "grad_norm": 0.5215086066445794, "learning_rate": 5e-06, "loss": 0.5733, "step": 540 }, { "epoch": 1.3862633900441084, "grad_norm": 0.5370456877329636, "learning_rate": 5e-06, "loss": 0.571, "step": 550 }, { "epoch": 1.4114681789540013, "grad_norm": 0.5521665579415627, "learning_rate": 5e-06, "loss": 0.5698, "step": 560 }, { "epoch": 1.436672967863894, "grad_norm": 0.7002009297394419, "learning_rate": 5e-06, "loss": 0.5672, "step": 570 }, { "epoch": 1.461877756773787, "grad_norm": 0.5738916744555749, "learning_rate": 5e-06, "loss": 0.5646, "step": 580 }, { "epoch": 1.48708254568368, "grad_norm": 0.47370527901117226, "learning_rate": 5e-06, "loss": 0.5717, "step": 590 }, { "epoch": 1.5122873345935728, "grad_norm": 0.554049330965348, "learning_rate": 5e-06, "loss": 0.567, "step": 600 }, { "epoch": 1.5374921235034655, "grad_norm": 0.5960178872278222, "learning_rate": 5e-06, "loss": 0.5667, "step": 610 }, { "epoch": 1.5626969124133585, "grad_norm": 0.5085168346574576, "learning_rate": 5e-06, "loss": 0.5682, "step": 620 }, { "epoch": 1.5879017013232515, "grad_norm": 0.6537267193437978, "learning_rate": 5e-06, "loss": 0.5629, "step": 630 }, { "epoch": 1.6131064902331445, "grad_norm": 0.5874046443880916, "learning_rate": 5e-06, "loss": 0.5686, "step": 640 }, { "epoch": 1.6383112791430372, "grad_norm": 0.5727374180932888, "learning_rate": 5e-06, "loss": 0.5634, "step": 650 }, { "epoch": 1.66351606805293, "grad_norm": 0.4739882267214101, "learning_rate": 5e-06, "loss": 0.5643, "step": 660 }, { "epoch": 1.688720856962823, "grad_norm": 0.5225576157310349, "learning_rate": 5e-06, "loss": 0.5585, "step": 670 }, { "epoch": 1.713925645872716, "grad_norm": 0.5180601277471911, "learning_rate": 5e-06, "loss": 0.5656, "step": 680 }, { "epoch": 1.7391304347826086, "grad_norm": 0.4742139859545316, "learning_rate": 5e-06, "loss": 0.5601, "step": 690 }, { "epoch": 1.7643352236925016, "grad_norm": 0.569511350039058, "learning_rate": 5e-06, "loss": 0.5601, "step": 700 }, { "epoch": 1.7895400126023944, "grad_norm": 0.5632608053135445, "learning_rate": 5e-06, "loss": 0.5622, "step": 710 }, { "epoch": 1.8147448015122873, "grad_norm": 0.7901163025671061, "learning_rate": 5e-06, "loss": 0.5617, "step": 720 }, { "epoch": 1.8399495904221803, "grad_norm": 0.7242318592229516, "learning_rate": 5e-06, "loss": 0.5681, "step": 730 }, { "epoch": 1.865154379332073, "grad_norm": 0.5903083920098006, "learning_rate": 5e-06, "loss": 0.561, "step": 740 }, { "epoch": 1.8903591682419658, "grad_norm": 0.7074496398717507, "learning_rate": 5e-06, "loss": 0.5618, "step": 750 }, { "epoch": 1.9155639571518588, "grad_norm": 0.6211817411184037, "learning_rate": 5e-06, "loss": 0.5602, "step": 760 }, { "epoch": 1.9407687460617518, "grad_norm": 0.7491153337871396, "learning_rate": 5e-06, "loss": 0.5563, "step": 770 }, { "epoch": 1.9659735349716447, "grad_norm": 0.4814370775695788, "learning_rate": 5e-06, "loss": 0.5575, "step": 780 }, { "epoch": 1.9911783238815375, "grad_norm": 0.4983004502393923, "learning_rate": 5e-06, "loss": 0.5614, "step": 790 }, { "epoch": 1.9987397605545052, "eval_loss": 0.5889107584953308, "eval_runtime": 212.4952, "eval_samples_per_second": 50.312, "eval_steps_per_second": 0.395, "step": 793 }, { "epoch": 2.0163831127914302, "grad_norm": 0.6245989305076182, "learning_rate": 5e-06, "loss": 0.5329, "step": 800 }, { "epoch": 2.041587901701323, "grad_norm": 0.5453952668705114, "learning_rate": 5e-06, "loss": 0.5187, "step": 810 }, { "epoch": 2.066792690611216, "grad_norm": 0.4731173438501679, "learning_rate": 5e-06, "loss": 0.5157, "step": 820 }, { "epoch": 2.091997479521109, "grad_norm": 0.5441667804629673, "learning_rate": 5e-06, "loss": 0.5227, "step": 830 }, { "epoch": 2.1172022684310017, "grad_norm": 0.5708819169258152, "learning_rate": 5e-06, "loss": 0.5229, "step": 840 }, { "epoch": 2.1424070573408946, "grad_norm": 0.523183963336559, "learning_rate": 5e-06, "loss": 0.5262, "step": 850 }, { "epoch": 2.1676118462507876, "grad_norm": 0.5229800543372238, "learning_rate": 5e-06, "loss": 0.5235, "step": 860 }, { "epoch": 2.1928166351606806, "grad_norm": 0.5691473122380714, "learning_rate": 5e-06, "loss": 0.5177, "step": 870 }, { "epoch": 2.2180214240705736, "grad_norm": 0.5400039287033931, "learning_rate": 5e-06, "loss": 0.5273, "step": 880 }, { "epoch": 2.243226212980466, "grad_norm": 0.5107824302626609, "learning_rate": 5e-06, "loss": 0.5323, "step": 890 }, { "epoch": 2.268431001890359, "grad_norm": 0.5059906821559053, "learning_rate": 5e-06, "loss": 0.5202, "step": 900 }, { "epoch": 2.293635790800252, "grad_norm": 0.5379755140149425, "learning_rate": 5e-06, "loss": 0.5245, "step": 910 }, { "epoch": 2.318840579710145, "grad_norm": 0.5628132546435494, "learning_rate": 5e-06, "loss": 0.5227, "step": 920 }, { "epoch": 2.344045368620038, "grad_norm": 0.5940730599429787, "learning_rate": 5e-06, "loss": 0.5244, "step": 930 }, { "epoch": 2.3692501575299305, "grad_norm": 0.4876405401032709, "learning_rate": 5e-06, "loss": 0.5231, "step": 940 }, { "epoch": 2.3944549464398235, "grad_norm": 0.5287351322876084, "learning_rate": 5e-06, "loss": 0.5282, "step": 950 }, { "epoch": 2.4196597353497165, "grad_norm": 0.5497856784965347, "learning_rate": 5e-06, "loss": 0.5224, "step": 960 }, { "epoch": 2.4448645242596094, "grad_norm": 0.5169812352131126, "learning_rate": 5e-06, "loss": 0.5186, "step": 970 }, { "epoch": 2.4700693131695024, "grad_norm": 0.5065988236822105, "learning_rate": 5e-06, "loss": 0.5175, "step": 980 }, { "epoch": 2.495274102079395, "grad_norm": 0.5210233733825254, "learning_rate": 5e-06, "loss": 0.5167, "step": 990 }, { "epoch": 2.520478890989288, "grad_norm": 0.5188140313597245, "learning_rate": 5e-06, "loss": 0.518, "step": 1000 }, { "epoch": 2.545683679899181, "grad_norm": 0.5405876217592858, "learning_rate": 5e-06, "loss": 0.5193, "step": 1010 }, { "epoch": 2.570888468809074, "grad_norm": 0.46633858042210613, "learning_rate": 5e-06, "loss": 0.5248, "step": 1020 }, { "epoch": 2.596093257718967, "grad_norm": 0.5030938635404251, "learning_rate": 5e-06, "loss": 0.5171, "step": 1030 }, { "epoch": 2.6212980466288593, "grad_norm": 0.7552264831047579, "learning_rate": 5e-06, "loss": 0.5208, "step": 1040 }, { "epoch": 2.6465028355387523, "grad_norm": 0.6446011609995526, "learning_rate": 5e-06, "loss": 0.5226, "step": 1050 }, { "epoch": 2.6717076244486453, "grad_norm": 0.598720313170294, "learning_rate": 5e-06, "loss": 0.5201, "step": 1060 }, { "epoch": 2.6969124133585383, "grad_norm": 0.5646013331239497, "learning_rate": 5e-06, "loss": 0.5247, "step": 1070 }, { "epoch": 2.7221172022684312, "grad_norm": 0.5205190466062173, "learning_rate": 5e-06, "loss": 0.5228, "step": 1080 }, { "epoch": 2.7473219911783238, "grad_norm": 0.6675587237327031, "learning_rate": 5e-06, "loss": 0.5238, "step": 1090 }, { "epoch": 2.7725267800882167, "grad_norm": 0.5888446017638219, "learning_rate": 5e-06, "loss": 0.5246, "step": 1100 }, { "epoch": 2.7977315689981097, "grad_norm": 0.5291240035154432, "learning_rate": 5e-06, "loss": 0.5207, "step": 1110 }, { "epoch": 2.8229363579080027, "grad_norm": 0.5322435909276529, "learning_rate": 5e-06, "loss": 0.5167, "step": 1120 }, { "epoch": 2.8481411468178957, "grad_norm": 0.4603004988767882, "learning_rate": 5e-06, "loss": 0.5236, "step": 1130 }, { "epoch": 2.873345935727788, "grad_norm": 0.48682290640941545, "learning_rate": 5e-06, "loss": 0.5252, "step": 1140 }, { "epoch": 2.898550724637681, "grad_norm": 0.5361316970255996, "learning_rate": 5e-06, "loss": 0.5264, "step": 1150 }, { "epoch": 2.923755513547574, "grad_norm": 0.6157284870441493, "learning_rate": 5e-06, "loss": 0.5188, "step": 1160 }, { "epoch": 2.9489603024574667, "grad_norm": 0.5584802223169939, "learning_rate": 5e-06, "loss": 0.5189, "step": 1170 }, { "epoch": 2.97416509136736, "grad_norm": 0.5261737585816265, "learning_rate": 5e-06, "loss": 0.5205, "step": 1180 }, { "epoch": 2.9943289224952743, "eval_loss": 0.58283931016922, "eval_runtime": 213.9304, "eval_samples_per_second": 49.974, "eval_steps_per_second": 0.393, "step": 1188 }, { "epoch": 2.9943289224952743, "step": 1188, "total_flos": 1989525488271360.0, "train_loss": 0.5829599999418162, "train_runtime": 35401.3525, "train_samples_per_second": 17.213, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 1188, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1989525488271360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }