{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2304, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0390625, "grad_norm": 2.857225616936394, "learning_rate": 6.493506493506493e-07, "loss": 0.7578, "step": 30 }, { "epoch": 0.078125, "grad_norm": 1.3044758629614164, "learning_rate": 1.2987012987012986e-06, "loss": 0.5677, "step": 60 }, { "epoch": 0.1171875, "grad_norm": 1.1974358130130436, "learning_rate": 1.9480519480519483e-06, "loss": 0.5243, "step": 90 }, { "epoch": 0.15625, "grad_norm": 1.0778991302776368, "learning_rate": 2.597402597402597e-06, "loss": 0.5118, "step": 120 }, { "epoch": 0.1953125, "grad_norm": 1.0101444029426376, "learning_rate": 3.246753246753247e-06, "loss": 0.5146, "step": 150 }, { "epoch": 0.234375, "grad_norm": 0.97813783772122, "learning_rate": 3.896103896103897e-06, "loss": 0.5121, "step": 180 }, { "epoch": 0.2734375, "grad_norm": 1.0146147755881423, "learning_rate": 4.5454545454545455e-06, "loss": 0.5172, "step": 210 }, { "epoch": 0.3125, "grad_norm": 1.0002914434860946, "learning_rate": 4.999767464405452e-06, "loss": 0.5053, "step": 240 }, { "epoch": 0.3515625, "grad_norm": 0.9909790519322216, "learning_rate": 4.995634701567892e-06, "loss": 0.5197, "step": 270 }, { "epoch": 0.390625, "grad_norm": 0.9471114669156748, "learning_rate": 4.986344312601082e-06, "loss": 0.5087, "step": 300 }, { "epoch": 0.4296875, "grad_norm": 0.990508902551153, "learning_rate": 4.971915497571788e-06, "loss": 0.5126, "step": 330 }, { "epoch": 0.46875, "grad_norm": 0.9299422099207572, "learning_rate": 4.9523780759216764e-06, "loss": 0.5144, "step": 360 }, { "epoch": 0.5078125, "grad_norm": 0.8743256439010634, "learning_rate": 4.927772424840702e-06, "loss": 0.5083, "step": 390 }, { "epoch": 0.546875, "grad_norm": 0.9036662054182406, "learning_rate": 4.898149395821218e-06, "loss": 0.5117, "step": 420 }, { "epoch": 0.5859375, "grad_norm": 0.8810653800350299, "learning_rate": 4.863570209565277e-06, "loss": 0.5107, "step": 450 }, { "epoch": 0.625, "grad_norm": 0.8846561337964834, "learning_rate": 4.824106329462313e-06, "loss": 0.5098, "step": 480 }, { "epoch": 0.6640625, "grad_norm": 0.8997382537130586, "learning_rate": 4.779839313898675e-06, "loss": 0.5152, "step": 510 }, { "epoch": 0.703125, "grad_norm": 0.939340943654687, "learning_rate": 4.730860647704252e-06, "loss": 0.5078, "step": 540 }, { "epoch": 0.7421875, "grad_norm": 0.845279140634368, "learning_rate": 4.677271553084515e-06, "loss": 0.5078, "step": 570 }, { "epoch": 0.78125, "grad_norm": 0.9487929366289761, "learning_rate": 4.6191827804287236e-06, "loss": 0.5073, "step": 600 }, { "epoch": 0.8203125, "grad_norm": 0.9162235285381133, "learning_rate": 4.556714379426634e-06, "loss": 0.5194, "step": 630 }, { "epoch": 0.859375, "grad_norm": 0.8817621527988113, "learning_rate": 4.489995450966714e-06, "loss": 0.4997, "step": 660 }, { "epoch": 0.8984375, "grad_norm": 0.8573037310302468, "learning_rate": 4.419163880328615e-06, "loss": 0.5008, "step": 690 }, { "epoch": 0.9375, "grad_norm": 0.9279340785042338, "learning_rate": 4.344366052221316e-06, "loss": 0.5037, "step": 720 }, { "epoch": 0.9765625, "grad_norm": 0.8075055611747111, "learning_rate": 4.265756548255823e-06, "loss": 0.4977, "step": 750 }, { "epoch": 1.015625, "grad_norm": 1.098595770652355, "learning_rate": 4.183497827477687e-06, "loss": 0.4782, "step": 780 }, { "epoch": 1.0546875, "grad_norm": 0.9988970588938918, "learning_rate": 4.097759890619539e-06, "loss": 0.439, "step": 810 }, { "epoch": 1.09375, "grad_norm": 0.9731524721989655, "learning_rate": 4.00871992876753e-06, "loss": 0.4441, "step": 840 }, { "epoch": 1.1328125, "grad_norm": 0.9099368305482215, "learning_rate": 3.916561957167765e-06, "loss": 0.4438, "step": 870 }, { "epoch": 1.171875, "grad_norm": 1.0659953860812488, "learning_rate": 3.82147643492952e-06, "loss": 0.44, "step": 900 }, { "epoch": 1.2109375, "grad_norm": 0.9779085212603401, "learning_rate": 3.723659871411196e-06, "loss": 0.4406, "step": 930 }, { "epoch": 1.25, "grad_norm": 0.9719837374463801, "learning_rate": 3.623314420102467e-06, "loss": 0.4464, "step": 960 }, { "epoch": 1.2890625, "grad_norm": 1.0182265044301695, "learning_rate": 3.5206474608419385e-06, "loss": 0.4462, "step": 990 }, { "epoch": 1.328125, "grad_norm": 0.9265295707713885, "learning_rate": 3.415871171233709e-06, "loss": 0.4412, "step": 1020 }, { "epoch": 1.3671875, "grad_norm": 1.0015451544453786, "learning_rate": 3.3092020881486085e-06, "loss": 0.4395, "step": 1050 }, { "epoch": 1.40625, "grad_norm": 0.93431276827024, "learning_rate": 3.2008606602163023e-06, "loss": 0.4425, "step": 1080 }, { "epoch": 1.4453125, "grad_norm": 0.9255587183512949, "learning_rate": 3.091070792233124e-06, "loss": 0.439, "step": 1110 }, { "epoch": 1.484375, "grad_norm": 0.9343974096084879, "learning_rate": 2.9800593824272027e-06, "loss": 0.4354, "step": 1140 }, { "epoch": 1.5234375, "grad_norm": 1.042370945691791, "learning_rate": 2.8680558535371688e-06, "loss": 0.4404, "step": 1170 }, { "epoch": 1.5625, "grad_norm": 0.9126651351226419, "learning_rate": 2.7552916786735744e-06, "loss": 0.4431, "step": 1200 }, { "epoch": 1.6015625, "grad_norm": 0.9963009170045803, "learning_rate": 2.641999902942882e-06, "loss": 0.43, "step": 1230 }, { "epoch": 1.640625, "grad_norm": 0.9703924532720508, "learning_rate": 2.5284146618226807e-06, "loss": 0.449, "step": 1260 }, { "epoch": 1.6796875, "grad_norm": 0.8989408268444277, "learning_rate": 2.414770697283471e-06, "loss": 0.4387, "step": 1290 }, { "epoch": 1.71875, "grad_norm": 0.9709563268893221, "learning_rate": 2.3013028726570436e-06, "loss": 0.444, "step": 1320 }, { "epoch": 1.7578125, "grad_norm": 0.9557991402725722, "learning_rate": 2.188245687254035e-06, "loss": 0.4394, "step": 1350 }, { "epoch": 1.796875, "grad_norm": 0.9603778434937646, "learning_rate": 2.075832791733802e-06, "loss": 0.4473, "step": 1380 }, { "epoch": 1.8359375, "grad_norm": 0.9476806986189421, "learning_rate": 1.9642965052281618e-06, "loss": 0.4404, "step": 1410 }, { "epoch": 1.875, "grad_norm": 0.9085522492818641, "learning_rate": 1.8538673352169467e-06, "loss": 0.4446, "step": 1440 }, { "epoch": 1.9140625, "grad_norm": 0.9480501458847437, "learning_rate": 1.744773501147627e-06, "loss": 0.4236, "step": 1470 }, { "epoch": 1.953125, "grad_norm": 0.9099438022581319, "learning_rate": 1.6372404627835182e-06, "loss": 0.4352, "step": 1500 }, { "epoch": 1.9921875, "grad_norm": 0.9130000879221961, "learning_rate": 1.5314904542553099e-06, "loss": 0.4344, "step": 1530 }, { "epoch": 2.03125, "grad_norm": 1.072785526043702, "learning_rate": 1.4277420247788842e-06, "loss": 0.3877, "step": 1560 }, { "epoch": 2.0703125, "grad_norm": 1.0213733525555977, "learning_rate": 1.3262095869885907e-06, "loss": 0.3748, "step": 1590 }, { "epoch": 2.109375, "grad_norm": 1.0378622376674729, "learning_rate": 1.227102973819426e-06, "loss": 0.3801, "step": 1620 }, { "epoch": 2.1484375, "grad_norm": 1.0636201329760862, "learning_rate": 1.1306270048538966e-06, "loss": 0.3623, "step": 1650 }, { "epoch": 2.1875, "grad_norm": 1.065350906800766, "learning_rate": 1.0369810630297658e-06, "loss": 0.3652, "step": 1680 }, { "epoch": 2.2265625, "grad_norm": 1.065580953707456, "learning_rate": 9.463586825834939e-07, "loss": 0.3724, "step": 1710 }, { "epoch": 2.265625, "grad_norm": 1.1044573731710503, "learning_rate": 8.589471490809473e-07, "loss": 0.3639, "step": 1740 }, { "epoch": 2.3046875, "grad_norm": 1.057219345369191, "learning_rate": 7.749271123619889e-07, "loss": 0.3665, "step": 1770 }, { "epoch": 2.34375, "grad_norm": 1.120692356243222, "learning_rate": 6.944722131988394e-07, "loss": 0.3624, "step": 1800 }, { "epoch": 2.3828125, "grad_norm": 1.1354171819466858, "learning_rate": 6.177487244398009e-07, "loss": 0.3629, "step": 1830 }, { "epoch": 2.421875, "grad_norm": 1.0454720627484864, "learning_rate": 5.449152073799616e-07, "loss": 0.3739, "step": 1860 }, { "epoch": 2.4609375, "grad_norm": 1.1194418224635865, "learning_rate": 4.761221840690586e-07, "loss": 0.3723, "step": 1890 }, { "epoch": 2.5, "grad_norm": 1.097996189333425, "learning_rate": 4.115118262337128e-07, "loss": 0.377, "step": 1920 }, { "epoch": 2.5390625, "grad_norm": 1.0208876828373903, "learning_rate": 3.512176614569418e-07, "loss": 0.3676, "step": 1950 }, { "epoch": 2.578125, "grad_norm": 1.022735887067447, "learning_rate": 2.9536429722216207e-07, "loss": 0.3714, "step": 1980 }, { "epoch": 2.6171875, "grad_norm": 1.04711360457167, "learning_rate": 2.440671633920075e-07, "loss": 0.3733, "step": 2010 }, { "epoch": 2.65625, "grad_norm": 1.0415975853212511, "learning_rate": 1.9743227365415092e-07, "loss": 0.3694, "step": 2040 }, { "epoch": 2.6953125, "grad_norm": 1.106325100873404, "learning_rate": 1.5555600642715442e-07, "loss": 0.3747, "step": 2070 }, { "epoch": 2.734375, "grad_norm": 1.0448870665618393, "learning_rate": 1.1852490567913655e-07, "loss": 0.3611, "step": 2100 }, { "epoch": 2.7734375, "grad_norm": 1.1214742422046289, "learning_rate": 8.641550207089039e-08, "loss": 0.3686, "step": 2130 }, { "epoch": 2.8125, "grad_norm": 1.0756561712922115, "learning_rate": 5.92941547931028e-08, "loss": 0.3716, "step": 2160 }, { "epoch": 2.8515625, "grad_norm": 1.057575658588282, "learning_rate": 3.7216914424527686e-08, "loss": 0.3624, "step": 2190 }, { "epoch": 2.890625, "grad_norm": 1.0540930132763664, "learning_rate": 2.0229407094547736e-08, "loss": 0.369, "step": 2220 }, { "epoch": 2.9296875, "grad_norm": 1.105125578397597, "learning_rate": 8.366740189520716e-09, "loss": 0.3668, "step": 2250 }, { "epoch": 2.96875, "grad_norm": 0.9862762687662256, "learning_rate": 1.6534297977804925e-09, "loss": 0.3621, "step": 2280 }, { "epoch": 3.0, "step": 2304, "total_flos": 415352546656256.0, "train_loss": 0.4438822174237834, "train_runtime": 11247.4679, "train_samples_per_second": 26.212, "train_steps_per_second": 0.205 } ], "logging_steps": 30, "max_steps": 2304, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 256, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 415352546656256.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }