{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 981, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03058103975535168, "grad_norm": 4.613314070863576, "learning_rate": 5e-06, "loss": 0.8051, "step": 10 }, { "epoch": 0.06116207951070336, "grad_norm": 1.8900295063815196, "learning_rate": 5e-06, "loss": 0.7288, "step": 20 }, { "epoch": 0.09174311926605505, "grad_norm": 2.828490629262087, "learning_rate": 5e-06, "loss": 0.709, "step": 30 }, { "epoch": 0.12232415902140673, "grad_norm": 1.3871149266999225, "learning_rate": 5e-06, "loss": 0.6966, "step": 40 }, { "epoch": 0.1529051987767584, "grad_norm": 1.5967435207002258, "learning_rate": 5e-06, "loss": 0.6812, "step": 50 }, { "epoch": 0.1834862385321101, "grad_norm": 1.882517891139545, "learning_rate": 5e-06, "loss": 0.6728, "step": 60 }, { "epoch": 0.21406727828746178, "grad_norm": 0.6460792834326635, "learning_rate": 5e-06, "loss": 0.661, "step": 70 }, { "epoch": 0.24464831804281345, "grad_norm": 0.8345961449379203, "learning_rate": 5e-06, "loss": 0.6527, "step": 80 }, { "epoch": 0.27522935779816515, "grad_norm": 0.6632736237849782, "learning_rate": 5e-06, "loss": 0.65, "step": 90 }, { "epoch": 0.3058103975535168, "grad_norm": 0.6414397366786443, "learning_rate": 5e-06, "loss": 0.6416, "step": 100 }, { "epoch": 0.3363914373088685, "grad_norm": 0.6410573252326648, "learning_rate": 5e-06, "loss": 0.6423, "step": 110 }, { "epoch": 0.3669724770642202, "grad_norm": 0.6301173144669027, "learning_rate": 5e-06, "loss": 0.6431, "step": 120 }, { "epoch": 0.39755351681957185, "grad_norm": 0.6700047532457443, "learning_rate": 5e-06, "loss": 0.6328, "step": 130 }, { "epoch": 0.42813455657492355, "grad_norm": 0.7253886574556052, "learning_rate": 5e-06, "loss": 0.6379, "step": 140 }, { "epoch": 0.45871559633027525, "grad_norm": 0.6468641147049369, "learning_rate": 5e-06, "loss": 0.6226, "step": 150 }, { "epoch": 0.4892966360856269, "grad_norm": 0.5346182726002575, "learning_rate": 5e-06, "loss": 0.6297, "step": 160 }, { "epoch": 0.5198776758409785, "grad_norm": 0.5313857610798393, "learning_rate": 5e-06, "loss": 0.6315, "step": 170 }, { "epoch": 0.5504587155963303, "grad_norm": 1.055453437070951, "learning_rate": 5e-06, "loss": 0.6319, "step": 180 }, { "epoch": 0.581039755351682, "grad_norm": 1.0346764494649112, "learning_rate": 5e-06, "loss": 0.6336, "step": 190 }, { "epoch": 0.6116207951070336, "grad_norm": 0.5618281272496094, "learning_rate": 5e-06, "loss": 0.6219, "step": 200 }, { "epoch": 0.6422018348623854, "grad_norm": 1.3519206618987067, "learning_rate": 5e-06, "loss": 0.6231, "step": 210 }, { "epoch": 0.672782874617737, "grad_norm": 0.7749478594608031, "learning_rate": 5e-06, "loss": 0.6242, "step": 220 }, { "epoch": 0.7033639143730887, "grad_norm": 0.5893825196950665, "learning_rate": 5e-06, "loss": 0.6241, "step": 230 }, { "epoch": 0.7339449541284404, "grad_norm": 0.6773691040863971, "learning_rate": 5e-06, "loss": 0.6222, "step": 240 }, { "epoch": 0.764525993883792, "grad_norm": 0.4827169638012845, "learning_rate": 5e-06, "loss": 0.619, "step": 250 }, { "epoch": 0.7951070336391437, "grad_norm": 0.6490833986094754, "learning_rate": 5e-06, "loss": 0.6214, "step": 260 }, { "epoch": 0.8256880733944955, "grad_norm": 0.4630327930179835, "learning_rate": 5e-06, "loss": 0.6234, "step": 270 }, { "epoch": 0.8562691131498471, "grad_norm": 0.5519953643760132, "learning_rate": 5e-06, "loss": 0.6238, "step": 280 }, { "epoch": 0.8868501529051988, "grad_norm": 0.6014046385653471, "learning_rate": 5e-06, "loss": 0.6205, "step": 290 }, { "epoch": 0.9174311926605505, "grad_norm": 0.9905714959978613, "learning_rate": 5e-06, "loss": 0.614, "step": 300 }, { "epoch": 0.9480122324159022, "grad_norm": 0.5722101718286907, "learning_rate": 5e-06, "loss": 0.6174, "step": 310 }, { "epoch": 0.9785932721712538, "grad_norm": 0.49394030115956855, "learning_rate": 5e-06, "loss": 0.6318, "step": 320 }, { "epoch": 1.0, "eval_loss": 0.6179984211921692, "eval_runtime": 175.4761, "eval_samples_per_second": 50.218, "eval_steps_per_second": 0.393, "step": 327 }, { "epoch": 1.0091743119266054, "grad_norm": 0.7689781158724909, "learning_rate": 5e-06, "loss": 0.6099, "step": 330 }, { "epoch": 1.039755351681957, "grad_norm": 0.6771992725638727, "learning_rate": 5e-06, "loss": 0.5636, "step": 340 }, { "epoch": 1.070336391437309, "grad_norm": 0.8428533582721723, "learning_rate": 5e-06, "loss": 0.5702, "step": 350 }, { "epoch": 1.1009174311926606, "grad_norm": 0.6417949273070751, "learning_rate": 5e-06, "loss": 0.5724, "step": 360 }, { "epoch": 1.1314984709480123, "grad_norm": 0.4739060770421859, "learning_rate": 5e-06, "loss": 0.5667, "step": 370 }, { "epoch": 1.162079510703364, "grad_norm": 0.8290714956133294, "learning_rate": 5e-06, "loss": 0.5723, "step": 380 }, { "epoch": 1.1926605504587156, "grad_norm": 0.4621451043452794, "learning_rate": 5e-06, "loss": 0.5712, "step": 390 }, { "epoch": 1.2232415902140672, "grad_norm": 0.5143870327303872, "learning_rate": 5e-06, "loss": 0.5665, "step": 400 }, { "epoch": 1.2538226299694188, "grad_norm": 0.5164679975473679, "learning_rate": 5e-06, "loss": 0.5738, "step": 410 }, { "epoch": 1.2844036697247707, "grad_norm": 0.8444328502616898, "learning_rate": 5e-06, "loss": 0.568, "step": 420 }, { "epoch": 1.3149847094801224, "grad_norm": 0.5068551444160476, "learning_rate": 5e-06, "loss": 0.5685, "step": 430 }, { "epoch": 1.345565749235474, "grad_norm": 0.5272187758950088, "learning_rate": 5e-06, "loss": 0.5719, "step": 440 }, { "epoch": 1.3761467889908257, "grad_norm": 0.6560982973750454, "learning_rate": 5e-06, "loss": 0.5631, "step": 450 }, { "epoch": 1.4067278287461773, "grad_norm": 0.4484289597201794, "learning_rate": 5e-06, "loss": 0.5662, "step": 460 }, { "epoch": 1.4373088685015292, "grad_norm": 0.5845602559047488, "learning_rate": 5e-06, "loss": 0.5555, "step": 470 }, { "epoch": 1.4678899082568808, "grad_norm": 0.5019202622500104, "learning_rate": 5e-06, "loss": 0.572, "step": 480 }, { "epoch": 1.4984709480122325, "grad_norm": 0.5453352197296611, "learning_rate": 5e-06, "loss": 0.5678, "step": 490 }, { "epoch": 1.529051987767584, "grad_norm": 0.5096577153134583, "learning_rate": 5e-06, "loss": 0.5747, "step": 500 }, { "epoch": 1.5596330275229358, "grad_norm": 0.6175776252130769, "learning_rate": 5e-06, "loss": 0.5664, "step": 510 }, { "epoch": 1.5902140672782874, "grad_norm": 0.5104602945006693, "learning_rate": 5e-06, "loss": 0.5634, "step": 520 }, { "epoch": 1.620795107033639, "grad_norm": 0.596086026271991, "learning_rate": 5e-06, "loss": 0.5751, "step": 530 }, { "epoch": 1.6513761467889907, "grad_norm": 0.5308994737717756, "learning_rate": 5e-06, "loss": 0.5733, "step": 540 }, { "epoch": 1.6819571865443423, "grad_norm": 0.4845901344757882, "learning_rate": 5e-06, "loss": 0.5717, "step": 550 }, { "epoch": 1.7125382262996942, "grad_norm": 0.5671326796569592, "learning_rate": 5e-06, "loss": 0.5724, "step": 560 }, { "epoch": 1.7431192660550459, "grad_norm": 0.4598174188757565, "learning_rate": 5e-06, "loss": 0.5683, "step": 570 }, { "epoch": 1.7737003058103975, "grad_norm": 0.5112157139948377, "learning_rate": 5e-06, "loss": 0.5681, "step": 580 }, { "epoch": 1.8042813455657494, "grad_norm": 0.5586531166082738, "learning_rate": 5e-06, "loss": 0.5684, "step": 590 }, { "epoch": 1.834862385321101, "grad_norm": 0.4688565551712795, "learning_rate": 5e-06, "loss": 0.5644, "step": 600 }, { "epoch": 1.8654434250764527, "grad_norm": 0.5068561602224454, "learning_rate": 5e-06, "loss": 0.5634, "step": 610 }, { "epoch": 1.8960244648318043, "grad_norm": 0.5177020231748777, "learning_rate": 5e-06, "loss": 0.5693, "step": 620 }, { "epoch": 1.926605504587156, "grad_norm": 0.5480741145181502, "learning_rate": 5e-06, "loss": 0.5613, "step": 630 }, { "epoch": 1.9571865443425076, "grad_norm": 0.4783567812818659, "learning_rate": 5e-06, "loss": 0.5585, "step": 640 }, { "epoch": 1.9877675840978593, "grad_norm": 0.5551088225058829, "learning_rate": 5e-06, "loss": 0.5674, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.6093349456787109, "eval_runtime": 174.9969, "eval_samples_per_second": 50.355, "eval_steps_per_second": 0.394, "step": 654 }, { "epoch": 2.018348623853211, "grad_norm": 0.6215337411502556, "learning_rate": 5e-06, "loss": 0.5384, "step": 660 }, { "epoch": 2.0489296636085625, "grad_norm": 0.6523651145276358, "learning_rate": 5e-06, "loss": 0.5183, "step": 670 }, { "epoch": 2.079510703363914, "grad_norm": 0.4938873169688457, "learning_rate": 5e-06, "loss": 0.5154, "step": 680 }, { "epoch": 2.1100917431192663, "grad_norm": 0.5700577847392374, "learning_rate": 5e-06, "loss": 0.513, "step": 690 }, { "epoch": 2.140672782874618, "grad_norm": 0.6865444356940279, "learning_rate": 5e-06, "loss": 0.5107, "step": 700 }, { "epoch": 2.1712538226299696, "grad_norm": 0.49230256627015, "learning_rate": 5e-06, "loss": 0.5109, "step": 710 }, { "epoch": 2.2018348623853212, "grad_norm": 0.8680096117870334, "learning_rate": 5e-06, "loss": 0.5201, "step": 720 }, { "epoch": 2.232415902140673, "grad_norm": 0.5282977989250981, "learning_rate": 5e-06, "loss": 0.5176, "step": 730 }, { "epoch": 2.2629969418960245, "grad_norm": 0.5641604277626704, "learning_rate": 5e-06, "loss": 0.5175, "step": 740 }, { "epoch": 2.293577981651376, "grad_norm": 0.5627994676639944, "learning_rate": 5e-06, "loss": 0.5233, "step": 750 }, { "epoch": 2.324159021406728, "grad_norm": 0.5351783170372003, "learning_rate": 5e-06, "loss": 0.5193, "step": 760 }, { "epoch": 2.3547400611620795, "grad_norm": 0.5159357728539045, "learning_rate": 5e-06, "loss": 0.5145, "step": 770 }, { "epoch": 2.385321100917431, "grad_norm": 0.6104068286820499, "learning_rate": 5e-06, "loss": 0.5164, "step": 780 }, { "epoch": 2.4159021406727827, "grad_norm": 0.677498087908613, "learning_rate": 5e-06, "loss": 0.5262, "step": 790 }, { "epoch": 2.4464831804281344, "grad_norm": 0.5283675690505885, "learning_rate": 5e-06, "loss": 0.5149, "step": 800 }, { "epoch": 2.477064220183486, "grad_norm": 0.5044083108738047, "learning_rate": 5e-06, "loss": 0.5176, "step": 810 }, { "epoch": 2.5076452599388377, "grad_norm": 0.49494102207897933, "learning_rate": 5e-06, "loss": 0.5204, "step": 820 }, { "epoch": 2.5382262996941893, "grad_norm": 0.4861352677652072, "learning_rate": 5e-06, "loss": 0.5337, "step": 830 }, { "epoch": 2.5688073394495414, "grad_norm": 0.48851988662021156, "learning_rate": 5e-06, "loss": 0.5238, "step": 840 }, { "epoch": 2.599388379204893, "grad_norm": 0.5226833608668234, "learning_rate": 5e-06, "loss": 0.5205, "step": 850 }, { "epoch": 2.6299694189602447, "grad_norm": 0.5465654006210326, "learning_rate": 5e-06, "loss": 0.5221, "step": 860 }, { "epoch": 2.6605504587155964, "grad_norm": 0.5039029213538379, "learning_rate": 5e-06, "loss": 0.5196, "step": 870 }, { "epoch": 2.691131498470948, "grad_norm": 0.5371018828919037, "learning_rate": 5e-06, "loss": 0.5237, "step": 880 }, { "epoch": 2.7217125382262997, "grad_norm": 0.5383889127160468, "learning_rate": 5e-06, "loss": 0.5169, "step": 890 }, { "epoch": 2.7522935779816513, "grad_norm": 0.5406840391563221, "learning_rate": 5e-06, "loss": 0.5116, "step": 900 }, { "epoch": 2.782874617737003, "grad_norm": 0.5182378062317926, "learning_rate": 5e-06, "loss": 0.5147, "step": 910 }, { "epoch": 2.8134556574923546, "grad_norm": 0.483759876679847, "learning_rate": 5e-06, "loss": 0.514, "step": 920 }, { "epoch": 2.8440366972477067, "grad_norm": 0.5715170886676529, "learning_rate": 5e-06, "loss": 0.5189, "step": 930 }, { "epoch": 2.8746177370030583, "grad_norm": 0.4882822153954844, "learning_rate": 5e-06, "loss": 0.5131, "step": 940 }, { "epoch": 2.90519877675841, "grad_norm": 0.518462889966442, "learning_rate": 5e-06, "loss": 0.5228, "step": 950 }, { "epoch": 2.9357798165137616, "grad_norm": 0.500511829330958, "learning_rate": 5e-06, "loss": 0.5177, "step": 960 }, { "epoch": 2.9663608562691133, "grad_norm": 0.4799796524761594, "learning_rate": 5e-06, "loss": 0.5283, "step": 970 }, { "epoch": 2.996941896024465, "grad_norm": 0.6756287967133653, "learning_rate": 5e-06, "loss": 0.5228, "step": 980 }, { "epoch": 3.0, "eval_loss": 0.6134681701660156, "eval_runtime": 174.9719, "eval_samples_per_second": 50.362, "eval_steps_per_second": 0.394, "step": 981 }, { "epoch": 3.0, "step": 981, "total_flos": 1642792778465280.0, "train_loss": 0.5776342995062272, "train_runtime": 29440.9564, "train_samples_per_second": 17.06, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 981, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1642792778465280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }