{ "best_metric": 0.8502547144889832, "best_model_checkpoint": "data/smollm2-1.7B-sft\\checkpoint-142", "epoch": 1.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035398230088495575, "grad_norm": 8.5625, "learning_rate": 0.0001724137931034483, "loss": 2.2924, "mean_token_accuracy": 0.5223099514842033, "step": 5 }, { "epoch": 0.07079646017699115, "grad_norm": 1.1796875, "learning_rate": 0.0003448275862068966, "loss": 1.07, "mean_token_accuracy": 0.7215210050344467, "step": 10 }, { "epoch": 0.10619469026548672, "grad_norm": 0.88671875, "learning_rate": 0.0005172413793103448, "loss": 0.8398, "mean_token_accuracy": 0.7689230531454087, "step": 15 }, { "epoch": 0.1415929203539823, "grad_norm": 0.96484375, "learning_rate": 0.0006896551724137932, "loss": 0.8466, "mean_token_accuracy": 0.7690734684467315, "step": 20 }, { "epoch": 0.17699115044247787, "grad_norm": 1.0859375, "learning_rate": 0.0008620689655172414, "loss": 0.8315, "mean_token_accuracy": 0.7698834031820297, "step": 25 }, { "epoch": 0.21238938053097345, "grad_norm": 0.8984375, "learning_rate": 0.000999961452773888, "loss": 0.8657, "mean_token_accuracy": 0.7694115966558457, "step": 30 }, { "epoch": 0.24778761061946902, "grad_norm": 0.89453125, "learning_rate": 0.0009986129238305635, "loss": 0.9508, "mean_token_accuracy": 0.7505242705345154, "step": 35 }, { "epoch": 0.2831858407079646, "grad_norm": 0.74609375, "learning_rate": 0.0009953429730181654, "loss": 0.9422, "mean_token_accuracy": 0.7508314579725266, "step": 40 }, { "epoch": 0.3185840707964602, "grad_norm": 0.84765625, "learning_rate": 0.0009901642012034213, "loss": 0.9568, "mean_token_accuracy": 0.7477447241544724, "step": 45 }, { "epoch": 0.35398230088495575, "grad_norm": 0.75390625, "learning_rate": 0.0009830965649597454, "loss": 0.9577, "mean_token_accuracy": 0.7479827046394348, "step": 50 }, { "epoch": 0.3893805309734513, "grad_norm": 0.69921875, "learning_rate": 0.0009741672996639047, "loss": 0.9318, "mean_token_accuracy": 0.7587000578641891, "step": 55 }, { "epoch": 0.4247787610619469, "grad_norm": 0.83203125, "learning_rate": 0.0009634108145435665, "loss": 0.9259, "mean_token_accuracy": 0.7593102544546128, "step": 60 }, { "epoch": 0.46017699115044247, "grad_norm": 0.7109375, "learning_rate": 0.0009508685600801703, "loss": 0.8877, "mean_token_accuracy": 0.768345057964325, "step": 65 }, { "epoch": 0.49557522123893805, "grad_norm": 0.66015625, "learning_rate": 0.0009365888682780861, "loss": 0.9253, "mean_token_accuracy": 0.7625227481126785, "step": 70 }, { "epoch": 0.5309734513274337, "grad_norm": 0.61328125, "learning_rate": 0.0009206267664155906, "loss": 0.912, "mean_token_accuracy": 0.7615309178829193, "step": 75 }, { "epoch": 0.5663716814159292, "grad_norm": 0.67578125, "learning_rate": 0.0009030437649953789, "loss": 0.9242, "mean_token_accuracy": 0.7563268959522247, "step": 80 }, { "epoch": 0.6017699115044248, "grad_norm": 0.6171875, "learning_rate": 0.0008839076207117484, "loss": 0.8975, "mean_token_accuracy": 0.7675409287214279, "step": 85 }, { "epoch": 0.6371681415929203, "grad_norm": 0.60546875, "learning_rate": 0.0008632920753478719, "loss": 0.8901, "mean_token_accuracy": 0.7678898781538009, "step": 90 }, { "epoch": 0.672566371681416, "grad_norm": 0.59765625, "learning_rate": 0.0008412765716093271, "loss": 0.8868, "mean_token_accuracy": 0.7646951735019684, "step": 95 }, { "epoch": 0.7079646017699115, "grad_norm": 0.6015625, "learning_rate": 0.0008179459469889268, "loss": 0.9314, "mean_token_accuracy": 0.7571736365556717, "step": 100 }, { "epoch": 0.7433628318584071, "grad_norm": 0.6171875, "learning_rate": 0.0007933901068425538, "loss": 0.8961, "mean_token_accuracy": 0.7628611207008362, "step": 105 }, { "epoch": 0.7787610619469026, "grad_norm": 0.5859375, "learning_rate": 0.000767703677935813, "loss": 0.891, "mean_token_accuracy": 0.7617514222860337, "step": 110 }, { "epoch": 0.8141592920353983, "grad_norm": 0.69140625, "learning_rate": 0.000740985643796569, "loss": 0.9057, "mean_token_accuracy": 0.7621735870838166, "step": 115 }, { "epoch": 0.8495575221238938, "grad_norm": 0.61328125, "learning_rate": 0.0007133389632785542, "loss": 0.8874, "mean_token_accuracy": 0.7660593271255494, "step": 120 }, { "epoch": 0.8849557522123894, "grad_norm": 0.5703125, "learning_rate": 0.0006848701738059226, "loss": 0.8441, "mean_token_accuracy": 0.7740032315254212, "step": 125 }, { "epoch": 0.9203539823008849, "grad_norm": 0.59375, "learning_rate": 0.0006556889808276594, "loss": 0.8767, "mean_token_accuracy": 0.7661084860563279, "step": 130 }, { "epoch": 0.9557522123893806, "grad_norm": 0.66796875, "learning_rate": 0.0006259078350639009, "loss": 0.8962, "mean_token_accuracy": 0.7660992830991745, "step": 135 }, { "epoch": 0.9911504424778761, "grad_norm": 0.5390625, "learning_rate": 0.0005956414991732583, "loss": 0.8515, "mean_token_accuracy": 0.7711150646209717, "step": 140 }, { "epoch": 1.0, "eval_loss": 0.8502547144889832, "eval_mean_token_accuracy": 0.7765891637120929, "eval_runtime": 2.1657, "eval_samples_per_second": 54.949, "eval_steps_per_second": 13.853, "step": 142 }, { "epoch": 0.928, "grad_norm": 1.1484375, "learning_rate": 0.000649163469284578, "loss": 1.5121, "mean_token_accuracy": 0.6798420002063116, "step": 145 }, { "epoch": 0.96, "grad_norm": 0.6484375, "learning_rate": 0.000622170203068947, "loss": 1.1683, "mean_token_accuracy": 0.7228868573904037, "step": 150 }, { "epoch": 0.992, "grad_norm": 0.6171875, "learning_rate": 0.0005947925441958392, "loss": 1.1881, "mean_token_accuracy": 0.7217638492584229, "step": 155 }, { "epoch": 0.9984, "eval_loss": 1.1636234521865845, "eval_mean_token_accuracy": 0.7250353273223428, "eval_runtime": 70.0038, "eval_samples_per_second": 14.285, "eval_steps_per_second": 3.571, "step": 156 }, { "epoch": 1.0256, "grad_norm": 0.609375, "learning_rate": 0.0005671166329088278, "loss": 1.3735, "mean_token_accuracy": 0.7168766930699348, "step": 160 }, { "epoch": 1.0576, "grad_norm": 0.578125, "learning_rate": 0.0005392295478639225, "loss": 1.0769, "mean_token_accuracy": 0.740374532341957, "step": 165 }, { "epoch": 1.0896, "grad_norm": 0.5, "learning_rate": 0.0005112190321479025, "loss": 1.0308, "mean_token_accuracy": 0.7460691720247269, "step": 170 }, { "epoch": 1.1216, "grad_norm": 0.4609375, "learning_rate": 0.0004831732172061032, "loss": 1.1521, "mean_token_accuracy": 0.7258656650781632, "step": 175 }, { "epoch": 1.1536, "grad_norm": 0.466796875, "learning_rate": 0.0004551803455482833, "loss": 1.0695, "mean_token_accuracy": 0.7392740726470948, "step": 180 }, { "epoch": 1.1856, "grad_norm": 0.3671875, "learning_rate": 0.0004273284931050438, "loss": 1.0262, "mean_token_accuracy": 0.7486750066280365, "step": 185 }, { "epoch": 1.2176, "grad_norm": 0.439453125, "learning_rate": 0.00039970529210836363, "loss": 1.1084, "mean_token_accuracy": 0.7333114802837372, "step": 190 }, { "epoch": 1.2496, "grad_norm": 0.431640625, "learning_rate": 0.00037239765536817873, "loss": 0.9957, "mean_token_accuracy": 0.7531520247459411, "step": 195 }, { "epoch": 1.2816, "grad_norm": 0.369140625, "learning_rate": 0.00034549150281252633, "loss": 0.9807, "mean_token_accuracy": 0.7547169893980026, "step": 200 }, { "epoch": 1.3136, "grad_norm": 0.35546875, "learning_rate": 0.000319071491151664, "loss": 0.9396, "mean_token_accuracy": 0.7626697480678558, "step": 205 }, { "epoch": 1.3456000000000001, "grad_norm": 0.40234375, "learning_rate": 0.00029322074751673977, "loss": 0.9961, "mean_token_accuracy": 0.753812238574028, "step": 210 }, { "epoch": 1.3776, "grad_norm": 0.380859375, "learning_rate": 0.000268020607911083, "loss": 0.9953, "mean_token_accuracy": 0.7547316879034043, "step": 215 }, { "epoch": 1.4096, "grad_norm": 0.3515625, "learning_rate": 0.000243550361297047, "loss": 0.975, "mean_token_accuracy": 0.7559601426124573, "step": 220 }, { "epoch": 1.4416, "grad_norm": 0.412109375, "learning_rate": 0.00021988700012359863, "loss": 1.0118, "mean_token_accuracy": 0.7498785346746445, "step": 225 }, { "epoch": 1.4736, "grad_norm": 0.35546875, "learning_rate": 0.0001971049780795901, "loss": 0.9645, "mean_token_accuracy": 0.757828313112259, "step": 230 }, { "epoch": 1.5056, "grad_norm": 0.380859375, "learning_rate": 0.00017527597583490823, "loss": 0.9121, "mean_token_accuracy": 0.7678615897893906, "step": 235 }, { "epoch": 1.5375999999999999, "grad_norm": 0.388671875, "learning_rate": 0.00015446867550656773, "loss": 1.0359, "mean_token_accuracy": 0.7470418214797974, "step": 240 }, { "epoch": 1.5695999999999999, "grad_norm": 0.416015625, "learning_rate": 0.0001347485445593612, "loss": 0.9479, "mean_token_accuracy": 0.7594236731529236, "step": 245 }, { "epoch": 1.6016, "grad_norm": 0.34765625, "learning_rate": 0.00011617762982099444, "loss": 0.9621, "mean_token_accuracy": 0.7574791193008423, "step": 250 }, { "epoch": 1.6336, "grad_norm": 0.4140625, "learning_rate": 9.881436225981105e-05, "loss": 0.9807, "mean_token_accuracy": 0.7525162696838379, "step": 255 }, { "epoch": 1.6656, "grad_norm": 0.431640625, "learning_rate": 8.271337313934873e-05, "loss": 0.9372, "mean_token_accuracy": 0.7655486732721328, "step": 260 }, { "epoch": 1.6976, "grad_norm": 0.40625, "learning_rate": 6.792532212817271e-05, "loss": 0.9755, "mean_token_accuracy": 0.7566721349954605, "step": 265 }, { "epoch": 1.7296, "grad_norm": 0.44921875, "learning_rate": 5.449673790581611e-05, "loss": 0.9346, "mean_token_accuracy": 0.7655843138694763, "step": 270 }, { "epoch": 1.7616, "grad_norm": 0.443359375, "learning_rate": 4.2469871766340095e-05, "loss": 1.0262, "mean_token_accuracy": 0.748220956325531, "step": 275 }, { "epoch": 1.7936, "grad_norm": 0.416015625, "learning_rate": 3.18825646801314e-05, "loss": 0.973, "mean_token_accuracy": 0.7554513663053513, "step": 280 }, { "epoch": 1.8256000000000001, "grad_norm": 0.36328125, "learning_rate": 2.276812823220964e-05, "loss": 0.9479, "mean_token_accuracy": 0.7615785777568818, "step": 285 }, { "epoch": 1.8576000000000001, "grad_norm": 0.390625, "learning_rate": 1.5155239811656562e-05, "loss": 0.9759, "mean_token_accuracy": 0.7542947381734848, "step": 290 }, { "epoch": 1.8896, "grad_norm": 0.41796875, "learning_rate": 9.0678523819408e-06, "loss": 0.9293, "mean_token_accuracy": 0.7637231528759003, "step": 295 }, { "epoch": 1.9216, "grad_norm": 0.416015625, "learning_rate": 4.52511911603265e-06, "loss": 0.9287, "mean_token_accuracy": 0.7664623886346817, "step": 300 }, { "epoch": 1.9536, "grad_norm": 0.357421875, "learning_rate": 1.541333133436018e-06, "loss": 0.995, "mean_token_accuracy": 0.7533135890960694, "step": 305 }, { "epoch": 1.9856, "grad_norm": 0.384765625, "learning_rate": 1.2588252874673466e-07, "loss": 0.9186, "mean_token_accuracy": 0.7667679220438004, "step": 310 }, { "epoch": 1.9984, "eval_loss": 1.0249439477920532, "eval_mean_token_accuracy": 0.7480296109073846, "eval_runtime": 74.4351, "eval_samples_per_second": 13.435, "eval_steps_per_second": 3.359, "step": 312 }, { "epoch": 1.9984, "step": 312, "total_flos": 6258600061401600.0, "train_loss": 0.5569522526019659, "train_runtime": 905.7572, "train_samples_per_second": 5.52, "train_steps_per_second": 0.344 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6258600061401600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }