{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9607843137254903, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02178649237472767, "grad_norm": 1.4119257926940918, "learning_rate": 4.945533769063181e-05, "loss": 0.6917, "step": 100 }, { "epoch": 0.04357298474945534, "grad_norm": 1.3989609479904175, "learning_rate": 4.891067538126362e-05, "loss": 0.4496, "step": 200 }, { "epoch": 0.06535947712418301, "grad_norm": 1.8628056049346924, "learning_rate": 4.8366013071895424e-05, "loss": 0.4774, "step": 300 }, { "epoch": 0.08714596949891068, "grad_norm": 1.6151583194732666, "learning_rate": 4.7821350762527234e-05, "loss": 0.4805, "step": 400 }, { "epoch": 0.10893246187363835, "grad_norm": 0.3191039264202118, "learning_rate": 4.7276688453159044e-05, "loss": 0.4375, "step": 500 }, { "epoch": 0.13071895424836602, "grad_norm": 1.743823766708374, "learning_rate": 4.673202614379085e-05, "loss": 0.4678, "step": 600 }, { "epoch": 0.15250544662309368, "grad_norm": 1.9511303901672363, "learning_rate": 4.6187363834422656e-05, "loss": 0.5285, "step": 700 }, { "epoch": 0.17429193899782136, "grad_norm": 0.5090198516845703, "learning_rate": 4.564270152505447e-05, "loss": 0.4451, "step": 800 }, { "epoch": 0.19607843137254902, "grad_norm": 2.6404006481170654, "learning_rate": 4.5098039215686275e-05, "loss": 0.4524, "step": 900 }, { "epoch": 0.2178649237472767, "grad_norm": 2.596175193786621, "learning_rate": 4.4553376906318085e-05, "loss": 0.4284, "step": 1000 }, { "epoch": 0.23965141612200436, "grad_norm": 2.235574960708618, "learning_rate": 4.400871459694989e-05, "loss": 0.4234, "step": 1100 }, { "epoch": 0.26143790849673204, "grad_norm": 1.786952018737793, "learning_rate": 4.3464052287581704e-05, "loss": 0.4881, "step": 1200 }, { "epoch": 0.28322440087145967, "grad_norm": 1.1873655319213867, "learning_rate": 4.291938997821351e-05, "loss": 0.4714, "step": 1300 }, { "epoch": 0.30501089324618735, "grad_norm": 1.3493982553482056, "learning_rate": 4.2374727668845316e-05, "loss": 0.4329, "step": 1400 }, { "epoch": 0.32679738562091504, "grad_norm": 1.2561562061309814, "learning_rate": 4.1830065359477126e-05, "loss": 0.3972, "step": 1500 }, { "epoch": 0.3485838779956427, "grad_norm": 3.1698968410491943, "learning_rate": 4.1285403050108935e-05, "loss": 0.4276, "step": 1600 }, { "epoch": 0.37037037037037035, "grad_norm": 1.0001728534698486, "learning_rate": 4.074074074074074e-05, "loss": 0.5036, "step": 1700 }, { "epoch": 0.39215686274509803, "grad_norm": 2.307466745376587, "learning_rate": 4.0196078431372555e-05, "loss": 0.437, "step": 1800 }, { "epoch": 0.4139433551198257, "grad_norm": 1.7833284139633179, "learning_rate": 3.965141612200436e-05, "loss": 0.4415, "step": 1900 }, { "epoch": 0.4357298474945534, "grad_norm": 1.4704581499099731, "learning_rate": 3.910675381263617e-05, "loss": 0.4752, "step": 2000 }, { "epoch": 0.45751633986928103, "grad_norm": 1.4827431440353394, "learning_rate": 3.8562091503267977e-05, "loss": 0.4268, "step": 2100 }, { "epoch": 0.4793028322440087, "grad_norm": 0.10033423453569412, "learning_rate": 3.8017429193899786e-05, "loss": 0.4641, "step": 2200 }, { "epoch": 0.5010893246187363, "grad_norm": 2.280324935913086, "learning_rate": 3.747276688453159e-05, "loss": 0.4393, "step": 2300 }, { "epoch": 0.5228758169934641, "grad_norm": 0.7182455658912659, "learning_rate": 3.6928104575163405e-05, "loss": 0.4143, "step": 2400 }, { "epoch": 0.5446623093681917, 
"grad_norm": 6.931342124938965, "learning_rate": 3.638344226579521e-05, "loss": 0.4742, "step": 2500 }, { "epoch": 0.5664488017429193, "grad_norm": 1.6501855850219727, "learning_rate": 3.583877995642702e-05, "loss": 0.4183, "step": 2600 }, { "epoch": 0.5882352941176471, "grad_norm": 15.619595527648926, "learning_rate": 3.529411764705883e-05, "loss": 0.4591, "step": 2700 }, { "epoch": 0.6100217864923747, "grad_norm": 1.5793421268463135, "learning_rate": 3.474945533769064e-05, "loss": 0.4326, "step": 2800 }, { "epoch": 0.6318082788671024, "grad_norm": 1.2501574754714966, "learning_rate": 3.420479302832244e-05, "loss": 0.4019, "step": 2900 }, { "epoch": 0.6535947712418301, "grad_norm": 2.454075336456299, "learning_rate": 3.366013071895425e-05, "loss": 0.4306, "step": 3000 }, { "epoch": 0.6753812636165577, "grad_norm": 1.4100277423858643, "learning_rate": 3.311546840958606e-05, "loss": 0.4533, "step": 3100 }, { "epoch": 0.6971677559912854, "grad_norm": 1.2618755102157593, "learning_rate": 3.257080610021787e-05, "loss": 0.4523, "step": 3200 }, { "epoch": 0.7189542483660131, "grad_norm": 1.4713486433029175, "learning_rate": 3.202614379084967e-05, "loss": 0.4371, "step": 3300 }, { "epoch": 0.7407407407407407, "grad_norm": 1.1793227195739746, "learning_rate": 3.148148148148148e-05, "loss": 0.4465, "step": 3400 }, { "epoch": 0.7625272331154684, "grad_norm": 0.7598987221717834, "learning_rate": 3.093681917211329e-05, "loss": 0.4759, "step": 3500 }, { "epoch": 0.7843137254901961, "grad_norm": 1.8617286682128906, "learning_rate": 3.0392156862745097e-05, "loss": 0.4248, "step": 3600 }, { "epoch": 0.8061002178649237, "grad_norm": 0.8327335715293884, "learning_rate": 2.984749455337691e-05, "loss": 0.4257, "step": 3700 }, { "epoch": 0.8278867102396514, "grad_norm": 1.4007729291915894, "learning_rate": 2.9302832244008716e-05, "loss": 0.4066, "step": 3800 }, { "epoch": 0.8496732026143791, "grad_norm": 1.50600266456604, "learning_rate": 2.8758169934640522e-05, "loss": 0.4225, "step": 3900 }, { "epoch": 0.8714596949891068, "grad_norm": 1.622689127922058, "learning_rate": 2.8213507625272335e-05, "loss": 0.4262, "step": 4000 }, { "epoch": 0.8932461873638344, "grad_norm": 1.0574493408203125, "learning_rate": 2.766884531590414e-05, "loss": 0.4102, "step": 4100 }, { "epoch": 0.9150326797385621, "grad_norm": 0.6209813356399536, "learning_rate": 2.7124183006535947e-05, "loss": 0.4231, "step": 4200 }, { "epoch": 0.9368191721132898, "grad_norm": 0.6364914178848267, "learning_rate": 2.657952069716776e-05, "loss": 0.4556, "step": 4300 }, { "epoch": 0.9586056644880174, "grad_norm": 0.9522435665130615, "learning_rate": 2.6034858387799566e-05, "loss": 0.4344, "step": 4400 }, { "epoch": 0.9803921568627451, "grad_norm": 0.8629395365715027, "learning_rate": 2.5490196078431373e-05, "loss": 0.4111, "step": 4500 }, { "epoch": 1.0021786492374727, "grad_norm": 1.2452311515808105, "learning_rate": 2.4945533769063182e-05, "loss": 0.3741, "step": 4600 }, { "epoch": 1.0239651416122004, "grad_norm": 0.6457747220993042, "learning_rate": 2.4400871459694992e-05, "loss": 0.3617, "step": 4700 }, { "epoch": 1.0457516339869282, "grad_norm": 3.9131181240081787, "learning_rate": 2.38562091503268e-05, "loss": 0.3263, "step": 4800 }, { "epoch": 1.0675381263616557, "grad_norm": 1.347782015800476, "learning_rate": 2.3311546840958608e-05, "loss": 0.3248, "step": 4900 }, { "epoch": 1.0893246187363834, "grad_norm": 15.944000244140625, "learning_rate": 2.2766884531590417e-05, "loss": 0.3329, "step": 5000 }, { "epoch": 1.1111111111111112, "grad_norm": 
0.9070430994033813, "learning_rate": 2.2222222222222223e-05, "loss": 0.3405, "step": 5100 }, { "epoch": 1.132897603485839, "grad_norm": 1.5099300146102905, "learning_rate": 2.1677559912854033e-05, "loss": 0.3405, "step": 5200 }, { "epoch": 1.1546840958605664, "grad_norm": 1.4810432195663452, "learning_rate": 2.113289760348584e-05, "loss": 0.3575, "step": 5300 }, { "epoch": 1.1764705882352942, "grad_norm": 1.5918389558792114, "learning_rate": 2.058823529411765e-05, "loss": 0.3284, "step": 5400 }, { "epoch": 1.1982570806100217, "grad_norm": 1.1542829275131226, "learning_rate": 2.0043572984749455e-05, "loss": 0.3328, "step": 5500 }, { "epoch": 1.2200435729847494, "grad_norm": 2.9146111011505127, "learning_rate": 1.9498910675381264e-05, "loss": 0.3358, "step": 5600 }, { "epoch": 1.2418300653594772, "grad_norm": 1.1495596170425415, "learning_rate": 1.895424836601307e-05, "loss": 0.3581, "step": 5700 }, { "epoch": 1.263616557734205, "grad_norm": 1.925133228302002, "learning_rate": 1.840958605664488e-05, "loss": 0.338, "step": 5800 }, { "epoch": 1.2854030501089324, "grad_norm": 0.6670601963996887, "learning_rate": 1.786492374727669e-05, "loss": 0.3166, "step": 5900 }, { "epoch": 1.3071895424836601, "grad_norm": 1.3753750324249268, "learning_rate": 1.7320261437908496e-05, "loss": 0.3322, "step": 6000 }, { "epoch": 1.3289760348583877, "grad_norm": 0.8356995582580566, "learning_rate": 1.6775599128540306e-05, "loss": 0.3465, "step": 6100 }, { "epoch": 1.3507625272331154, "grad_norm": 0.6578918695449829, "learning_rate": 1.6230936819172112e-05, "loss": 0.3449, "step": 6200 }, { "epoch": 1.3725490196078431, "grad_norm": 0.8830516338348389, "learning_rate": 1.568627450980392e-05, "loss": 0.3152, "step": 6300 }, { "epoch": 1.3943355119825709, "grad_norm": 2.0208992958068848, "learning_rate": 1.5141612200435731e-05, "loss": 0.3237, "step": 6400 }, { "epoch": 1.4161220043572984, "grad_norm": 0.7672997117042542, "learning_rate": 1.4596949891067537e-05, "loss": 0.3317, "step": 6500 }, { "epoch": 1.4379084967320261, "grad_norm": 1.3644828796386719, "learning_rate": 1.4052287581699347e-05, "loss": 0.3341, "step": 6600 }, { "epoch": 1.4596949891067539, "grad_norm": 1.7263342142105103, "learning_rate": 1.3507625272331156e-05, "loss": 0.2875, "step": 6700 }, { "epoch": 1.4814814814814814, "grad_norm": 0.5791486501693726, "learning_rate": 1.2962962962962962e-05, "loss": 0.3583, "step": 6800 }, { "epoch": 1.5032679738562091, "grad_norm": 1.689420223236084, "learning_rate": 1.2418300653594772e-05, "loss": 0.3011, "step": 6900 }, { "epoch": 1.5250544662309369, "grad_norm": 0.42838066816329956, "learning_rate": 1.187363834422658e-05, "loss": 0.3153, "step": 7000 }, { "epoch": 1.5468409586056646, "grad_norm": 1.6973720788955688, "learning_rate": 1.1328976034858388e-05, "loss": 0.3253, "step": 7100 }, { "epoch": 1.5686274509803921, "grad_norm": 1.9545549154281616, "learning_rate": 1.0784313725490197e-05, "loss": 0.3348, "step": 7200 }, { "epoch": 1.5904139433551199, "grad_norm": 1.2370498180389404, "learning_rate": 1.0239651416122005e-05, "loss": 0.3207, "step": 7300 }, { "epoch": 1.6122004357298474, "grad_norm": 1.108176589012146, "learning_rate": 9.694989106753813e-06, "loss": 0.3145, "step": 7400 }, { "epoch": 1.6339869281045751, "grad_norm": 1.849331259727478, "learning_rate": 9.150326797385621e-06, "loss": 0.2982, "step": 7500 }, { "epoch": 1.6557734204793029, "grad_norm": 1.2936469316482544, "learning_rate": 8.60566448801743e-06, "loss": 0.3161, "step": 7600 }, { "epoch": 1.6775599128540306, "grad_norm": 
1.2373560667037964, "learning_rate": 8.061002178649239e-06, "loss": 0.3335, "step": 7700 }, { "epoch": 1.6993464052287581, "grad_norm": 0.7337093949317932, "learning_rate": 7.5163398692810456e-06, "loss": 0.3027, "step": 7800 }, { "epoch": 1.7211328976034859, "grad_norm": 0.2871895134449005, "learning_rate": 6.971677559912855e-06, "loss": 0.3048, "step": 7900 }, { "epoch": 1.7429193899782134, "grad_norm": 1.113721489906311, "learning_rate": 6.427015250544663e-06, "loss": 0.3076, "step": 8000 }, { "epoch": 1.7647058823529411, "grad_norm": 1.1479638814926147, "learning_rate": 5.882352941176471e-06, "loss": 0.3513, "step": 8100 }, { "epoch": 1.7864923747276689, "grad_norm": 0.8669132590293884, "learning_rate": 5.33769063180828e-06, "loss": 0.3347, "step": 8200 }, { "epoch": 1.8082788671023966, "grad_norm": 1.534184455871582, "learning_rate": 4.7930283224400875e-06, "loss": 0.336, "step": 8300 }, { "epoch": 1.8300653594771243, "grad_norm": 1.0394097566604614, "learning_rate": 4.2483660130718954e-06, "loss": 0.3152, "step": 8400 }, { "epoch": 1.8518518518518519, "grad_norm": 0.9366344809532166, "learning_rate": 3.7037037037037037e-06, "loss": 0.3272, "step": 8500 }, { "epoch": 1.8736383442265794, "grad_norm": 1.5486916303634644, "learning_rate": 3.159041394335512e-06, "loss": 0.3223, "step": 8600 }, { "epoch": 1.8954248366013071, "grad_norm": 2.589451551437378, "learning_rate": 2.6143790849673204e-06, "loss": 0.3189, "step": 8700 }, { "epoch": 1.9172113289760349, "grad_norm": 0.5650085806846619, "learning_rate": 2.0697167755991287e-06, "loss": 0.3132, "step": 8800 }, { "epoch": 1.9389978213507626, "grad_norm": 0.18872463703155518, "learning_rate": 1.525054466230937e-06, "loss": 0.3219, "step": 8900 }, { "epoch": 1.9607843137254903, "grad_norm": 1.094841718673706, "learning_rate": 9.80392156862745e-07, "loss": 0.3015, "step": 9000 } ], "logging_steps": 100, "max_steps": 9180, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.825775061368832e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }
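
This is the `trainer_state.json` written by the Hugging Face `Trainer` for a roughly 2-epoch run (9,180 max steps, logging every 100 steps, checkpoints every 1,000 steps, per-device batch size 8). If the state is being inspected outside the `Trainer`, a minimal sketch like the following can plot the logged loss and learning-rate curves; the file path and the use of matplotlib are assumptions, not part of the checkpoint itself.

```python
# Sketch (assumption): quick offline inspection of this trainer_state.json,
# plotting the logged training loss and the linear learning-rate decay.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # path alongside the checkpoint is an assumption
    state = json.load(f)

logs = state["log_history"]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()
```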