{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4287245444801715, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004287245444801715, "eval_loss": 1.4249097108840942, "eval_runtime": 10.7749, "eval_samples_per_second": 36.474, "eval_steps_per_second": 4.64, "step": 1 }, { "epoch": 0.012861736334405145, "grad_norm": 0.9113896489143372, "learning_rate": 3e-05, "loss": 1.4703, "step": 3 }, { "epoch": 0.02572347266881029, "grad_norm": 0.8911994099617004, "learning_rate": 6e-05, "loss": 1.4236, "step": 6 }, { "epoch": 0.03858520900321544, "grad_norm": 0.7710233330726624, "learning_rate": 9e-05, "loss": 1.3008, "step": 9 }, { "epoch": 0.03858520900321544, "eval_loss": 1.1259669065475464, "eval_runtime": 10.753, "eval_samples_per_second": 36.548, "eval_steps_per_second": 4.65, "step": 9 }, { "epoch": 0.05144694533762058, "grad_norm": 0.8508097529411316, "learning_rate": 9.987820251299122e-05, "loss": 1.0518, "step": 12 }, { "epoch": 0.06430868167202572, "grad_norm": 0.8443539142608643, "learning_rate": 9.924038765061042e-05, "loss": 0.7249, "step": 15 }, { "epoch": 0.07717041800643087, "grad_norm": 1.4097633361816406, "learning_rate": 9.806308479691595e-05, "loss": 0.5457, "step": 18 }, { "epoch": 0.07717041800643087, "eval_loss": 0.4698084890842438, "eval_runtime": 10.8272, "eval_samples_per_second": 36.297, "eval_steps_per_second": 4.618, "step": 18 }, { "epoch": 0.09003215434083602, "grad_norm": 0.34708932042121887, "learning_rate": 9.635919272833938e-05, "loss": 0.4553, "step": 21 }, { "epoch": 0.10289389067524116, "grad_norm": 0.4105893671512604, "learning_rate": 9.414737964294636e-05, "loss": 0.4212, "step": 24 }, { "epoch": 0.1157556270096463, "grad_norm": 1.0311174392700195, "learning_rate": 9.145187862775209e-05, "loss": 0.4673, "step": 27 }, { "epoch": 0.1157556270096463, "eval_loss": 0.43113449215888977, "eval_runtime": 10.8366, "eval_samples_per_second": 36.266, "eval_steps_per_second": 4.614, "step": 27 }, { "epoch": 0.12861736334405144, "grad_norm": 0.4956219792366028, "learning_rate": 8.83022221559489e-05, "loss": 0.4289, "step": 30 }, { "epoch": 0.1414790996784566, "grad_norm": 0.4162253141403198, "learning_rate": 8.473291852294987e-05, "loss": 0.4294, "step": 33 }, { "epoch": 0.15434083601286175, "grad_norm": 1.1481068134307861, "learning_rate": 8.07830737662829e-05, "loss": 0.4328, "step": 36 }, { "epoch": 0.15434083601286175, "eval_loss": 0.41734907031059265, "eval_runtime": 10.8711, "eval_samples_per_second": 36.151, "eval_steps_per_second": 4.599, "step": 36 }, { "epoch": 0.16720257234726688, "grad_norm": 0.262054443359375, "learning_rate": 7.649596321166024e-05, "loss": 0.4119, "step": 39 }, { "epoch": 0.18006430868167203, "grad_norm": 0.3609679639339447, "learning_rate": 7.191855733945387e-05, "loss": 0.4283, "step": 42 }, { "epoch": 0.19292604501607716, "grad_norm": 0.2730075716972351, "learning_rate": 6.710100716628344e-05, "loss": 0.4492, "step": 45 }, { "epoch": 0.19292604501607716, "eval_loss": 0.4143539071083069, "eval_runtime": 10.8825, "eval_samples_per_second": 36.113, "eval_steps_per_second": 4.595, "step": 45 }, { "epoch": 0.2057877813504823, "grad_norm": 0.30750295519828796, "learning_rate": 6.209609477998338e-05, "loss": 0.4128, "step": 48 }, { "epoch": 0.21864951768488747, "grad_norm": 0.45564326643943787, "learning_rate": 5.695865504800327e-05, "loss": 0.3809, "step": 51 }, { "epoch": 0.2315112540192926, "grad_norm": 0.34823882579803467, "learning_rate": 5.174497483512506e-05, "loss": 0.4133, "step": 54 }, { "epoch": 0.2315112540192926, "eval_loss": 0.407143771648407, "eval_runtime": 10.893, "eval_samples_per_second": 36.078, "eval_steps_per_second": 4.59, "step": 54 }, { "epoch": 0.24437299035369775, "grad_norm": 0.5110918283462524, "learning_rate": 4.6512176312793736e-05, "loss": 0.4088, "step": 57 }, { "epoch": 0.2572347266881029, "grad_norm": 0.32943594455718994, "learning_rate": 4.131759111665349e-05, "loss": 0.4107, "step": 60 }, { "epoch": 0.27009646302250806, "grad_norm": 0.4684916138648987, "learning_rate": 3.6218132209150045e-05, "loss": 0.3967, "step": 63 }, { "epoch": 0.27009646302250806, "eval_loss": 0.40750572085380554, "eval_runtime": 10.9324, "eval_samples_per_second": 35.948, "eval_steps_per_second": 4.574, "step": 63 }, { "epoch": 0.2829581993569132, "grad_norm": 0.2500722408294678, "learning_rate": 3.12696703292044e-05, "loss": 0.3752, "step": 66 }, { "epoch": 0.2958199356913183, "grad_norm": 0.9661127924919128, "learning_rate": 2.6526421860705473e-05, "loss": 0.3869, "step": 69 }, { "epoch": 0.3086816720257235, "grad_norm": 0.6735724806785583, "learning_rate": 2.2040354826462668e-05, "loss": 0.3793, "step": 72 }, { "epoch": 0.3086816720257235, "eval_loss": 0.4086286723613739, "eval_runtime": 10.898, "eval_samples_per_second": 36.062, "eval_steps_per_second": 4.588, "step": 72 }, { "epoch": 0.3215434083601286, "grad_norm": 0.6262890696525574, "learning_rate": 1.7860619515673033e-05, "loss": 0.4136, "step": 75 }, { "epoch": 0.33440514469453375, "grad_norm": 0.7589296698570251, "learning_rate": 1.4033009983067452e-05, "loss": 0.4155, "step": 78 }, { "epoch": 0.34726688102893893, "grad_norm": 0.6098915338516235, "learning_rate": 1.0599462319663905e-05, "loss": 0.4119, "step": 81 }, { "epoch": 0.34726688102893893, "eval_loss": 0.40282922983169556, "eval_runtime": 10.8945, "eval_samples_per_second": 36.073, "eval_steps_per_second": 4.589, "step": 81 }, { "epoch": 0.36012861736334406, "grad_norm": 0.23373101651668549, "learning_rate": 7.597595192178702e-06, "loss": 0.3801, "step": 84 }, { "epoch": 0.3729903536977492, "grad_norm": 0.5538727641105652, "learning_rate": 5.060297685041659e-06, "loss": 0.4045, "step": 87 }, { "epoch": 0.3858520900321543, "grad_norm": 0.44058600068092346, "learning_rate": 3.0153689607045845e-06, "loss": 0.4008, "step": 90 }, { "epoch": 0.3858520900321543, "eval_loss": 0.4011549651622772, "eval_runtime": 10.8787, "eval_samples_per_second": 36.126, "eval_steps_per_second": 4.596, "step": 90 }, { "epoch": 0.3987138263665595, "grad_norm": 0.32986781001091003, "learning_rate": 1.4852136862001764e-06, "loss": 0.3769, "step": 93 }, { "epoch": 0.4115755627009646, "grad_norm": 0.2695184350013733, "learning_rate": 4.865965629214819e-07, "loss": 0.3882, "step": 96 }, { "epoch": 0.42443729903536975, "grad_norm": 0.4361032545566559, "learning_rate": 3.04586490452119e-08, "loss": 0.4191, "step": 99 }, { "epoch": 0.42443729903536975, "eval_loss": 0.4006556570529938, "eval_runtime": 10.8903, "eval_samples_per_second": 36.087, "eval_steps_per_second": 4.591, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.61253767364608e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }