Nexspear's picture
Training in progress, step 100, checkpoint
6003729 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3325020781379884,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033250207813798837,
"eval_loss": 2.4745399951934814,
"eval_runtime": 35.0849,
"eval_samples_per_second": 14.451,
"eval_steps_per_second": 1.824,
"step": 1
},
{
"epoch": 0.00997506234413965,
"grad_norm": 1.8293392658233643,
"learning_rate": 1.5e-05,
"loss": 2.4184,
"step": 3
},
{
"epoch": 0.0199501246882793,
"grad_norm": 2.3942174911499023,
"learning_rate": 3e-05,
"loss": 2.4593,
"step": 6
},
{
"epoch": 0.029925187032418952,
"grad_norm": 1.915959358215332,
"learning_rate": 4.5e-05,
"loss": 2.2337,
"step": 9
},
{
"epoch": 0.029925187032418952,
"eval_loss": 2.385331153869629,
"eval_runtime": 35.4941,
"eval_samples_per_second": 14.284,
"eval_steps_per_second": 1.803,
"step": 9
},
{
"epoch": 0.0399002493765586,
"grad_norm": 2.4679598808288574,
"learning_rate": 4.993910125649561e-05,
"loss": 2.4138,
"step": 12
},
{
"epoch": 0.04987531172069826,
"grad_norm": 2.2216885089874268,
"learning_rate": 4.962019382530521e-05,
"loss": 2.1899,
"step": 15
},
{
"epoch": 0.059850374064837904,
"grad_norm": 2.1264395713806152,
"learning_rate": 4.9031542398457974e-05,
"loss": 1.9607,
"step": 18
},
{
"epoch": 0.059850374064837904,
"eval_loss": 1.7929750680923462,
"eval_runtime": 35.5582,
"eval_samples_per_second": 14.258,
"eval_steps_per_second": 1.8,
"step": 18
},
{
"epoch": 0.06982543640897755,
"grad_norm": 1.875053882598877,
"learning_rate": 4.817959636416969e-05,
"loss": 1.7054,
"step": 21
},
{
"epoch": 0.0798004987531172,
"grad_norm": 1.6136735677719116,
"learning_rate": 4.707368982147318e-05,
"loss": 1.4407,
"step": 24
},
{
"epoch": 0.08977556109725686,
"grad_norm": 1.430875301361084,
"learning_rate": 4.572593931387604e-05,
"loss": 1.4597,
"step": 27
},
{
"epoch": 0.08977556109725686,
"eval_loss": 1.4339165687561035,
"eval_runtime": 35.58,
"eval_samples_per_second": 14.25,
"eval_steps_per_second": 1.799,
"step": 27
},
{
"epoch": 0.09975062344139651,
"grad_norm": 1.4442172050476074,
"learning_rate": 4.415111107797445e-05,
"loss": 1.4351,
"step": 30
},
{
"epoch": 0.10972568578553615,
"grad_norm": 1.886985421180725,
"learning_rate": 4.2366459261474933e-05,
"loss": 1.4478,
"step": 33
},
{
"epoch": 0.11970074812967581,
"grad_norm": 1.4151262044906616,
"learning_rate": 4.039153688314145e-05,
"loss": 1.2824,
"step": 36
},
{
"epoch": 0.11970074812967581,
"eval_loss": 1.3912534713745117,
"eval_runtime": 35.581,
"eval_samples_per_second": 14.249,
"eval_steps_per_second": 1.799,
"step": 36
},
{
"epoch": 0.12967581047381546,
"grad_norm": 1.5250979661941528,
"learning_rate": 3.824798160583012e-05,
"loss": 1.3491,
"step": 39
},
{
"epoch": 0.1396508728179551,
"grad_norm": 1.3472334146499634,
"learning_rate": 3.5959278669726935e-05,
"loss": 1.4616,
"step": 42
},
{
"epoch": 0.14962593516209477,
"grad_norm": 1.2380613088607788,
"learning_rate": 3.355050358314172e-05,
"loss": 1.2844,
"step": 45
},
{
"epoch": 0.14962593516209477,
"eval_loss": 1.3702603578567505,
"eval_runtime": 35.6209,
"eval_samples_per_second": 14.233,
"eval_steps_per_second": 1.797,
"step": 45
},
{
"epoch": 0.1596009975062344,
"grad_norm": 1.376523494720459,
"learning_rate": 3.104804738999169e-05,
"loss": 1.3452,
"step": 48
},
{
"epoch": 0.16957605985037408,
"grad_norm": 1.2102608680725098,
"learning_rate": 2.8479327524001636e-05,
"loss": 1.3209,
"step": 51
},
{
"epoch": 0.17955112219451372,
"grad_norm": 1.7671351432800293,
"learning_rate": 2.587248741756253e-05,
"loss": 1.4073,
"step": 54
},
{
"epoch": 0.17955112219451372,
"eval_loss": 1.3600367307662964,
"eval_runtime": 35.6092,
"eval_samples_per_second": 14.238,
"eval_steps_per_second": 1.797,
"step": 54
},
{
"epoch": 0.18952618453865336,
"grad_norm": 1.6414278745651245,
"learning_rate": 2.3256088156396868e-05,
"loss": 1.4529,
"step": 57
},
{
"epoch": 0.19950124688279303,
"grad_norm": 1.6693403720855713,
"learning_rate": 2.0658795558326743e-05,
"loss": 1.263,
"step": 60
},
{
"epoch": 0.20947630922693267,
"grad_norm": 1.2293144464492798,
"learning_rate": 1.8109066104575023e-05,
"loss": 1.374,
"step": 63
},
{
"epoch": 0.20947630922693267,
"eval_loss": 1.3520565032958984,
"eval_runtime": 35.64,
"eval_samples_per_second": 14.226,
"eval_steps_per_second": 1.796,
"step": 63
},
{
"epoch": 0.2194513715710723,
"grad_norm": 1.280704379081726,
"learning_rate": 1.56348351646022e-05,
"loss": 1.2475,
"step": 66
},
{
"epoch": 0.22942643391521197,
"grad_norm": 1.503631591796875,
"learning_rate": 1.3263210930352737e-05,
"loss": 1.3387,
"step": 69
},
{
"epoch": 0.23940149625935161,
"grad_norm": 1.318060278892517,
"learning_rate": 1.1020177413231334e-05,
"loss": 1.2855,
"step": 72
},
{
"epoch": 0.23940149625935161,
"eval_loss": 1.346817970275879,
"eval_runtime": 35.6,
"eval_samples_per_second": 14.242,
"eval_steps_per_second": 1.798,
"step": 72
},
{
"epoch": 0.24937655860349128,
"grad_norm": 1.3991254568099976,
"learning_rate": 8.930309757836517e-06,
"loss": 1.2667,
"step": 75
},
{
"epoch": 0.2593516209476309,
"grad_norm": 1.3369468450546265,
"learning_rate": 7.016504991533726e-06,
"loss": 1.2339,
"step": 78
},
{
"epoch": 0.26932668329177056,
"grad_norm": 1.3982964754104614,
"learning_rate": 5.299731159831953e-06,
"loss": 1.3598,
"step": 81
},
{
"epoch": 0.26932668329177056,
"eval_loss": 1.3438166379928589,
"eval_runtime": 35.6426,
"eval_samples_per_second": 14.225,
"eval_steps_per_second": 1.796,
"step": 81
},
{
"epoch": 0.2793017456359102,
"grad_norm": 1.6268279552459717,
"learning_rate": 3.798797596089351e-06,
"loss": 1.258,
"step": 84
},
{
"epoch": 0.2892768079800499,
"grad_norm": 1.3860325813293457,
"learning_rate": 2.5301488425208296e-06,
"loss": 1.3495,
"step": 87
},
{
"epoch": 0.29925187032418954,
"grad_norm": 1.7704390287399292,
"learning_rate": 1.5076844803522922e-06,
"loss": 1.312,
"step": 90
},
{
"epoch": 0.29925187032418954,
"eval_loss": 1.3430734872817993,
"eval_runtime": 35.5853,
"eval_samples_per_second": 14.247,
"eval_steps_per_second": 1.798,
"step": 90
},
{
"epoch": 0.3092269326683292,
"grad_norm": 1.516430377960205,
"learning_rate": 7.426068431000882e-07,
"loss": 1.2602,
"step": 93
},
{
"epoch": 0.3192019950124688,
"grad_norm": 1.294288158416748,
"learning_rate": 2.4329828146074095e-07,
"loss": 1.3084,
"step": 96
},
{
"epoch": 0.32917705735660846,
"grad_norm": 1.539898157119751,
"learning_rate": 1.522932452260595e-08,
"loss": 1.4346,
"step": 99
},
{
"epoch": 0.32917705735660846,
"eval_loss": 1.3427472114562988,
"eval_runtime": 35.5324,
"eval_samples_per_second": 14.269,
"eval_steps_per_second": 1.801,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.483774567120896e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}