|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5492, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01820830298616169, |
|
"grad_norm": 0.22140225768089294, |
|
"learning_rate": 3.642987249544627e-06, |
|
"loss": 2.4398, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03641660597232338, |
|
"grad_norm": 0.17718623578548431, |
|
"learning_rate": 7.285974499089254e-06, |
|
"loss": 2.4058, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05462490895848507, |
|
"grad_norm": 0.26951104402542114, |
|
"learning_rate": 1.0928961748633882e-05, |
|
"loss": 2.3914, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07283321194464676, |
|
"grad_norm": 0.36239442229270935, |
|
"learning_rate": 1.4571948998178507e-05, |
|
"loss": 2.3212, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09104151493080845, |
|
"grad_norm": 0.505577564239502, |
|
"learning_rate": 1.8214936247723133e-05, |
|
"loss": 2.2799, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10924981791697014, |
|
"grad_norm": 0.5019211173057556, |
|
"learning_rate": 1.999474720010985e-05, |
|
"loss": 2.2525, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12745812090313183, |
|
"grad_norm": 0.6163275837898254, |
|
"learning_rate": 1.9953983978532914e-05, |
|
"loss": 2.2219, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14566642388929352, |
|
"grad_norm": 0.6447716355323792, |
|
"learning_rate": 1.987302601308333e-05, |
|
"loss": 2.1597, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1638747268754552, |
|
"grad_norm": 0.6667017936706543, |
|
"learning_rate": 1.9752200216552278e-05, |
|
"loss": 2.1629, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1820830298616169, |
|
"grad_norm": 0.7740277647972107, |
|
"learning_rate": 1.9591994490261997e-05, |
|
"loss": 2.0851, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20029133284777859, |
|
"grad_norm": 0.8541315197944641, |
|
"learning_rate": 1.9393055753893e-05, |
|
"loss": 2.1176, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21849963583394028, |
|
"grad_norm": 0.7334938645362854, |
|
"learning_rate": 1.915618733318621e-05, |
|
"loss": 2.1079, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23670793882010197, |
|
"grad_norm": 0.7850305438041687, |
|
"learning_rate": 1.8882345716068708e-05, |
|
"loss": 2.0558, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.25491624180626365, |
|
"grad_norm": 0.8258129358291626, |
|
"learning_rate": 1.8572636690301997e-05, |
|
"loss": 2.0657, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27312454479242537, |
|
"grad_norm": 0.7766979932785034, |
|
"learning_rate": 1.8228310878249212e-05, |
|
"loss": 2.061, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29133284777858703, |
|
"grad_norm": 1.0766857862472534, |
|
"learning_rate": 1.7850758686792054e-05, |
|
"loss": 2.0807, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.30954115076474875, |
|
"grad_norm": 0.7605693936347961, |
|
"learning_rate": 1.7441504692790104e-05, |
|
"loss": 2.0452, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3277494537509104, |
|
"grad_norm": 0.8467194437980652, |
|
"learning_rate": 1.700220148675417e-05, |
|
"loss": 2.0589, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.34595775673707213, |
|
"grad_norm": 0.8118539452552795, |
|
"learning_rate": 1.6534622999593437e-05, |
|
"loss": 2.0793, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3641660597232338, |
|
"grad_norm": 0.9152652621269226, |
|
"learning_rate": 1.6040657339383255e-05, |
|
"loss": 2.0462, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3823743627093955, |
|
"grad_norm": 0.7831888794898987, |
|
"learning_rate": 1.5522299167079173e-05, |
|
"loss": 2.0275, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.40058266569555717, |
|
"grad_norm": 1.125390887260437, |
|
"learning_rate": 1.4981641641964437e-05, |
|
"loss": 2.0157, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4187909686817189, |
|
"grad_norm": 0.9301549196243286, |
|
"learning_rate": 1.44208679693558e-05, |
|
"loss": 2.0384, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.43699927166788055, |
|
"grad_norm": 0.8163829445838928, |
|
"learning_rate": 1.384224258469838e-05, |
|
"loss": 2.0331, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.45520757465404227, |
|
"grad_norm": 0.8261762261390686, |
|
"learning_rate": 1.3248102009648686e-05, |
|
"loss": 1.9856, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.47341587764020393, |
|
"grad_norm": 1.0488537549972534, |
|
"learning_rate": 1.2640845417069571e-05, |
|
"loss": 2.031, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.49162418062636565, |
|
"grad_norm": 1.400754451751709, |
|
"learning_rate": 1.2022924943036024e-05, |
|
"loss": 2.0352, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5098324836125273, |
|
"grad_norm": 0.9314346313476562, |
|
"learning_rate": 1.139683578497262e-05, |
|
"loss": 2.0303, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.528040786598689, |
|
"grad_norm": 1.420285701751709, |
|
"learning_rate": 1.0765106125906782e-05, |
|
"loss": 2.0071, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5462490895848507, |
|
"grad_norm": 1.0269055366516113, |
|
"learning_rate": 1.0130286925524367e-05, |
|
"loss": 1.9697, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5644573925710124, |
|
"grad_norm": 0.9681551456451416, |
|
"learning_rate": 9.494941619251817e-06, |
|
"loss": 2.0246, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5826656955571741, |
|
"grad_norm": 1.1162570714950562, |
|
"learning_rate": 8.861635766960579e-06, |
|
"loss": 1.9988, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6008739985433358, |
|
"grad_norm": 0.9726856350898743, |
|
"learning_rate": 8.232926693092881e-06, |
|
"loss": 1.9904, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6190823015294975, |
|
"grad_norm": 1.2489055395126343, |
|
"learning_rate": 7.611353160042658e-06, |
|
"loss": 1.9702, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6372906045156591, |
|
"grad_norm": 1.1108543872833252, |
|
"learning_rate": 6.99942511649105e-06, |
|
"loss": 2.0471, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6554989075018208, |
|
"grad_norm": 1.0132871866226196, |
|
"learning_rate": 6.399613562093272e-06, |
|
"loss": 2.0539, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6737072104879825, |
|
"grad_norm": 1.1130536794662476, |
|
"learning_rate": 5.814340569443867e-06, |
|
"loss": 2.0095, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6919155134741443, |
|
"grad_norm": 0.7807123064994812, |
|
"learning_rate": 5.245969503612125e-06, |
|
"loss": 1.9232, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7101238164603059, |
|
"grad_norm": 1.0500433444976807, |
|
"learning_rate": 4.696795478741786e-06, |
|
"loss": 1.986, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7283321194464676, |
|
"grad_norm": 0.9565054774284363, |
|
"learning_rate": 4.169036090251809e-06, |
|
"loss": 2.0505, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7465404224326293, |
|
"grad_norm": 0.8539314270019531, |
|
"learning_rate": 3.6648224600620653e-06, |
|
"loss": 2.0069, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.764748725418791, |
|
"grad_norm": 0.9163336157798767, |
|
"learning_rate": 3.1861906310038825e-06, |
|
"loss": 1.9723, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7829570284049526, |
|
"grad_norm": 0.9746565222740173, |
|
"learning_rate": 2.735073345165228e-06, |
|
"loss": 1.9787, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8011653313911143, |
|
"grad_norm": 1.0261231660842896, |
|
"learning_rate": 2.313292239370102e-06, |
|
"loss": 2.0124, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8193736343772761, |
|
"grad_norm": 1.207588791847229, |
|
"learning_rate": 1.9225504893071823e-06, |
|
"loss": 1.9749, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8375819373634378, |
|
"grad_norm": 1.0550912618637085, |
|
"learning_rate": 1.5644259320111733e-06, |
|
"loss": 1.9382, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8557902403495994, |
|
"grad_norm": 0.9657886624336243, |
|
"learning_rate": 1.2403646944686198e-06, |
|
"loss": 1.9893, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8739985433357611, |
|
"grad_norm": 1.1723967790603638, |
|
"learning_rate": 9.516753540762868e-07, |
|
"loss": 1.9816, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8922068463219228, |
|
"grad_norm": 1.271134376525879, |
|
"learning_rate": 6.995236545324624e-07, |
|
"loss": 1.9485, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9104151493080845, |
|
"grad_norm": 0.9425839185714722, |
|
"learning_rate": 4.849277984987221e-07, |
|
"loss": 1.9754, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9286234522942461, |
|
"grad_norm": 1.077772617340088, |
|
"learning_rate": 3.0875433604064976e-07, |
|
"loss": 2.0007, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9468317552804079, |
|
"grad_norm": 1.13583242893219, |
|
"learning_rate": 1.7171466545021665e-07, |
|
"loss": 1.926, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9650400582665696, |
|
"grad_norm": 0.9153927564620972, |
|
"learning_rate": 7.436216057970735e-08, |
|
"loss": 2.0055, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9832483612527313, |
|
"grad_norm": 1.2890238761901855, |
|
"learning_rate": 1.708993628716016e-08, |
|
"loss": 2.0217, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.922713041305542, |
|
"eval_runtime": 63.7429, |
|
"eval_samples_per_second": 15.578, |
|
"eval_steps_per_second": 1.961, |
|
"step": 5492 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5492, |
|
"total_flos": 1.0003539689472e+17, |
|
"train_loss": 2.061430201075499, |
|
"train_runtime": 1617.6118, |
|
"train_samples_per_second": 6.79, |
|
"train_steps_per_second": 3.395 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5492, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0003539689472e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|