{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 17.094017094017094,
  "eval_steps": 1000,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005698005698005698,
      "grad_norm": 5.125274658203125,
      "learning_rate": 0.0,
      "loss": 1.4488,
      "step": 1
    },
    {
      "epoch": 0.2849002849002849,
      "grad_norm": 1.6660258769989014,
      "learning_rate": 0.0001977977977977978,
      "loss": 0.7332,
      "step": 50
    },
    {
      "epoch": 0.5698005698005698,
      "grad_norm": 1.358102560043335,
      "learning_rate": 0.0001952952952952953,
      "loss": 0.6182,
      "step": 100
    },
    {
      "epoch": 0.8547008547008547,
      "grad_norm": 1.7879210710525513,
      "learning_rate": 0.00019279279279279282,
      "loss": 0.5843,
      "step": 150
    },
    {
      "epoch": 1.1396011396011396,
      "grad_norm": 2.010268449783325,
      "learning_rate": 0.0001902902902902903,
      "loss": 0.4683,
      "step": 200
    },
    {
      "epoch": 1.4245014245014245,
      "grad_norm": 2.0085535049438477,
      "learning_rate": 0.0001877877877877878,
      "loss": 0.3547,
      "step": 250
    },
    {
      "epoch": 1.7094017094017095,
      "grad_norm": 2.418802261352539,
      "learning_rate": 0.00018528528528528532,
      "loss": 0.3364,
      "step": 300
    },
    {
      "epoch": 1.9943019943019942,
      "grad_norm": 2.5009264945983887,
      "learning_rate": 0.0001827827827827828,
      "loss": 0.366,
      "step": 350
    },
    {
      "epoch": 2.2792022792022792,
      "grad_norm": 2.0691003799438477,
      "learning_rate": 0.00018028028028028027,
      "loss": 0.1792,
      "step": 400
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 2.531602621078491,
      "learning_rate": 0.00017777777777777779,
      "loss": 0.1762,
      "step": 450
    },
    {
      "epoch": 2.849002849002849,
      "grad_norm": 2.9127748012542725,
      "learning_rate": 0.00017527527527527528,
      "loss": 0.1918,
      "step": 500
    },
    {
      "epoch": 3.133903133903134,
      "grad_norm": 2.571071147918701,
      "learning_rate": 0.00017277277277277277,
      "loss": 0.1541,
      "step": 550
    },
    {
      "epoch": 3.4188034188034186,
      "grad_norm": 1.5915725231170654,
      "learning_rate": 0.00017027027027027028,
      "loss": 0.1156,
      "step": 600
    },
    {
      "epoch": 3.7037037037037037,
      "grad_norm": 1.2947059869766235,
      "learning_rate": 0.00016776776776776777,
      "loss": 0.1157,
      "step": 650
    },
    {
      "epoch": 3.9886039886039883,
      "grad_norm": 1.7114081382751465,
      "learning_rate": 0.00016526526526526526,
      "loss": 0.1309,
      "step": 700
    },
    {
      "epoch": 4.273504273504273,
      "grad_norm": 2.1177897453308105,
      "learning_rate": 0.00016276276276276275,
      "loss": 0.099,
      "step": 750
    },
    {
      "epoch": 4.5584045584045585,
      "grad_norm": 2.950777292251587,
      "learning_rate": 0.00016026026026026027,
      "loss": 0.1007,
      "step": 800
    },
    {
      "epoch": 4.843304843304844,
      "grad_norm": 2.4155728816986084,
      "learning_rate": 0.00015775775775775776,
      "loss": 0.102,
      "step": 850
    },
    {
      "epoch": 5.128205128205128,
      "grad_norm": 1.3441689014434814,
      "learning_rate": 0.00015525525525525525,
      "loss": 0.0948,
      "step": 900
    },
    {
      "epoch": 5.413105413105413,
      "grad_norm": 1.041314721107483,
      "learning_rate": 0.00015275275275275277,
      "loss": 0.0851,
      "step": 950
    },
    {
      "epoch": 5.698005698005698,
      "grad_norm": 1.53568696975708,
      "learning_rate": 0.00015025025025025026,
      "loss": 0.0876,
      "step": 1000
    },
    {
      "epoch": 5.982905982905983,
      "grad_norm": 1.0592001676559448,
      "learning_rate": 0.00014774774774774775,
      "loss": 0.0909,
      "step": 1050
    },
    {
      "epoch": 6.267806267806268,
      "grad_norm": 1.4785758256912231,
      "learning_rate": 0.00014524524524524526,
      "loss": 0.0838,
      "step": 1100
    },
    {
      "epoch": 6.552706552706553,
      "grad_norm": 0.6601145267486572,
      "learning_rate": 0.00014274274274274275,
      "loss": 0.0827,
      "step": 1150
    },
    {
      "epoch": 6.837606837606837,
      "grad_norm": 1.1280301809310913,
      "learning_rate": 0.00014024024024024024,
      "loss": 0.0762,
      "step": 1200
    },
    {
      "epoch": 7.122507122507122,
      "grad_norm": 1.467143177986145,
      "learning_rate": 0.00013773773773773776,
      "loss": 0.0799,
      "step": 1250
    },
    {
      "epoch": 7.407407407407407,
      "grad_norm": 1.773697018623352,
      "learning_rate": 0.00013523523523523525,
      "loss": 0.0765,
      "step": 1300
    },
    {
      "epoch": 7.6923076923076925,
      "grad_norm": 3.1821281909942627,
      "learning_rate": 0.00013273273273273274,
      "loss": 0.0787,
      "step": 1350
    },
    {
      "epoch": 7.977207977207978,
      "grad_norm": 0.9911046028137207,
      "learning_rate": 0.00013023023023023023,
      "loss": 0.0731,
      "step": 1400
    },
    {
      "epoch": 8.262108262108262,
      "grad_norm": 2.560380458831787,
      "learning_rate": 0.00012772772772772775,
      "loss": 0.069,
      "step": 1450
    },
    {
      "epoch": 8.547008547008547,
      "grad_norm": 1.88752281665802,
      "learning_rate": 0.00012522522522522524,
      "loss": 0.0708,
      "step": 1500
    },
    {
      "epoch": 8.831908831908832,
      "grad_norm": 1.3793889284133911,
      "learning_rate": 0.00012272272272272273,
      "loss": 0.0748,
      "step": 1550
    },
    {
      "epoch": 9.116809116809117,
      "grad_norm": 0.7198516130447388,
      "learning_rate": 0.00012022022022022023,
      "loss": 0.0695,
      "step": 1600
    },
    {
      "epoch": 9.401709401709402,
      "grad_norm": 0.4231278896331787,
      "learning_rate": 0.00011771771771771771,
      "loss": 0.0671,
      "step": 1650
    },
    {
      "epoch": 9.686609686609687,
      "grad_norm": 2.0300514698028564,
      "learning_rate": 0.00011521521521521521,
      "loss": 0.0659,
      "step": 1700
    },
    {
      "epoch": 9.971509971509972,
      "grad_norm": 1.0142486095428467,
      "learning_rate": 0.00011271271271271271,
      "loss": 0.0714,
      "step": 1750
    },
    {
      "epoch": 10.256410256410255,
      "grad_norm": 0.3013649582862854,
      "learning_rate": 0.0001102102102102102,
      "loss": 0.0625,
      "step": 1800
    },
    {
      "epoch": 10.54131054131054,
      "grad_norm": 0.974827766418457,
      "learning_rate": 0.00010770770770770771,
      "loss": 0.0651,
      "step": 1850
    },
    {
      "epoch": 10.826210826210826,
      "grad_norm": 1.0164273977279663,
      "learning_rate": 0.0001052052052052052,
      "loss": 0.066,
      "step": 1900
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 0.12340305745601654,
      "learning_rate": 0.0001027027027027027,
      "loss": 0.0658,
      "step": 1950
    },
    {
      "epoch": 11.396011396011396,
      "grad_norm": 1.1315168142318726,
      "learning_rate": 0.0001002002002002002,
      "loss": 0.0611,
      "step": 2000
    },
    {
      "epoch": 11.68091168091168,
      "grad_norm": 0.2988643944263458,
      "learning_rate": 9.76976976976977e-05,
      "loss": 0.0607,
      "step": 2050
    },
    {
      "epoch": 11.965811965811966,
      "grad_norm": 0.8949713110923767,
      "learning_rate": 9.51951951951952e-05,
      "loss": 0.0632,
      "step": 2100
    },
    {
      "epoch": 12.250712250712251,
      "grad_norm": 0.11667460948228836,
      "learning_rate": 9.26926926926927e-05,
      "loss": 0.0605,
      "step": 2150
    },
    {
      "epoch": 12.535612535612536,
      "grad_norm": 0.1387569159269333,
      "learning_rate": 9.019019019019019e-05,
      "loss": 0.0605,
      "step": 2200
    },
    {
      "epoch": 12.820512820512821,
      "grad_norm": 0.4826744794845581,
      "learning_rate": 8.76876876876877e-05,
      "loss": 0.0619,
      "step": 2250
    },
    {
      "epoch": 13.105413105413106,
      "grad_norm": 0.09396378695964813,
      "learning_rate": 8.518518518518518e-05,
      "loss": 0.0588,
      "step": 2300
    },
    {
      "epoch": 13.39031339031339,
      "grad_norm": 0.6452879905700684,
      "learning_rate": 8.268268268268269e-05,
      "loss": 0.0584,
      "step": 2350
    },
    {
      "epoch": 13.675213675213675,
      "grad_norm": 0.5694031119346619,
      "learning_rate": 8.018018018018019e-05,
      "loss": 0.0582,
      "step": 2400
    },
    {
      "epoch": 13.96011396011396,
      "grad_norm": 0.34324464201927185,
      "learning_rate": 7.767767767767768e-05,
      "loss": 0.0602,
      "step": 2450
    },
    {
      "epoch": 14.245014245014245,
      "grad_norm": 0.07841510325670242,
      "learning_rate": 7.517517517517519e-05,
      "loss": 0.0593,
      "step": 2500
    },
    {
      "epoch": 14.52991452991453,
      "grad_norm": 0.1685921549797058,
      "learning_rate": 7.267267267267268e-05,
      "loss": 0.0579,
      "step": 2550
    },
    {
      "epoch": 14.814814814814815,
      "grad_norm": 0.32586222887039185,
      "learning_rate": 7.017017017017016e-05,
      "loss": 0.0533,
      "step": 2600
    },
    {
      "epoch": 15.0997150997151,
      "grad_norm": 0.6495370864868164,
      "learning_rate": 6.766766766766767e-05,
      "loss": 0.0575,
      "step": 2650
    },
    {
      "epoch": 15.384615384615385,
      "grad_norm": 0.10936163365840912,
      "learning_rate": 6.516516516516516e-05,
      "loss": 0.0539,
      "step": 2700
    },
    {
      "epoch": 15.66951566951567,
      "grad_norm": 0.09928351640701294,
      "learning_rate": 6.266266266266266e-05,
      "loss": 0.0573,
      "step": 2750
    },
    {
      "epoch": 15.954415954415955,
      "grad_norm": 0.07429605722427368,
      "learning_rate": 6.016016016016016e-05,
      "loss": 0.0541,
      "step": 2800
    },
    {
      "epoch": 16.23931623931624,
      "grad_norm": 0.0647626668214798,
      "learning_rate": 5.765765765765766e-05,
      "loss": 0.0546,
      "step": 2850
    },
    {
      "epoch": 16.524216524216524,
      "grad_norm": 0.06490299850702286,
      "learning_rate": 5.515515515515516e-05,
      "loss": 0.0537,
      "step": 2900
    },
    {
      "epoch": 16.80911680911681,
      "grad_norm": 0.07492049783468246,
      "learning_rate": 5.2652652652652655e-05,
      "loss": 0.0567,
      "step": 2950
    },
    {
      "epoch": 17.094017094017094,
      "grad_norm": 0.18874266743659973,
      "learning_rate": 5.015015015015015e-05,
      "loss": 0.0534,
      "step": 3000
    }
  ],
  "logging_steps": 50,
  "max_steps": 4001,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 23,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.179327598749286e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}