|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0896, |
|
"eval_steps": 500, |
|
"global_step": 70, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00128, |
|
"grad_norm": 1.9465926503876676, |
|
"learning_rate": 2.0833333333333333e-07, |
|
"loss": 0.6416, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00256, |
|
"grad_norm": 2.0124156993303566, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 0.6669, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00384, |
|
"grad_norm": 2.087646283120165, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.6672, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 2.0504712472935056, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.6653, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 1.815879626276492, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 0.6509, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00768, |
|
"grad_norm": 1.8764565776467699, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.646, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00896, |
|
"grad_norm": 1.8629002346447046, |
|
"learning_rate": 1.4583333333333335e-06, |
|
"loss": 0.6572, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01024, |
|
"grad_norm": 1.8435249737816124, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.66, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01152, |
|
"grad_norm": 1.922292840935334, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.6572, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.8885131960372619, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.6462, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01408, |
|
"grad_norm": 1.6704165936125248, |
|
"learning_rate": 2.2916666666666666e-06, |
|
"loss": 0.6475, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01536, |
|
"grad_norm": 1.725582114057618, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6193, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01664, |
|
"grad_norm": 1.6281351666821622, |
|
"learning_rate": 2.7083333333333334e-06, |
|
"loss": 0.6427, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01792, |
|
"grad_norm": 0.9742610148915729, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 0.6003, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 0.9408138329286213, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.5782, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02048, |
|
"grad_norm": 0.8871297966814529, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.5642, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02176, |
|
"grad_norm": 0.8677350156338169, |
|
"learning_rate": 3.5416666666666673e-06, |
|
"loss": 0.5752, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02304, |
|
"grad_norm": 0.8296766137632919, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.5495, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02432, |
|
"grad_norm": 0.6923202785291074, |
|
"learning_rate": 3.958333333333333e-06, |
|
"loss": 0.4964, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 0.7854379694338149, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.479, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02688, |
|
"grad_norm": 0.7829163717626061, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.4801, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02816, |
|
"grad_norm": 0.6533431708027024, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 0.4612, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.02944, |
|
"grad_norm": 0.5572023606265587, |
|
"learning_rate": 4.791666666666668e-06, |
|
"loss": 0.4605, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03072, |
|
"grad_norm": 0.5293319538350173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4639, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.41911726053111276, |
|
"learning_rate": 4.999978471321311e-06, |
|
"loss": 0.4227, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03328, |
|
"grad_norm": 0.7172502383894598, |
|
"learning_rate": 4.999913885656027e-06, |
|
"loss": 0.3964, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03456, |
|
"grad_norm": 0.5913996068423693, |
|
"learning_rate": 4.999806244116505e-06, |
|
"loss": 0.4059, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03584, |
|
"grad_norm": 0.48055748126084374, |
|
"learning_rate": 4.999655548556651e-06, |
|
"loss": 0.4107, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03712, |
|
"grad_norm": 0.41925076244744985, |
|
"learning_rate": 4.999461801571884e-06, |
|
"loss": 0.3989, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 0.3653215916567385, |
|
"learning_rate": 4.999225006499096e-06, |
|
"loss": 0.3955, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03968, |
|
"grad_norm": 0.32516349299172237, |
|
"learning_rate": 4.998945167416598e-06, |
|
"loss": 0.3883, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04096, |
|
"grad_norm": 0.27971440200729386, |
|
"learning_rate": 4.998622289144039e-06, |
|
"loss": 0.3957, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04224, |
|
"grad_norm": 0.2766170998438116, |
|
"learning_rate": 4.9982563772423375e-06, |
|
"loss": 0.3848, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04352, |
|
"grad_norm": 0.30360945437785425, |
|
"learning_rate": 4.99784743801357e-06, |
|
"loss": 0.3938, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 0.28460957744347015, |
|
"learning_rate": 4.997395478500874e-06, |
|
"loss": 0.368, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04608, |
|
"grad_norm": 0.2850953684320096, |
|
"learning_rate": 4.996900506488323e-06, |
|
"loss": 0.3724, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04736, |
|
"grad_norm": 0.25012126257873374, |
|
"learning_rate": 4.9963625305007925e-06, |
|
"loss": 0.361, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04864, |
|
"grad_norm": 0.24498070894576404, |
|
"learning_rate": 4.995781559803811e-06, |
|
"loss": 0.3806, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04992, |
|
"grad_norm": 0.22591581090418575, |
|
"learning_rate": 4.995157604403404e-06, |
|
"loss": 0.363, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 0.22056309851677902, |
|
"learning_rate": 4.99449067504592e-06, |
|
"loss": 0.3587, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05248, |
|
"grad_norm": 0.20829370113607343, |
|
"learning_rate": 4.993780783217844e-06, |
|
"loss": 0.3727, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05376, |
|
"grad_norm": 0.19999252558449981, |
|
"learning_rate": 4.993027941145604e-06, |
|
"loss": 0.3506, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.05504, |
|
"grad_norm": 0.2086064795316142, |
|
"learning_rate": 4.992232161795356e-06, |
|
"loss": 0.3485, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05632, |
|
"grad_norm": 0.18277952791784172, |
|
"learning_rate": 4.9913934588727615e-06, |
|
"loss": 0.3334, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 0.18383586373314206, |
|
"learning_rate": 4.990511846822754e-06, |
|
"loss": 0.3531, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05888, |
|
"grad_norm": 0.18390959318122982, |
|
"learning_rate": 4.9895873408292875e-06, |
|
"loss": 0.3539, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06016, |
|
"grad_norm": 0.18173318091193896, |
|
"learning_rate": 4.988619956815074e-06, |
|
"loss": 0.3452, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06144, |
|
"grad_norm": 0.1729819420549717, |
|
"learning_rate": 4.987609711441316e-06, |
|
"loss": 0.3239, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06272, |
|
"grad_norm": 0.17347984174169176, |
|
"learning_rate": 4.98655662210741e-06, |
|
"loss": 0.3475, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.1699972869063586, |
|
"learning_rate": 4.985460706950655e-06, |
|
"loss": 0.3513, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06528, |
|
"grad_norm": 0.1606057290926495, |
|
"learning_rate": 4.984321984845934e-06, |
|
"loss": 0.3335, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06656, |
|
"grad_norm": 0.16534392303574644, |
|
"learning_rate": 4.9831404754053935e-06, |
|
"loss": 0.3345, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06784, |
|
"grad_norm": 0.23216304038143887, |
|
"learning_rate": 4.981916198978103e-06, |
|
"loss": 0.3487, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06912, |
|
"grad_norm": 0.1516460550172589, |
|
"learning_rate": 4.980649176649705e-06, |
|
"loss": 0.3369, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 0.1448627378722896, |
|
"learning_rate": 4.979339430242053e-06, |
|
"loss": 0.3339, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07168, |
|
"grad_norm": 0.1470353182317087, |
|
"learning_rate": 4.9779869823128356e-06, |
|
"loss": 0.3356, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07296, |
|
"grad_norm": 0.14370030555945446, |
|
"learning_rate": 4.976591856155187e-06, |
|
"loss": 0.3427, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07424, |
|
"grad_norm": 0.14698377717239064, |
|
"learning_rate": 4.975154075797281e-06, |
|
"loss": 0.3364, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07552, |
|
"grad_norm": 0.1452001418009019, |
|
"learning_rate": 4.973673666001932e-06, |
|
"loss": 0.3287, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.1533193038026651, |
|
"learning_rate": 4.972150652266151e-06, |
|
"loss": 0.32, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07808, |
|
"grad_norm": 0.14661431291205376, |
|
"learning_rate": 4.970585060820717e-06, |
|
"loss": 0.351, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.07936, |
|
"grad_norm": 0.16618365600060153, |
|
"learning_rate": 4.968976918629722e-06, |
|
"loss": 0.3545, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.08064, |
|
"grad_norm": 0.1423441821889856, |
|
"learning_rate": 4.967326253390107e-06, |
|
"loss": 0.3377, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08192, |
|
"grad_norm": 0.14484043253247622, |
|
"learning_rate": 4.965633093531186e-06, |
|
"loss": 0.3314, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 0.1465728819756019, |
|
"learning_rate": 4.963897468214154e-06, |
|
"loss": 0.3365, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08448, |
|
"grad_norm": 0.14519445175592308, |
|
"learning_rate": 4.962119407331587e-06, |
|
"loss": 0.3266, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08576, |
|
"grad_norm": 0.21106088971282316, |
|
"learning_rate": 4.960298941506927e-06, |
|
"loss": 0.3365, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.08704, |
|
"grad_norm": 0.14358267564891752, |
|
"learning_rate": 4.958436102093951e-06, |
|
"loss": 0.3355, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08832, |
|
"grad_norm": 0.1392772159594871, |
|
"learning_rate": 4.956530921176238e-06, |
|
"loss": 0.326, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 0.1403897051822436, |
|
"learning_rate": 4.954583431566609e-06, |
|
"loss": 0.3113, |
|
"step": 70 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 781, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 89651498975232.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|