{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0896, "eval_steps": 500, "global_step": 70, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00128, "grad_norm": 1.9465926503876676, "learning_rate": 2.0833333333333333e-07, "loss": 0.6416, "step": 1 }, { "epoch": 0.00256, "grad_norm": 2.0124156993303566, "learning_rate": 4.1666666666666667e-07, "loss": 0.6669, "step": 2 }, { "epoch": 0.00384, "grad_norm": 2.087646283120165, "learning_rate": 6.25e-07, "loss": 0.6672, "step": 3 }, { "epoch": 0.00512, "grad_norm": 2.0504712472935056, "learning_rate": 8.333333333333333e-07, "loss": 0.6653, "step": 4 }, { "epoch": 0.0064, "grad_norm": 1.815879626276492, "learning_rate": 1.0416666666666667e-06, "loss": 0.6509, "step": 5 }, { "epoch": 0.00768, "grad_norm": 1.8764565776467699, "learning_rate": 1.25e-06, "loss": 0.646, "step": 6 }, { "epoch": 0.00896, "grad_norm": 1.8629002346447046, "learning_rate": 1.4583333333333335e-06, "loss": 0.6572, "step": 7 }, { "epoch": 0.01024, "grad_norm": 1.8435249737816124, "learning_rate": 1.6666666666666667e-06, "loss": 0.66, "step": 8 }, { "epoch": 0.01152, "grad_norm": 1.922292840935334, "learning_rate": 1.8750000000000003e-06, "loss": 0.6572, "step": 9 }, { "epoch": 0.0128, "grad_norm": 1.8885131960372619, "learning_rate": 2.0833333333333334e-06, "loss": 0.6462, "step": 10 }, { "epoch": 0.01408, "grad_norm": 1.6704165936125248, "learning_rate": 2.2916666666666666e-06, "loss": 0.6475, "step": 11 }, { "epoch": 0.01536, "grad_norm": 1.725582114057618, "learning_rate": 2.5e-06, "loss": 0.6193, "step": 12 }, { "epoch": 0.01664, "grad_norm": 1.6281351666821622, "learning_rate": 2.7083333333333334e-06, "loss": 0.6427, "step": 13 }, { "epoch": 0.01792, "grad_norm": 0.9742610148915729, "learning_rate": 2.916666666666667e-06, "loss": 0.6003, "step": 14 }, { "epoch": 0.0192, "grad_norm": 0.9408138329286213, "learning_rate": 3.125e-06, "loss": 0.5782, "step": 15 }, { "epoch": 0.02048, "grad_norm": 0.8871297966814529, "learning_rate": 3.3333333333333333e-06, "loss": 0.5642, "step": 16 }, { "epoch": 0.02176, "grad_norm": 0.8677350156338169, "learning_rate": 3.5416666666666673e-06, "loss": 0.5752, "step": 17 }, { "epoch": 0.02304, "grad_norm": 0.8296766137632919, "learning_rate": 3.7500000000000005e-06, "loss": 0.5495, "step": 18 }, { "epoch": 0.02432, "grad_norm": 0.6923202785291074, "learning_rate": 3.958333333333333e-06, "loss": 0.4964, "step": 19 }, { "epoch": 0.0256, "grad_norm": 0.7854379694338149, "learning_rate": 4.166666666666667e-06, "loss": 0.479, "step": 20 }, { "epoch": 0.02688, "grad_norm": 0.7829163717626061, "learning_rate": 4.3750000000000005e-06, "loss": 0.4801, "step": 21 }, { "epoch": 0.02816, "grad_norm": 0.6533431708027024, "learning_rate": 4.583333333333333e-06, "loss": 0.4612, "step": 22 }, { "epoch": 0.02944, "grad_norm": 0.5572023606265587, "learning_rate": 4.791666666666668e-06, "loss": 0.4605, "step": 23 }, { "epoch": 0.03072, "grad_norm": 0.5293319538350173, "learning_rate": 5e-06, "loss": 0.4639, "step": 24 }, { "epoch": 0.032, "grad_norm": 0.41911726053111276, "learning_rate": 4.999978471321311e-06, "loss": 0.4227, "step": 25 }, { "epoch": 0.03328, "grad_norm": 0.7172502383894598, "learning_rate": 4.999913885656027e-06, "loss": 0.3964, "step": 26 }, { "epoch": 0.03456, "grad_norm": 0.5913996068423693, "learning_rate": 4.999806244116505e-06, "loss": 0.4059, "step": 27 }, { "epoch": 0.03584, "grad_norm": 0.48055748126084374, "learning_rate": 4.999655548556651e-06, "loss": 0.4107, "step": 28 }, { "epoch": 0.03712, "grad_norm": 0.41925076244744985, "learning_rate": 4.999461801571884e-06, "loss": 0.3989, "step": 29 }, { "epoch": 0.0384, "grad_norm": 0.3653215916567385, "learning_rate": 4.999225006499096e-06, "loss": 0.3955, "step": 30 }, { "epoch": 0.03968, "grad_norm": 0.32516349299172237, "learning_rate": 4.998945167416598e-06, "loss": 0.3883, "step": 31 }, { "epoch": 0.04096, "grad_norm": 0.27971440200729386, "learning_rate": 4.998622289144039e-06, "loss": 0.3957, "step": 32 }, { "epoch": 0.04224, "grad_norm": 0.2766170998438116, "learning_rate": 4.9982563772423375e-06, "loss": 0.3848, "step": 33 }, { "epoch": 0.04352, "grad_norm": 0.30360945437785425, "learning_rate": 4.99784743801357e-06, "loss": 0.3938, "step": 34 }, { "epoch": 0.0448, "grad_norm": 0.28460957744347015, "learning_rate": 4.997395478500874e-06, "loss": 0.368, "step": 35 }, { "epoch": 0.04608, "grad_norm": 0.2850953684320096, "learning_rate": 4.996900506488323e-06, "loss": 0.3724, "step": 36 }, { "epoch": 0.04736, "grad_norm": 0.25012126257873374, "learning_rate": 4.9963625305007925e-06, "loss": 0.361, "step": 37 }, { "epoch": 0.04864, "grad_norm": 0.24498070894576404, "learning_rate": 4.995781559803811e-06, "loss": 0.3806, "step": 38 }, { "epoch": 0.04992, "grad_norm": 0.22591581090418575, "learning_rate": 4.995157604403404e-06, "loss": 0.363, "step": 39 }, { "epoch": 0.0512, "grad_norm": 0.22056309851677902, "learning_rate": 4.99449067504592e-06, "loss": 0.3587, "step": 40 }, { "epoch": 0.05248, "grad_norm": 0.20829370113607343, "learning_rate": 4.993780783217844e-06, "loss": 0.3727, "step": 41 }, { "epoch": 0.05376, "grad_norm": 0.19999252558449981, "learning_rate": 4.993027941145604e-06, "loss": 0.3506, "step": 42 }, { "epoch": 0.05504, "grad_norm": 0.2086064795316142, "learning_rate": 4.992232161795356e-06, "loss": 0.3485, "step": 43 }, { "epoch": 0.05632, "grad_norm": 0.18277952791784172, "learning_rate": 4.9913934588727615e-06, "loss": 0.3334, "step": 44 }, { "epoch": 0.0576, "grad_norm": 0.18383586373314206, "learning_rate": 4.990511846822754e-06, "loss": 0.3531, "step": 45 }, { "epoch": 0.05888, "grad_norm": 0.18390959318122982, "learning_rate": 4.9895873408292875e-06, "loss": 0.3539, "step": 46 }, { "epoch": 0.06016, "grad_norm": 0.18173318091193896, "learning_rate": 4.988619956815074e-06, "loss": 0.3452, "step": 47 }, { "epoch": 0.06144, "grad_norm": 0.1729819420549717, "learning_rate": 4.987609711441316e-06, "loss": 0.3239, "step": 48 }, { "epoch": 0.06272, "grad_norm": 0.17347984174169176, "learning_rate": 4.98655662210741e-06, "loss": 0.3475, "step": 49 }, { "epoch": 0.064, "grad_norm": 0.1699972869063586, "learning_rate": 4.985460706950655e-06, "loss": 0.3513, "step": 50 }, { "epoch": 0.06528, "grad_norm": 0.1606057290926495, "learning_rate": 4.984321984845934e-06, "loss": 0.3335, "step": 51 }, { "epoch": 0.06656, "grad_norm": 0.16534392303574644, "learning_rate": 4.9831404754053935e-06, "loss": 0.3345, "step": 52 }, { "epoch": 0.06784, "grad_norm": 0.23216304038143887, "learning_rate": 4.981916198978103e-06, "loss": 0.3487, "step": 53 }, { "epoch": 0.06912, "grad_norm": 0.1516460550172589, "learning_rate": 4.980649176649705e-06, "loss": 0.3369, "step": 54 }, { "epoch": 0.0704, "grad_norm": 0.1448627378722896, "learning_rate": 4.979339430242053e-06, "loss": 0.3339, "step": 55 }, { "epoch": 0.07168, "grad_norm": 0.1470353182317087, "learning_rate": 4.9779869823128356e-06, "loss": 0.3356, "step": 56 }, { "epoch": 0.07296, "grad_norm": 0.14370030555945446, "learning_rate": 4.976591856155187e-06, "loss": 0.3427, "step": 57 }, { "epoch": 0.07424, "grad_norm": 0.14698377717239064, "learning_rate": 4.975154075797281e-06, "loss": 0.3364, "step": 58 }, { "epoch": 0.07552, "grad_norm": 0.1452001418009019, "learning_rate": 4.973673666001932e-06, "loss": 0.3287, "step": 59 }, { "epoch": 0.0768, "grad_norm": 0.1533193038026651, "learning_rate": 4.972150652266151e-06, "loss": 0.32, "step": 60 }, { "epoch": 0.07808, "grad_norm": 0.14661431291205376, "learning_rate": 4.970585060820717e-06, "loss": 0.351, "step": 61 }, { "epoch": 0.07936, "grad_norm": 0.16618365600060153, "learning_rate": 4.968976918629722e-06, "loss": 0.3545, "step": 62 }, { "epoch": 0.08064, "grad_norm": 0.1423441821889856, "learning_rate": 4.967326253390107e-06, "loss": 0.3377, "step": 63 }, { "epoch": 0.08192, "grad_norm": 0.14484043253247622, "learning_rate": 4.965633093531186e-06, "loss": 0.3314, "step": 64 }, { "epoch": 0.0832, "grad_norm": 0.1465728819756019, "learning_rate": 4.963897468214154e-06, "loss": 0.3365, "step": 65 }, { "epoch": 0.08448, "grad_norm": 0.14519445175592308, "learning_rate": 4.962119407331587e-06, "loss": 0.3266, "step": 66 }, { "epoch": 0.08576, "grad_norm": 0.21106088971282316, "learning_rate": 4.960298941506927e-06, "loss": 0.3365, "step": 67 }, { "epoch": 0.08704, "grad_norm": 0.14358267564891752, "learning_rate": 4.958436102093951e-06, "loss": 0.3355, "step": 68 }, { "epoch": 0.08832, "grad_norm": 0.1392772159594871, "learning_rate": 4.956530921176238e-06, "loss": 0.326, "step": 69 }, { "epoch": 0.0896, "grad_norm": 0.1403897051822436, "learning_rate": 4.954583431566609e-06, "loss": 0.3113, "step": 70 } ], "logging_steps": 1, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 89651498975232.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }