{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.024752475247524754, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004950495049504951, "grad_norm": 0.5114933252334595, "learning_rate": 5e-06, "loss": 2.5697, "step": 1 }, { "epoch": 0.0004950495049504951, "eval_loss": 2.622131824493408, "eval_runtime": 15.1063, "eval_samples_per_second": 56.334, "eval_steps_per_second": 28.2, "step": 1 }, { "epoch": 0.0009900990099009901, "grad_norm": 0.5429823398590088, "learning_rate": 1e-05, "loss": 2.6019, "step": 2 }, { "epoch": 0.0014851485148514852, "grad_norm": 1.1041313409805298, "learning_rate": 1.5e-05, "loss": 2.5231, "step": 3 }, { "epoch": 0.0019801980198019802, "grad_norm": 0.9193277359008789, "learning_rate": 2e-05, "loss": 2.6823, "step": 4 }, { "epoch": 0.0024752475247524753, "grad_norm": 0.7951717972755432, "learning_rate": 2.5e-05, "loss": 2.3677, "step": 5 }, { "epoch": 0.0029702970297029703, "grad_norm": 0.9375802874565125, "learning_rate": 3e-05, "loss": 2.3939, "step": 6 }, { "epoch": 0.0034653465346534654, "grad_norm": 0.6394500136375427, "learning_rate": 3.5e-05, "loss": 2.8128, "step": 7 }, { "epoch": 0.0039603960396039604, "grad_norm": 0.6920015215873718, "learning_rate": 4e-05, "loss": 2.5449, "step": 8 }, { "epoch": 0.004455445544554455, "grad_norm": 0.808408260345459, "learning_rate": 4.5e-05, "loss": 2.5139, "step": 9 }, { "epoch": 0.0049504950495049506, "grad_norm": 0.5451938509941101, "learning_rate": 5e-05, "loss": 2.6134, "step": 10 }, { "epoch": 0.005445544554455445, "grad_norm": 0.9277445673942566, "learning_rate": 4.99229333433282e-05, "loss": 2.5469, "step": 11 }, { "epoch": 0.005940594059405941, "grad_norm": 0.9077724814414978, "learning_rate": 4.9692208514878444e-05, "loss": 2.7174, "step": 12 }, { "epoch": 0.006435643564356435, "grad_norm": 0.7381893396377563, "learning_rate": 4.9309248009941914e-05, "loss": 2.759, "step": 13 }, { "epoch": 0.006435643564356435, "eval_loss": 2.598520278930664, "eval_runtime": 13.6057, "eval_samples_per_second": 62.548, "eval_steps_per_second": 31.311, "step": 13 }, { "epoch": 0.006930693069306931, "grad_norm": 0.7455027103424072, "learning_rate": 4.877641290737884e-05, "loss": 2.5129, "step": 14 }, { "epoch": 0.007425742574257425, "grad_norm": 0.7870685458183289, "learning_rate": 4.8096988312782174e-05, "loss": 2.716, "step": 15 }, { "epoch": 0.007920792079207921, "grad_norm": 0.7373952865600586, "learning_rate": 4.72751631047092e-05, "loss": 2.6652, "step": 16 }, { "epoch": 0.008415841584158416, "grad_norm": 0.709264874458313, "learning_rate": 4.6316004108852305e-05, "loss": 2.6148, "step": 17 }, { "epoch": 0.00891089108910891, "grad_norm": 0.8849121928215027, "learning_rate": 4.522542485937369e-05, "loss": 2.5974, "step": 18 }, { "epoch": 0.009405940594059406, "grad_norm": 0.6484812498092651, "learning_rate": 4.401014914000078e-05, "loss": 2.6253, "step": 19 }, { "epoch": 0.009900990099009901, "grad_norm": 0.7150304913520813, "learning_rate": 4.267766952966369e-05, "loss": 2.5675, "step": 20 }, { "epoch": 0.010396039603960397, "grad_norm": 0.848437488079071, "learning_rate": 4.123620120825459e-05, "loss": 2.5794, "step": 21 }, { "epoch": 0.01089108910891089, "grad_norm": 1.2628787755966187, "learning_rate": 3.969463130731183e-05, "loss": 2.7266, "step": 22 }, { "epoch": 0.011386138613861386, "grad_norm": 0.8389996290206909, "learning_rate": 3.8062464117898724e-05, "loss": 2.4219, "step": 23 }, { "epoch": 0.011881188118811881, "grad_norm": 0.8797109127044678, "learning_rate": 3.634976249348867e-05, "loss": 2.5167, "step": 24 }, { "epoch": 0.012376237623762377, "grad_norm": 0.7067325115203857, "learning_rate": 3.456708580912725e-05, "loss": 2.266, "step": 25 }, { "epoch": 0.01287128712871287, "grad_norm": 1.0020214319229126, "learning_rate": 3.272542485937369e-05, "loss": 2.5468, "step": 26 }, { "epoch": 0.01287128712871287, "eval_loss": 2.526824951171875, "eval_runtime": 13.5807, "eval_samples_per_second": 62.662, "eval_steps_per_second": 31.368, "step": 26 }, { "epoch": 0.013366336633663366, "grad_norm": 0.6852343678474426, "learning_rate": 3.083613409639764e-05, "loss": 2.6977, "step": 27 }, { "epoch": 0.013861386138613862, "grad_norm": 0.6660365462303162, "learning_rate": 2.8910861626005776e-05, "loss": 2.3572, "step": 28 }, { "epoch": 0.014356435643564357, "grad_norm": 0.6907557249069214, "learning_rate": 2.6961477393196126e-05, "loss": 2.4936, "step": 29 }, { "epoch": 0.01485148514851485, "grad_norm": 0.7001723051071167, "learning_rate": 2.5e-05, "loss": 2.5243, "step": 30 }, { "epoch": 0.015346534653465346, "grad_norm": 0.6474969387054443, "learning_rate": 2.303852260680388e-05, "loss": 2.566, "step": 31 }, { "epoch": 0.015841584158415842, "grad_norm": 0.7469086647033691, "learning_rate": 2.1089138373994223e-05, "loss": 2.4028, "step": 32 }, { "epoch": 0.016336633663366337, "grad_norm": 1.4682289361953735, "learning_rate": 1.9163865903602374e-05, "loss": 3.0202, "step": 33 }, { "epoch": 0.016831683168316833, "grad_norm": 0.647513747215271, "learning_rate": 1.7274575140626318e-05, "loss": 2.6409, "step": 34 }, { "epoch": 0.017326732673267328, "grad_norm": 0.5837624073028564, "learning_rate": 1.5432914190872757e-05, "loss": 2.3664, "step": 35 }, { "epoch": 0.01782178217821782, "grad_norm": 0.5973520874977112, "learning_rate": 1.3650237506511331e-05, "loss": 2.2474, "step": 36 }, { "epoch": 0.018316831683168316, "grad_norm": 0.6770974397659302, "learning_rate": 1.1937535882101281e-05, "loss": 2.4933, "step": 37 }, { "epoch": 0.01881188118811881, "grad_norm": 0.7002883553504944, "learning_rate": 1.0305368692688174e-05, "loss": 2.4098, "step": 38 }, { "epoch": 0.019306930693069307, "grad_norm": 0.5202279090881348, "learning_rate": 8.763798791745411e-06, "loss": 2.3545, "step": 39 }, { "epoch": 0.019306930693069307, "eval_loss": 2.5015273094177246, "eval_runtime": 13.8909, "eval_samples_per_second": 61.263, "eval_steps_per_second": 30.667, "step": 39 }, { "epoch": 0.019801980198019802, "grad_norm": 0.6241593956947327, "learning_rate": 7.3223304703363135e-06, "loss": 2.3477, "step": 40 }, { "epoch": 0.020297029702970298, "grad_norm": 0.7181453108787537, "learning_rate": 5.989850859999227e-06, "loss": 2.5936, "step": 41 }, { "epoch": 0.020792079207920793, "grad_norm": 0.590019941329956, "learning_rate": 4.7745751406263165e-06, "loss": 2.5446, "step": 42 }, { "epoch": 0.02128712871287129, "grad_norm": 0.6002688407897949, "learning_rate": 3.6839958911476957e-06, "loss": 2.5584, "step": 43 }, { "epoch": 0.02178217821782178, "grad_norm": 0.6000116467475891, "learning_rate": 2.7248368952908053e-06, "loss": 2.3596, "step": 44 }, { "epoch": 0.022277227722772276, "grad_norm": 0.48489993810653687, "learning_rate": 1.9030116872178316e-06, "loss": 2.4035, "step": 45 }, { "epoch": 0.02277227722772277, "grad_norm": 0.590663492679596, "learning_rate": 1.2235870926211619e-06, "loss": 2.5489, "step": 46 }, { "epoch": 0.023267326732673267, "grad_norm": 0.7829977869987488, "learning_rate": 6.907519900580861e-07, "loss": 2.6551, "step": 47 }, { "epoch": 0.023762376237623763, "grad_norm": 0.6934583783149719, "learning_rate": 3.077914851215585e-07, "loss": 2.5088, "step": 48 }, { "epoch": 0.024257425742574258, "grad_norm": 0.5398517847061157, "learning_rate": 7.706665667180091e-08, "loss": 2.4165, "step": 49 }, { "epoch": 0.024752475247524754, "grad_norm": 0.7464584112167358, "learning_rate": 0.0, "loss": 2.5148, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1624279061889024.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }