{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.037993920972644375, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007598784194528875, "grad_norm": 0.7123318910598755, "learning_rate": 1e-05, "loss": 22.2421, "step": 1 }, { "epoch": 0.0007598784194528875, "eval_loss": 11.113731384277344, "eval_runtime": 9.5997, "eval_samples_per_second": 115.42, "eval_steps_per_second": 14.48, "step": 1 }, { "epoch": 0.001519756838905775, "grad_norm": 0.7559512257575989, "learning_rate": 2e-05, "loss": 22.2073, "step": 2 }, { "epoch": 0.0022796352583586625, "grad_norm": 0.6938263177871704, "learning_rate": 3e-05, "loss": 22.2088, "step": 3 }, { "epoch": 0.00303951367781155, "grad_norm": 0.7358536720275879, "learning_rate": 4e-05, "loss": 22.2193, "step": 4 }, { "epoch": 0.003799392097264438, "grad_norm": 0.7246705293655396, "learning_rate": 5e-05, "loss": 22.2103, "step": 5 }, { "epoch": 0.003799392097264438, "eval_loss": 11.111109733581543, "eval_runtime": 9.1747, "eval_samples_per_second": 120.767, "eval_steps_per_second": 15.15, "step": 5 }, { "epoch": 0.004559270516717325, "grad_norm": 0.7420904636383057, "learning_rate": 6e-05, "loss": 22.2077, "step": 6 }, { "epoch": 0.005319148936170213, "grad_norm": 0.7143452763557434, "learning_rate": 7e-05, "loss": 22.2248, "step": 7 }, { "epoch": 0.0060790273556231, "grad_norm": 0.7711450457572937, "learning_rate": 8e-05, "loss": 22.2106, "step": 8 }, { "epoch": 0.006838905775075988, "grad_norm": 0.7395123243331909, "learning_rate": 9e-05, "loss": 22.2055, "step": 9 }, { "epoch": 0.007598784194528876, "grad_norm": 0.724628746509552, "learning_rate": 0.0001, "loss": 22.2055, "step": 10 }, { "epoch": 0.007598784194528876, "eval_loss": 11.098109245300293, "eval_runtime": 9.6114, "eval_samples_per_second": 115.28, "eval_steps_per_second": 14.462, "step": 10 }, { "epoch": 0.008358662613981762, "grad_norm": 0.7743872404098511, "learning_rate": 9.98458666866564e-05, "loss": 22.1964, "step": 11 }, { "epoch": 0.00911854103343465, "grad_norm": 0.8174950480461121, "learning_rate": 9.938441702975689e-05, "loss": 22.1942, "step": 12 }, { "epoch": 0.009878419452887538, "grad_norm": 0.8450507521629333, "learning_rate": 9.861849601988383e-05, "loss": 22.1764, "step": 13 }, { "epoch": 0.010638297872340425, "grad_norm": 0.7814104557037354, "learning_rate": 9.755282581475769e-05, "loss": 22.1768, "step": 14 }, { "epoch": 0.011398176291793313, "grad_norm": 0.8507615923881531, "learning_rate": 9.619397662556435e-05, "loss": 22.1678, "step": 15 }, { "epoch": 0.011398176291793313, "eval_loss": 11.078285217285156, "eval_runtime": 9.4456, "eval_samples_per_second": 117.304, "eval_steps_per_second": 14.716, "step": 15 }, { "epoch": 0.0121580547112462, "grad_norm": 1.170660138130188, "learning_rate": 9.45503262094184e-05, "loss": 22.2144, "step": 16 }, { "epoch": 0.012917933130699088, "grad_norm": 1.2283384799957275, "learning_rate": 9.263200821770461e-05, "loss": 22.2188, "step": 17 }, { "epoch": 0.013677811550151976, "grad_norm": 0.8715210556983948, "learning_rate": 9.045084971874738e-05, "loss": 22.1407, "step": 18 }, { "epoch": 0.014437689969604863, "grad_norm": 0.8453183174133301, "learning_rate": 8.802029828000156e-05, "loss": 22.1368, "step": 19 }, { "epoch": 0.015197568389057751, "grad_norm": 0.8434128761291504, "learning_rate": 8.535533905932738e-05, "loss": 22.1207, "step": 20 }, { "epoch": 0.015197568389057751, "eval_loss": 11.058117866516113, "eval_runtime": 8.7467, "eval_samples_per_second": 126.676, "eval_steps_per_second": 15.892, "step": 20 }, { "epoch": 0.015957446808510637, "grad_norm": 0.9485874176025391, "learning_rate": 8.247240241650918e-05, "loss": 22.0967, "step": 21 }, { "epoch": 0.016717325227963525, "grad_norm": 0.9110122323036194, "learning_rate": 7.938926261462366e-05, "loss": 22.0956, "step": 22 }, { "epoch": 0.017477203647416412, "grad_norm": 0.8467977046966553, "learning_rate": 7.612492823579745e-05, "loss": 22.1266, "step": 23 }, { "epoch": 0.0182370820668693, "grad_norm": 0.8551048636436462, "learning_rate": 7.269952498697734e-05, "loss": 22.1062, "step": 24 }, { "epoch": 0.018996960486322188, "grad_norm": 0.8727412819862366, "learning_rate": 6.91341716182545e-05, "loss": 22.0855, "step": 25 }, { "epoch": 0.018996960486322188, "eval_loss": 11.037599563598633, "eval_runtime": 9.5705, "eval_samples_per_second": 115.772, "eval_steps_per_second": 14.524, "step": 25 }, { "epoch": 0.019756838905775075, "grad_norm": 0.966887891292572, "learning_rate": 6.545084971874738e-05, "loss": 22.063, "step": 26 }, { "epoch": 0.020516717325227963, "grad_norm": 1.0077238082885742, "learning_rate": 6.167226819279528e-05, "loss": 22.0584, "step": 27 }, { "epoch": 0.02127659574468085, "grad_norm": 0.9766594171524048, "learning_rate": 5.782172325201155e-05, "loss": 22.0625, "step": 28 }, { "epoch": 0.022036474164133738, "grad_norm": 0.9364005327224731, "learning_rate": 5.392295478639225e-05, "loss": 22.0474, "step": 29 }, { "epoch": 0.022796352583586626, "grad_norm": 1.0432188510894775, "learning_rate": 5e-05, "loss": 22.0193, "step": 30 }, { "epoch": 0.022796352583586626, "eval_loss": 11.019878387451172, "eval_runtime": 9.1273, "eval_samples_per_second": 121.394, "eval_steps_per_second": 15.229, "step": 30 }, { "epoch": 0.023556231003039513, "grad_norm": 0.990106463432312, "learning_rate": 4.607704521360776e-05, "loss": 22.0551, "step": 31 }, { "epoch": 0.0243161094224924, "grad_norm": 0.9892701506614685, "learning_rate": 4.2178276747988446e-05, "loss": 22.0255, "step": 32 }, { "epoch": 0.02507598784194529, "grad_norm": 1.0256024599075317, "learning_rate": 3.832773180720475e-05, "loss": 22.0296, "step": 33 }, { "epoch": 0.025835866261398176, "grad_norm": 0.9178345203399658, "learning_rate": 3.4549150281252636e-05, "loss": 22.0515, "step": 34 }, { "epoch": 0.026595744680851064, "grad_norm": 1.0213465690612793, "learning_rate": 3.086582838174551e-05, "loss": 22.046, "step": 35 }, { "epoch": 0.026595744680851064, "eval_loss": 11.006331443786621, "eval_runtime": 9.6408, "eval_samples_per_second": 114.929, "eval_steps_per_second": 14.418, "step": 35 }, { "epoch": 0.02735562310030395, "grad_norm": 1.0454298257827759, "learning_rate": 2.7300475013022663e-05, "loss": 22.0217, "step": 36 }, { "epoch": 0.02811550151975684, "grad_norm": 1.0446940660476685, "learning_rate": 2.3875071764202563e-05, "loss": 21.9847, "step": 37 }, { "epoch": 0.028875379939209727, "grad_norm": 1.061407446861267, "learning_rate": 2.061073738537635e-05, "loss": 22.0118, "step": 38 }, { "epoch": 0.029635258358662615, "grad_norm": 1.0344719886779785, "learning_rate": 1.7527597583490822e-05, "loss": 21.9994, "step": 39 }, { "epoch": 0.030395136778115502, "grad_norm": 1.0409643650054932, "learning_rate": 1.4644660940672627e-05, "loss": 22.0019, "step": 40 }, { "epoch": 0.030395136778115502, "eval_loss": 10.998189926147461, "eval_runtime": 9.3925, "eval_samples_per_second": 117.967, "eval_steps_per_second": 14.799, "step": 40 }, { "epoch": 0.03115501519756839, "grad_norm": 1.0946383476257324, "learning_rate": 1.1979701719998453e-05, "loss": 21.9642, "step": 41 }, { "epoch": 0.031914893617021274, "grad_norm": 1.0578166246414185, "learning_rate": 9.549150281252633e-06, "loss": 21.9906, "step": 42 }, { "epoch": 0.03267477203647416, "grad_norm": 1.0239158868789673, "learning_rate": 7.367991782295391e-06, "loss": 21.9779, "step": 43 }, { "epoch": 0.03343465045592705, "grad_norm": 1.0567377805709839, "learning_rate": 5.449673790581611e-06, "loss": 21.9981, "step": 44 }, { "epoch": 0.03419452887537994, "grad_norm": 1.0055408477783203, "learning_rate": 3.8060233744356633e-06, "loss": 22.0115, "step": 45 }, { "epoch": 0.03419452887537994, "eval_loss": 10.994943618774414, "eval_runtime": 9.5858, "eval_samples_per_second": 115.587, "eval_steps_per_second": 14.501, "step": 45 }, { "epoch": 0.034954407294832825, "grad_norm": 1.1111423969268799, "learning_rate": 2.4471741852423237e-06, "loss": 21.9687, "step": 46 }, { "epoch": 0.03571428571428571, "grad_norm": 1.0502803325653076, "learning_rate": 1.3815039801161721e-06, "loss": 21.985, "step": 47 }, { "epoch": 0.0364741641337386, "grad_norm": 0.9838832020759583, "learning_rate": 6.15582970243117e-07, "loss": 21.9997, "step": 48 }, { "epoch": 0.03723404255319149, "grad_norm": 1.068258285522461, "learning_rate": 1.5413331334360182e-07, "loss": 21.9884, "step": 49 }, { "epoch": 0.037993920972644375, "grad_norm": 1.0668437480926514, "learning_rate": 0.0, "loss": 21.9595, "step": 50 }, { "epoch": 0.037993920972644375, "eval_loss": 10.994412422180176, "eval_runtime": 9.4009, "eval_samples_per_second": 117.861, "eval_steps_per_second": 14.786, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2121554460672.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }