{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03409187761015938, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003409187761015938, "eval_loss": 11.089664459228516, "eval_runtime": 10.5672, "eval_samples_per_second": 467.578, "eval_steps_per_second": 58.483, "step": 1 }, { "epoch": 0.0010227563283047815, "grad_norm": 0.45604798197746277, "learning_rate": 1.5e-05, "loss": 44.3568, "step": 3 }, { "epoch": 0.002045512656609563, "grad_norm": 0.46130335330963135, "learning_rate": 3e-05, "loss": 44.3545, "step": 6 }, { "epoch": 0.003068268984914344, "grad_norm": 0.44202330708503723, "learning_rate": 4.5e-05, "loss": 44.3516, "step": 9 }, { "epoch": 0.003068268984914344, "eval_loss": 11.088760375976562, "eval_runtime": 10.1289, "eval_samples_per_second": 487.81, "eval_steps_per_second": 61.013, "step": 9 }, { "epoch": 0.004091025313219126, "grad_norm": 0.4335339367389679, "learning_rate": 4.993910125649561e-05, "loss": 44.3549, "step": 12 }, { "epoch": 0.005113781641523907, "grad_norm": 0.4942709803581238, "learning_rate": 4.962019382530521e-05, "loss": 44.3544, "step": 15 }, { "epoch": 0.006136537969828688, "grad_norm": 0.4659290015697479, "learning_rate": 4.9031542398457974e-05, "loss": 44.3528, "step": 18 }, { "epoch": 0.006136537969828688, "eval_loss": 11.08657455444336, "eval_runtime": 10.2663, "eval_samples_per_second": 481.283, "eval_steps_per_second": 60.197, "step": 18 }, { "epoch": 0.007159294298133469, "grad_norm": 0.4347454905509949, "learning_rate": 4.817959636416969e-05, "loss": 44.338, "step": 21 }, { "epoch": 0.008182050626438252, "grad_norm": 0.4440023601055145, "learning_rate": 4.707368982147318e-05, "loss": 44.3387, "step": 24 }, { "epoch": 0.009204806954743032, "grad_norm": 0.4840239882469177, "learning_rate": 4.572593931387604e-05, "loss": 44.341, "step": 27 }, { "epoch": 0.009204806954743032, "eval_loss": 11.084238052368164, "eval_runtime": 10.1848, "eval_samples_per_second": 485.137, "eval_steps_per_second": 60.679, "step": 27 }, { "epoch": 0.010227563283047815, "grad_norm": 0.49456870555877686, "learning_rate": 4.415111107797445e-05, "loss": 44.3389, "step": 30 }, { "epoch": 0.011250319611352595, "grad_norm": 0.4676015377044678, "learning_rate": 4.2366459261474933e-05, "loss": 44.3346, "step": 33 }, { "epoch": 0.012273075939657376, "grad_norm": 0.4602993428707123, "learning_rate": 4.039153688314145e-05, "loss": 44.3378, "step": 36 }, { "epoch": 0.012273075939657376, "eval_loss": 11.082551956176758, "eval_runtime": 10.2119, "eval_samples_per_second": 483.848, "eval_steps_per_second": 60.518, "step": 36 }, { "epoch": 0.013295832267962158, "grad_norm": 0.49384036660194397, "learning_rate": 3.824798160583012e-05, "loss": 44.3236, "step": 39 }, { "epoch": 0.014318588596266939, "grad_norm": 0.47107622027397156, "learning_rate": 3.5959278669726935e-05, "loss": 44.3268, "step": 42 }, { "epoch": 0.015341344924571721, "grad_norm": 0.49026861786842346, "learning_rate": 3.355050358314172e-05, "loss": 44.3271, "step": 45 }, { "epoch": 0.015341344924571721, "eval_loss": 11.080031394958496, "eval_runtime": 10.1875, "eval_samples_per_second": 485.006, "eval_steps_per_second": 60.663, "step": 45 }, { "epoch": 0.016364101252876503, "grad_norm": 0.4897162914276123, "learning_rate": 3.104804738999169e-05, "loss": 44.3183, "step": 48 }, { "epoch": 0.017386857581181282, "grad_norm": 0.45754456520080566, "learning_rate": 2.8479327524001636e-05, "loss": 44.3153, "step": 51 }, { "epoch": 0.018409613909486065, "grad_norm": 0.5274067521095276, "learning_rate": 2.587248741756253e-05, "loss": 44.3143, "step": 54 }, { "epoch": 0.018409613909486065, "eval_loss": 11.078282356262207, "eval_runtime": 10.1669, "eval_samples_per_second": 485.988, "eval_steps_per_second": 60.785, "step": 54 }, { "epoch": 0.019432370237790847, "grad_norm": 0.4819900393486023, "learning_rate": 2.3256088156396868e-05, "loss": 44.3073, "step": 57 }, { "epoch": 0.02045512656609563, "grad_norm": 0.5234776735305786, "learning_rate": 2.0658795558326743e-05, "loss": 44.3053, "step": 60 }, { "epoch": 0.021477882894400408, "grad_norm": 0.5196526646614075, "learning_rate": 1.8109066104575023e-05, "loss": 44.3021, "step": 63 }, { "epoch": 0.021477882894400408, "eval_loss": 11.077031135559082, "eval_runtime": 10.1496, "eval_samples_per_second": 486.817, "eval_steps_per_second": 60.889, "step": 63 }, { "epoch": 0.02250063922270519, "grad_norm": 0.5121042728424072, "learning_rate": 1.56348351646022e-05, "loss": 44.3014, "step": 66 }, { "epoch": 0.023523395551009973, "grad_norm": 0.47857794165611267, "learning_rate": 1.3263210930352737e-05, "loss": 44.3116, "step": 69 }, { "epoch": 0.02454615187931475, "grad_norm": 0.5120978951454163, "learning_rate": 1.1020177413231334e-05, "loss": 44.3001, "step": 72 }, { "epoch": 0.02454615187931475, "eval_loss": 11.07617473602295, "eval_runtime": 10.1688, "eval_samples_per_second": 485.897, "eval_steps_per_second": 60.774, "step": 72 }, { "epoch": 0.025568908207619534, "grad_norm": 0.48292016983032227, "learning_rate": 8.930309757836517e-06, "loss": 44.2999, "step": 75 }, { "epoch": 0.026591664535924316, "grad_norm": 0.46209338307380676, "learning_rate": 7.016504991533726e-06, "loss": 44.301, "step": 78 }, { "epoch": 0.0276144208642291, "grad_norm": 0.4896546006202698, "learning_rate": 5.299731159831953e-06, "loss": 44.3038, "step": 81 }, { "epoch": 0.0276144208642291, "eval_loss": 11.075565338134766, "eval_runtime": 10.139, "eval_samples_per_second": 487.324, "eval_steps_per_second": 60.952, "step": 81 }, { "epoch": 0.028637177192533878, "grad_norm": 0.4694303274154663, "learning_rate": 3.798797596089351e-06, "loss": 44.3048, "step": 84 }, { "epoch": 0.02965993352083866, "grad_norm": 0.48952430486679077, "learning_rate": 2.5301488425208296e-06, "loss": 44.3033, "step": 87 }, { "epoch": 0.030682689849143442, "grad_norm": 0.46993014216423035, "learning_rate": 1.5076844803522922e-06, "loss": 44.3012, "step": 90 }, { "epoch": 0.030682689849143442, "eval_loss": 11.075267791748047, "eval_runtime": 10.2328, "eval_samples_per_second": 482.858, "eval_steps_per_second": 60.394, "step": 90 }, { "epoch": 0.03170544617744822, "grad_norm": 0.49486178159713745, "learning_rate": 7.426068431000882e-07, "loss": 44.3013, "step": 93 }, { "epoch": 0.03272820250575301, "grad_norm": 0.47963476181030273, "learning_rate": 2.4329828146074095e-07, "loss": 44.3049, "step": 96 }, { "epoch": 0.033750958834057786, "grad_norm": 0.4520646333694458, "learning_rate": 1.522932452260595e-08, "loss": 44.3065, "step": 99 }, { "epoch": 0.033750958834057786, "eval_loss": 11.075203895568848, "eval_runtime": 10.2212, "eval_samples_per_second": 483.409, "eval_steps_per_second": 60.463, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 748683264000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }