{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5196850393700787, "eval_steps": 10.0, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06299212598425197, "grad_norm": 2.361685037612915, "learning_rate": 5.000000000000001e-07, "loss": 0.357, "step": 1 }, { "epoch": 0.12598425196850394, "grad_norm": 2.3121159076690674, "learning_rate": 1.0000000000000002e-06, "loss": 0.3512, "step": 2 }, { "epoch": 0.1889763779527559, "grad_norm": 2.452298879623413, "learning_rate": 1.5e-06, "loss": 0.3768, "step": 3 }, { "epoch": 0.25196850393700787, "grad_norm": 2.3367598056793213, "learning_rate": 2.0000000000000003e-06, "loss": 0.361, "step": 4 }, { "epoch": 0.31496062992125984, "grad_norm": 2.326319932937622, "learning_rate": 2.5e-06, "loss": 0.3624, "step": 5 }, { "epoch": 0.3779527559055118, "grad_norm": 2.1740145683288574, "learning_rate": 3e-06, "loss": 0.3504, "step": 6 }, { "epoch": 0.4409448818897638, "grad_norm": 2.296905279159546, "learning_rate": 3.5e-06, "loss": 0.3547, "step": 7 }, { "epoch": 0.5039370078740157, "grad_norm": 2.190321445465088, "learning_rate": 4.000000000000001e-06, "loss": 0.3387, "step": 8 }, { "epoch": 0.5669291338582677, "grad_norm": 2.1358914375305176, "learning_rate": 4.5e-06, "loss": 0.3321, "step": 9 }, { "epoch": 0.6299212598425197, "grad_norm": 53.26506042480469, "learning_rate": 5e-06, "loss": 0.3143, "step": 10 }, { "epoch": 0.6929133858267716, "grad_norm": 44.34261703491211, "learning_rate": 4.99847706754774e-06, "loss": 0.3209, "step": 11 }, { "epoch": 0.7559055118110236, "grad_norm": 39.54725646972656, "learning_rate": 4.993910125649561e-06, "loss": 0.3071, "step": 12 }, { "epoch": 0.8188976377952756, "grad_norm": 9.289456367492676, "learning_rate": 4.986304738420684e-06, "loss": 0.2997, "step": 13 }, { "epoch": 0.8818897637795275, "grad_norm": 3.2083795070648193, "learning_rate": 4.975670171853926e-06, "loss": 0.3367, "step": 14 }, { "epoch": 0.9448818897637795, "grad_norm": 3.201711654663086, "learning_rate": 4.962019382530521e-06, "loss": 0.3471, "step": 15 }, { "epoch": 1.0078740157480315, "grad_norm": 2.9849319458007812, "learning_rate": 4.9453690018345144e-06, "loss": 0.3337, "step": 16 }, { "epoch": 1.0708661417322836, "grad_norm": 2.7130520343780518, "learning_rate": 4.925739315689991e-06, "loss": 0.3253, "step": 17 }, { "epoch": 1.1338582677165354, "grad_norm": 2.2203686237335205, "learning_rate": 4.903154239845798e-06, "loss": 0.3085, "step": 18 }, { "epoch": 1.1968503937007875, "grad_norm": 1.8168455362319946, "learning_rate": 4.8776412907378845e-06, "loss": 0.2976, "step": 19 }, { "epoch": 1.2598425196850394, "grad_norm": 1.3243907690048218, "learning_rate": 4.849231551964771e-06, "loss": 0.2939, "step": 20 }, { "epoch": 1.3228346456692912, "grad_norm": 0.9819900989532471, "learning_rate": 4.817959636416969e-06, "loss": 0.2918, "step": 21 }, { "epoch": 1.3858267716535433, "grad_norm": 0.7295715808868408, "learning_rate": 4.783863644106502e-06, "loss": 0.28, "step": 22 }, { "epoch": 1.4488188976377954, "grad_norm": 0.6359255909919739, "learning_rate": 4.746985115747918e-06, "loss": 0.2858, "step": 23 }, { "epoch": 1.5118110236220472, "grad_norm": 0.7184382677078247, "learning_rate": 4.707368982147318e-06, "loss": 0.2666, "step": 24 }, { "epoch": 1.574803149606299, "grad_norm": 0.743729293346405, "learning_rate": 4.665063509461098e-06, "loss": 0.2723, "step": 25 }, { "epoch": 1.6377952755905512, "grad_norm": 0.7132835984230042, "learning_rate": 4.620120240391065e-06, "loss": 0.274, "step": 26 }, { "epoch": 1.7007874015748032, "grad_norm": 0.6352850198745728, "learning_rate": 4.572593931387604e-06, "loss": 0.2717, "step": 27 }, { "epoch": 1.763779527559055, "grad_norm": 0.5060824155807495, "learning_rate": 4.522542485937369e-06, "loss": 0.2568, "step": 28 }, { "epoch": 1.826771653543307, "grad_norm": 0.44254282116889954, "learning_rate": 4.470026884016805e-06, "loss": 0.2691, "step": 29 }, { "epoch": 1.889763779527559, "grad_norm": 0.49223262071609497, "learning_rate": 4.415111107797445e-06, "loss": 0.2636, "step": 30 }, { "epoch": 1.952755905511811, "grad_norm": 0.35744452476501465, "learning_rate": 4.357862063693486e-06, "loss": 0.2584, "step": 31 }, { "epoch": 2.015748031496063, "grad_norm": 0.44493842124938965, "learning_rate": 4.2983495008466285e-06, "loss": 0.2663, "step": 32 }, { "epoch": 2.078740157480315, "grad_norm": 0.5151119232177734, "learning_rate": 4.236645926147493e-06, "loss": 0.259, "step": 33 }, { "epoch": 2.141732283464567, "grad_norm": 0.5001265406608582, "learning_rate": 4.172826515897146e-06, "loss": 0.2575, "step": 34 }, { "epoch": 2.204724409448819, "grad_norm": 0.4636068344116211, "learning_rate": 4.106969024216348e-06, "loss": 0.2472, "step": 35 }, { "epoch": 2.267716535433071, "grad_norm": 0.375308632850647, "learning_rate": 4.039153688314146e-06, "loss": 0.2495, "step": 36 }, { "epoch": 2.3307086614173227, "grad_norm": 0.31411102414131165, "learning_rate": 3.969463130731183e-06, "loss": 0.2505, "step": 37 }, { "epoch": 2.393700787401575, "grad_norm": 0.24844396114349365, "learning_rate": 3.897982258676867e-06, "loss": 0.2438, "step": 38 }, { "epoch": 2.456692913385827, "grad_norm": 0.239442840218544, "learning_rate": 3.824798160583012e-06, "loss": 0.2392, "step": 39 }, { "epoch": 2.5196850393700787, "grad_norm": 0.1867786943912506, "learning_rate": 3.7500000000000005e-06, "loss": 0.2363, "step": 40 } ], "logging_steps": 1.0, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7790197780656947e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }