{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.017106079072850514, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.553039536425257e-05, "eval_loss": 2.381143808364868, "eval_runtime": 144.1703, "eval_samples_per_second": 34.147, "eval_steps_per_second": 17.077, "step": 1 }, { "epoch": 0.0008553039536425257, "grad_norm": 5.450172424316406, "learning_rate": 0.00019967573081342103, "loss": 9.0162, "step": 10 }, { "epoch": 0.0017106079072850514, "grad_norm": 4.5537848472595215, "learning_rate": 0.0001970941817426052, "loss": 8.8664, "step": 20 }, { "epoch": 0.002565911860927577, "grad_norm": 7.618182182312012, "learning_rate": 0.00019199794436588243, "loss": 9.0587, "step": 30 }, { "epoch": 0.003421215814570103, "grad_norm": 5.113912582397461, "learning_rate": 0.0001845190085543795, "loss": 8.1141, "step": 40 }, { "epoch": 0.004276519768212628, "grad_norm": 7.7498955726623535, "learning_rate": 0.00017485107481711012, "loss": 8.7842, "step": 50 }, { "epoch": 0.004276519768212628, "eval_loss": 2.1146717071533203, "eval_runtime": 142.2534, "eval_samples_per_second": 34.607, "eval_steps_per_second": 17.307, "step": 50 }, { "epoch": 0.005131823721855154, "grad_norm": 4.736423969268799, "learning_rate": 0.00016324453755953773, "loss": 8.2631, "step": 60 }, { "epoch": 0.00598712767549768, "grad_norm": 3.9354634284973145, "learning_rate": 0.00015000000000000001, "loss": 8.4239, "step": 70 }, { "epoch": 0.006842431629140206, "grad_norm": 5.779056072235107, "learning_rate": 0.00013546048870425356, "loss": 8.0989, "step": 80 }, { "epoch": 0.007697735582782732, "grad_norm": 5.675789833068848, "learning_rate": 0.00012000256937760445, "loss": 8.2106, "step": 90 }, { "epoch": 0.008553039536425257, "grad_norm": 5.744926452636719, "learning_rate": 0.00010402659401094152, "loss": 8.2246, "step": 100 }, { "epoch": 0.008553039536425257, "eval_loss": 2.0408310890197754, "eval_runtime": 142.0789, "eval_samples_per_second": 34.65, "eval_steps_per_second": 17.328, "step": 100 }, { "epoch": 0.009408343490067783, "grad_norm": 5.404253005981445, "learning_rate": 8.79463319744677e-05, "loss": 7.483, "step": 110 }, { "epoch": 0.010263647443710309, "grad_norm": 7.058231830596924, "learning_rate": 7.217825360835473e-05, "loss": 8.0571, "step": 120 }, { "epoch": 0.011118951397352835, "grad_norm": 5.664774417877197, "learning_rate": 5.713074385969457e-05, "loss": 8.5309, "step": 130 }, { "epoch": 0.01197425535099536, "grad_norm": 4.67970085144043, "learning_rate": 4.3193525326884435e-05, "loss": 7.8239, "step": 140 }, { "epoch": 0.012829559304637885, "grad_norm": 6.211084365844727, "learning_rate": 3.072756464904006e-05, "loss": 7.6217, "step": 150 }, { "epoch": 0.012829559304637885, "eval_loss": 2.0031981468200684, "eval_runtime": 141.879, "eval_samples_per_second": 34.699, "eval_steps_per_second": 17.353, "step": 150 }, { "epoch": 0.013684863258280411, "grad_norm": 5.578930377960205, "learning_rate": 2.0055723659649904e-05, "loss": 7.9683, "step": 160 }, { "epoch": 0.014540167211922937, "grad_norm": 5.688002109527588, "learning_rate": 1.1454397434679021e-05, "loss": 8.0386, "step": 170 }, { "epoch": 0.015395471165565463, "grad_norm": 4.525981903076172, "learning_rate": 5.146355805285452e-06, "loss": 7.3559, "step": 180 }, { "epoch": 0.01625077511920799, "grad_norm": 5.752941608428955, "learning_rate": 1.2949737362087156e-06, "loss": 7.7123, "step": 190 }, { "epoch": 0.017106079072850514, "grad_norm": 15.219685554504395, "learning_rate": 0.0, "loss": 8.2222, "step": 200 }, { "epoch": 0.017106079072850514, "eval_loss": 1.9949531555175781, "eval_runtime": 142.5551, "eval_samples_per_second": 34.534, "eval_steps_per_second": 17.271, "step": 200 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.534314936519885e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }