{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 6250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 24.225746154785156, "learning_rate": 1e-05, "loss": 10.557, "step": 100 }, { "epoch": 0.16, "grad_norm": 24.75214385986328, "learning_rate": 2e-05, "loss": 9.3749, "step": 200 }, { "epoch": 0.24, "grad_norm": 21.598390579223633, "learning_rate": 3e-05, "loss": 8.3939, "step": 300 }, { "epoch": 0.32, "grad_norm": 12.049657821655273, "learning_rate": 4e-05, "loss": 4.1354, "step": 400 }, { "epoch": 0.4, "grad_norm": 5.240299224853516, "learning_rate": 5e-05, "loss": 0.9051, "step": 500 }, { "epoch": 0.48, "grad_norm": 6.658221244812012, "learning_rate": 4.91304347826087e-05, "loss": 0.737, "step": 600 }, { "epoch": 0.56, "grad_norm": 2.5896286964416504, "learning_rate": 4.8260869565217394e-05, "loss": 0.6799, "step": 700 }, { "epoch": 0.64, "grad_norm": 6.020174026489258, "learning_rate": 4.739130434782609e-05, "loss": 0.6532, "step": 800 }, { "epoch": 0.72, "grad_norm": 3.4233405590057373, "learning_rate": 4.6521739130434785e-05, "loss": 0.6223, "step": 900 }, { "epoch": 0.8, "grad_norm": 3.2131102085113525, "learning_rate": 4.565217391304348e-05, "loss": 0.5963, "step": 1000 }, { "epoch": 0.88, "grad_norm": 2.3392815589904785, "learning_rate": 4.478260869565218e-05, "loss": 0.6014, "step": 1100 }, { "epoch": 0.96, "grad_norm": 1.9733258485794067, "learning_rate": 4.391304347826087e-05, "loss": 0.5765, "step": 1200 }, { "epoch": 1.04, "grad_norm": 2.700756788253784, "learning_rate": 4.304347826086957e-05, "loss": 0.554, "step": 1300 }, { "epoch": 1.12, "grad_norm": 5.853904724121094, "learning_rate": 4.2173913043478264e-05, "loss": 0.5718, "step": 1400 }, { "epoch": 1.2, "grad_norm": 4.578104496002197, "learning_rate": 4.130434782608696e-05, "loss": 0.5471, "step": 1500 }, { "epoch": 1.28, "grad_norm": 1.733497977256775, "learning_rate": 4.0434782608695655e-05, "loss": 0.5577, "step": 1600 }, { "epoch": 1.3599999999999999, "grad_norm": 2.4556689262390137, "learning_rate": 3.956521739130435e-05, "loss": 0.5367, "step": 1700 }, { "epoch": 1.44, "grad_norm": 1.4540470838546753, "learning_rate": 3.869565217391305e-05, "loss": 0.5277, "step": 1800 }, { "epoch": 1.52, "grad_norm": 2.833214521408081, "learning_rate": 3.7826086956521736e-05, "loss": 0.5237, "step": 1900 }, { "epoch": 1.6, "grad_norm": 1.5153348445892334, "learning_rate": 3.695652173913043e-05, "loss": 0.5183, "step": 2000 }, { "epoch": 1.6800000000000002, "grad_norm": 1.163548231124878, "learning_rate": 3.6086956521739134e-05, "loss": 0.4998, "step": 2100 }, { "epoch": 1.76, "grad_norm": 0.9056810736656189, "learning_rate": 3.521739130434783e-05, "loss": 0.5205, "step": 2200 }, { "epoch": 1.8399999999999999, "grad_norm": 2.93011736869812, "learning_rate": 3.4347826086956526e-05, "loss": 0.5067, "step": 2300 }, { "epoch": 1.92, "grad_norm": 1.668942928314209, "learning_rate": 3.347826086956522e-05, "loss": 0.5001, "step": 2400 }, { "epoch": 2.0, "grad_norm": 8.79544448852539, "learning_rate": 3.260869565217392e-05, "loss": 0.4944, "step": 2500 }, { "epoch": 2.08, "grad_norm": 2.793776035308838, "learning_rate": 3.173913043478261e-05, "loss": 0.462, "step": 2600 }, { "epoch": 2.16, "grad_norm": 1.993105173110962, "learning_rate": 3.086956521739131e-05, "loss": 0.4893, "step": 2700 }, { "epoch": 2.24, "grad_norm": 1.373683214187622, "learning_rate": 3e-05, "loss": 0.4884, "step": 2800 }, { 
"epoch": 2.32, "grad_norm": 2.5384674072265625, "learning_rate": 2.9130434782608696e-05, "loss": 0.4839, "step": 2900 }, { "epoch": 2.4, "grad_norm": 1.294622778892517, "learning_rate": 2.826086956521739e-05, "loss": 0.4824, "step": 3000 }, { "epoch": 2.48, "grad_norm": 1.2775664329528809, "learning_rate": 2.7391304347826085e-05, "loss": 0.4675, "step": 3100 }, { "epoch": 2.56, "grad_norm": 1.2705273628234863, "learning_rate": 2.6521739130434787e-05, "loss": 0.4785, "step": 3200 }, { "epoch": 2.64, "grad_norm": 3.2393271923065186, "learning_rate": 2.5652173913043483e-05, "loss": 0.4728, "step": 3300 }, { "epoch": 2.7199999999999998, "grad_norm": 1.82649827003479, "learning_rate": 2.4782608695652175e-05, "loss": 0.474, "step": 3400 }, { "epoch": 2.8, "grad_norm": 1.2423534393310547, "learning_rate": 2.391304347826087e-05, "loss": 0.4651, "step": 3500 }, { "epoch": 2.88, "grad_norm": 1.142115831375122, "learning_rate": 2.3043478260869567e-05, "loss": 0.4687, "step": 3600 }, { "epoch": 2.96, "grad_norm": 2.1958296298980713, "learning_rate": 2.2173913043478262e-05, "loss": 0.456, "step": 3700 }, { "epoch": 3.04, "grad_norm": 4.805281162261963, "learning_rate": 2.1304347826086958e-05, "loss": 0.4439, "step": 3800 }, { "epoch": 3.12, "grad_norm": 23.097047805786133, "learning_rate": 2.0434782608695654e-05, "loss": 0.4393, "step": 3900 }, { "epoch": 3.2, "grad_norm": 3.215237617492676, "learning_rate": 1.956521739130435e-05, "loss": 0.4443, "step": 4000 }, { "epoch": 3.2800000000000002, "grad_norm": 1.1977019309997559, "learning_rate": 1.8695652173913045e-05, "loss": 0.4333, "step": 4100 }, { "epoch": 3.36, "grad_norm": 1.6901699304580688, "learning_rate": 1.782608695652174e-05, "loss": 0.4678, "step": 4200 }, { "epoch": 3.44, "grad_norm": 2.7421112060546875, "learning_rate": 1.6956521739130433e-05, "loss": 0.4529, "step": 4300 }, { "epoch": 3.52, "grad_norm": 1.5662778615951538, "learning_rate": 1.608695652173913e-05, "loss": 0.4399, "step": 4400 }, { "epoch": 3.6, "grad_norm": 2.1081831455230713, "learning_rate": 1.5217391304347828e-05, "loss": 0.4531, "step": 4500 }, { "epoch": 3.68, "grad_norm": 1.103431224822998, "learning_rate": 1.4347826086956522e-05, "loss": 0.4401, "step": 4600 }, { "epoch": 3.76, "grad_norm": 1.3851810693740845, "learning_rate": 1.3478260869565218e-05, "loss": 0.4401, "step": 4700 }, { "epoch": 3.84, "grad_norm": 1.0307343006134033, "learning_rate": 1.2608695652173914e-05, "loss": 0.4424, "step": 4800 }, { "epoch": 3.92, "grad_norm": 1.1726175546646118, "learning_rate": 1.173913043478261e-05, "loss": 0.4459, "step": 4900 }, { "epoch": 4.0, "grad_norm": 1.542671799659729, "learning_rate": 1.0869565217391305e-05, "loss": 0.4389, "step": 5000 }, { "epoch": 4.08, "grad_norm": 1.4532389640808105, "learning_rate": 1e-05, "loss": 0.4361, "step": 5100 }, { "epoch": 4.16, "grad_norm": 3.246967077255249, "learning_rate": 9.130434782608697e-06, "loss": 0.4452, "step": 5200 }, { "epoch": 4.24, "grad_norm": 0.9646230936050415, "learning_rate": 8.26086956521739e-06, "loss": 0.4239, "step": 5300 }, { "epoch": 4.32, "grad_norm": 1.250227451324463, "learning_rate": 7.391304347826088e-06, "loss": 0.4218, "step": 5400 }, { "epoch": 4.4, "grad_norm": 1.2563761472702026, "learning_rate": 6.521739130434783e-06, "loss": 0.431, "step": 5500 }, { "epoch": 4.48, "grad_norm": 0.9907436370849609, "learning_rate": 5.652173913043479e-06, "loss": 0.4297, "step": 5600 }, { "epoch": 4.5600000000000005, "grad_norm": 1.0406742095947266, "learning_rate": 4.782608695652174e-06, "loss": 0.4403, "step": 5700 
}, { "epoch": 4.64, "grad_norm": 1.2739229202270508, "learning_rate": 3.91304347826087e-06, "loss": 0.438, "step": 5800 }, { "epoch": 4.72, "grad_norm": 1.569171667098999, "learning_rate": 3.0434782608695654e-06, "loss": 0.4199, "step": 5900 }, { "epoch": 4.8, "grad_norm": 1.1578859090805054, "learning_rate": 2.173913043478261e-06, "loss": 0.4202, "step": 6000 }, { "epoch": 4.88, "grad_norm": 1.2546783685684204, "learning_rate": 1.3043478260869564e-06, "loss": 0.4451, "step": 6100 }, { "epoch": 4.96, "grad_norm": 0.965720534324646, "learning_rate": 4.347826086956522e-07, "loss": 0.4295, "step": 6200 } ], "logging_steps": 100, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.80591525888e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }