{ "best_metric": 3.1834473609924316, "best_model_checkpoint": "./snap_diff_llama/diff_llama_410m_mha/checkpoint-13000", "epoch": 0.0505, "eval_steps": 1000, "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.1428571428571426e-07, "loss": 11.0337, "step": 1 }, { "epoch": 0.07, "learning_rate": 0.00021428571428571427, "loss": 6.4482, "step": 1000 }, { "epoch": 0.07, "eval_loss": 4.9442644119262695, "eval_ppl": 140.36756016320973, "eval_runtime": 30.3546, "eval_samples_per_second": 16.472, "eval_steps_per_second": 0.132, "step": 1000 }, { "epoch": 0.14, "learning_rate": 0.0002983246239337692, "loss": 4.3801, "step": 2000 }, { "epoch": 0.14, "eval_loss": 4.004600524902344, "eval_ppl": 54.84990884930932, "eval_runtime": 29.9124, "eval_samples_per_second": 16.715, "eval_steps_per_second": 0.134, "step": 2000 }, { "epoch": 0.21, "learning_rate": 0.00028822143178056114, "loss": 3.794, "step": 3000 }, { "epoch": 0.21, "eval_loss": 3.629385232925415, "eval_ppl": 37.6896391443392, "eval_runtime": 29.7452, "eval_samples_per_second": 16.809, "eval_steps_per_second": 0.134, "step": 3000 }, { "epoch": 0.29, "learning_rate": 0.0002695698760834384, "loss": 3.5789, "step": 4000 }, { "epoch": 0.29, "eval_loss": 3.499099016189575, "eval_ppl": 33.08562890966227, "eval_runtime": 29.9268, "eval_samples_per_second": 16.707, "eval_steps_per_second": 0.134, "step": 4000 }, { "epoch": 0.04, "learning_rate": 0.00024352347027881003, "loss": 3.4461, "step": 5000 }, { "epoch": 0.04, "eval_loss": 3.4322781562805176, "eval_ppl": 30.947064752985586, "eval_runtime": 30.1629, "eval_samples_per_second": 16.577, "eval_steps_per_second": 0.133, "step": 5000 }, { "epoch": 0.11, "learning_rate": 0.00021169306546959174, "loss": 3.3611, "step": 6000 }, { "epoch": 0.11, "eval_loss": 3.3286616802215576, "eval_ppl": 27.90097627697978, "eval_runtime": 30.5776, "eval_samples_per_second": 16.352, "eval_steps_per_second": 0.131, "step": 6000 }, { "epoch": 0.18, "learning_rate": 0.00017604722665003956, "loss": 3.3525, "step": 7000 }, { "epoch": 0.18, "eval_loss": 3.307777166366577, "eval_ppl": 27.324320498654966, "eval_runtime": 30.3366, "eval_samples_per_second": 16.482, "eval_steps_per_second": 0.132, "step": 7000 }, { "epoch": 0.25, "learning_rate": 0.00013879048596203636, "loss": 3.2975, "step": 8000 }, { "epoch": 0.25, "eval_loss": 3.2553489208221436, "eval_ppl": 25.928660005196736, "eval_runtime": 30.0498, "eval_samples_per_second": 16.639, "eval_steps_per_second": 0.133, "step": 8000 }, { "epoch": 0.01, "learning_rate": 0.00010222700246224735, "loss": 3.2508, "step": 9000 }, { "epoch": 0.01, "eval_loss": 3.2358081340789795, "eval_ppl": 25.426911832146725, "eval_runtime": 30.3885, "eval_samples_per_second": 16.454, "eval_steps_per_second": 0.132, "step": 9000 }, { "epoch": 0.08, "learning_rate": 6.86180604201361e-05, "loss": 3.2163, "step": 10000 }, { "epoch": 0.08, "eval_loss": 3.2036635875701904, "eval_ppl": 24.622572107305388, "eval_runtime": 30.4758, "eval_samples_per_second": 16.406, "eval_steps_per_second": 0.131, "step": 10000 }, { "epoch": 0.15, "learning_rate": 4.004221922552608e-05, "loss": 3.2326, "step": 11000 }, { "epoch": 0.15, "eval_loss": 3.188652753829956, "eval_ppl": 24.255726984975077, "eval_runtime": 29.8586, "eval_samples_per_second": 16.746, "eval_steps_per_second": 0.134, "step": 11000 }, { "epoch": 0.22, "learning_rate": 1.82667639944657e-05, "loss": 3.2173, "step": 12000 }, { "epoch": 0.22, "eval_loss": 3.1875133514404297, "eval_ppl": 24.228105690561378, "eval_runtime": 29.7409, "eval_samples_per_second": 16.812, "eval_steps_per_second": 0.134, "step": 12000 }, { "epoch": 0.3, "learning_rate": 1e-05, "loss": 3.2315, "step": 13000 }, { "epoch": 0.3, "eval_loss": 3.1834473609924316, "eval_ppl": 24.129794445983528, "eval_runtime": 29.6158, "eval_samples_per_second": 16.883, "eval_steps_per_second": 0.135, "step": 13000 }, { "epoch": 0.05, "learning_rate": 1e-05, "loss": 3.2219, "step": 14000 }, { "epoch": 0.05, "eval_loss": 3.184858560562134, "eval_ppl": 24.163870439881514, "eval_runtime": 30.4568, "eval_samples_per_second": 16.417, "eval_steps_per_second": 0.131, "step": 14000 }, { "epoch": 0.05, "step": 14000, "total_flos": 2.0002483519409357e+19, "train_loss": 0.16270504324776786, "train_runtime": 2272.7407, "train_samples_per_second": 1576.951, "train_steps_per_second": 6.16 } ], "logging_steps": 1000, "max_steps": 14000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "total_flos": 2.0002483519409357e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }