{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.94915254237288, "eval_steps": 100, "global_step": 216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2711864406779661, "grad_norm": 0.5330628156661987, "learning_rate": 0.00039696155060244166, "loss": 3.0018, "step": 5 }, { "epoch": 0.5423728813559322, "grad_norm": 0.34631356596946716, "learning_rate": 0.0003879385241571817, "loss": 2.7414, "step": 10 }, { "epoch": 0.8135593220338984, "grad_norm": 0.4716702997684479, "learning_rate": 0.00037320508075688776, "loss": 2.6142, "step": 15 }, { "epoch": 1.0949152542372882, "grad_norm": 0.46511930227279663, "learning_rate": 0.0003532088886237956, "loss": 2.8757, "step": 20 }, { "epoch": 1.3661016949152542, "grad_norm": 0.5249255895614624, "learning_rate": 0.00032855752193730787, "loss": 2.392, "step": 25 }, { "epoch": 1.6372881355932203, "grad_norm": 0.5600208640098572, "learning_rate": 0.00030000000000000003, "loss": 2.3346, "step": 30 }, { "epoch": 1.9084745762711863, "grad_norm": 0.6260833144187927, "learning_rate": 0.00026840402866513377, "loss": 2.2452, "step": 35 }, { "epoch": 2.1898305084745764, "grad_norm": 0.7083396911621094, "learning_rate": 0.00023472963553338613, "loss": 2.5452, "step": 40 }, { "epoch": 2.461016949152542, "grad_norm": 0.5148910284042358, "learning_rate": 0.0002, "loss": 2.0905, "step": 45 }, { "epoch": 2.7322033898305085, "grad_norm": 0.5956751108169556, "learning_rate": 0.00016527036446661395, "loss": 2.1029, "step": 50 }, { "epoch": 3.013559322033898, "grad_norm": 1.3184939622879028, "learning_rate": 0.00013159597133486628, "loss": 2.3754, "step": 55 }, { "epoch": 3.2847457627118644, "grad_norm": 0.6270354390144348, "learning_rate": 0.00010000000000000005, "loss": 2.0626, "step": 60 }, { "epoch": 3.5559322033898306, "grad_norm": 0.6598231196403503, "learning_rate": 7.144247806269213e-05, "loss": 2.0278, "step": 65 }, { "epoch": 3.8271186440677964, "grad_norm": 0.5732411742210388, "learning_rate": 4.679111137620442e-05, "loss": 1.9843, "step": 70 }, { "epoch": 4.1084745762711865, "grad_norm": 0.6006195545196533, "learning_rate": 2.679491924311226e-05, "loss": 2.296, "step": 75 }, { "epoch": 4.379661016949153, "grad_norm": 0.5978462100028992, "learning_rate": 1.2061475842818338e-05, "loss": 1.9672, "step": 80 }, { "epoch": 4.650847457627119, "grad_norm": 0.5773534774780273, "learning_rate": 3.0384493975583962e-06, "loss": 1.9839, "step": 85 }, { "epoch": 4.922033898305084, "grad_norm": 0.618421196937561, "learning_rate": 0.0, "loss": 1.9694, "step": 90 }, { "epoch": 5.271186440677966, "grad_norm": 0.7557504773139954, "learning_rate": 0.0001463654774478725, "loss": 1.9701, "step": 95 }, { "epoch": 5.5423728813559325, "grad_norm": 0.7439705729484558, "learning_rate": 0.00012796445503905797, "loss": 1.944, "step": 100 }, { "epoch": 5.5423728813559325, "eval_loss": 2.1080985069274902, "eval_runtime": 54.8392, "eval_samples_per_second": 10.74, "eval_steps_per_second": 1.349, "step": 100 }, { "epoch": 5.813559322033898, "grad_norm": 0.866310179233551, "learning_rate": 0.00011024016395990757, "loss": 1.9449, "step": 105 }, { "epoch": 6.094915254237288, "grad_norm": 0.7313957810401917, "learning_rate": 9.33591134396618e-05, "loss": 2.2185, "step": 110 }, { "epoch": 6.366101694915255, "grad_norm": 0.7987199425697327, "learning_rate": 7.747989096135944e-05, "loss": 1.9012, "step": 115 }, { "epoch": 6.63728813559322, "grad_norm": 0.7827264666557312, "learning_rate": 6.27516724262533e-05, "loss": 1.8952, "step": 120 }, { "epoch": 6.908474576271186, "grad_norm": 0.8806806206703186, "learning_rate": 4.931282073446786e-05, "loss": 1.8705, "step": 125 }, { "epoch": 7.189830508474576, "grad_norm": 0.7403010129928589, "learning_rate": 3.72895859474065e-05, "loss": 2.1358, "step": 130 }, { "epoch": 7.461016949152542, "grad_norm": 0.744999349117279, "learning_rate": 2.679491924311226e-05, "loss": 1.831, "step": 135 }, { "epoch": 7.7322033898305085, "grad_norm": 0.7374749183654785, "learning_rate": 1.792741180677069e-05, "loss": 1.8739, "step": 140 }, { "epoch": 8.013559322033899, "grad_norm": 1.6001635789871216, "learning_rate": 1.0770368624849948e-05, "loss": 2.1764, "step": 145 }, { "epoch": 8.284745762711864, "grad_norm": 0.7245858311653137, "learning_rate": 5.391025884035239e-06, "loss": 1.8174, "step": 150 }, { "epoch": 8.55593220338983, "grad_norm": 0.6350526213645935, "learning_rate": 1.8399193270309367e-06, "loss": 1.8341, "step": 155 }, { "epoch": 8.827118644067797, "grad_norm": 0.7198626399040222, "learning_rate": 1.5040949915399172e-07, "loss": 1.854, "step": 160 }, { "epoch": 9.162711864406779, "grad_norm": 0.7952471971511841, "learning_rate": 5.2544532637975206e-05, "loss": 1.8478, "step": 165 }, { "epoch": 9.433898305084746, "grad_norm": 0.8388130068778992, "learning_rate": 4.311686701608486e-05, "loss": 1.8267, "step": 170 }, { "epoch": 9.705084745762711, "grad_norm": 0.7769240736961365, "learning_rate": 3.451851177169789e-05, "loss": 1.8362, "step": 175 }, { "epoch": 9.976271186440679, "grad_norm": 0.8936498761177063, "learning_rate": 2.679491924311226e-05, "loss": 1.8369, "step": 180 }, { "epoch": 10.257627118644068, "grad_norm": 0.7819103598594666, "learning_rate": 1.998691762717577e-05, "loss": 2.0798, "step": 185 }, { "epoch": 10.528813559322034, "grad_norm": 0.8176268935203552, "learning_rate": 1.4130495154635493e-05, "loss": 1.8395, "step": 190 }, { "epoch": 10.8, "grad_norm": 0.7900301814079285, "learning_rate": 9.256609850354636e-06, "loss": 1.7801, "step": 195 }, { "epoch": 11.08135593220339, "grad_norm": 0.7704123258590698, "learning_rate": 5.391025884035239e-06, "loss": 2.1206, "step": 200 }, { "epoch": 11.08135593220339, "eval_loss": 2.0445444583892822, "eval_runtime": 53.6802, "eval_samples_per_second": 10.972, "eval_steps_per_second": 1.379, "step": 200 }, { "epoch": 11.352542372881356, "grad_norm": 0.702042818069458, "learning_rate": 2.554177376525191e-06, "loss": 1.7821, "step": 205 }, { "epoch": 11.623728813559323, "grad_norm": 0.7518762946128845, "learning_rate": 7.61060381650891e-07, "loss": 1.8043, "step": 210 }, { "epoch": 11.894915254237288, "grad_norm": 0.7772343754768372, "learning_rate": 2.1153614578506286e-08, "loss": 1.8231, "step": 215 } ], "logging_steps": 5, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5355128486546637e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }