{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9724770642201834, "eval_steps": 40, "global_step": 81, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1834862385321101, "grad_norm": 87.98338604183625, "learning_rate": 5e-07, "logits/chosen": -2.7504944801330566, "logits/rejected": -2.7376608848571777, "logps/chosen": -366.7567138671875, "logps/rejected": -269.1701965332031, "loss": 0.684, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.036848217248916626, "rewards/margins": 0.02083454094827175, "rewards/rejected": 0.016013674437999725, "step": 5 }, { "epoch": 0.3669724770642202, "grad_norm": 56.51853547131025, "learning_rate": 1e-06, "logits/chosen": -2.680494785308838, "logits/rejected": -2.666748285293579, "logps/chosen": -349.07464599609375, "logps/rejected": -251.41049194335938, "loss": 0.5321, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.2032493352890015, "rewards/margins": 0.6572977900505066, "rewards/rejected": 0.5459514260292053, "step": 10 }, { "epoch": 0.5504587155963303, "grad_norm": 50.68732493997359, "learning_rate": 9.878131657762535e-07, "logits/chosen": -2.505305290222168, "logits/rejected": -2.4723613262176514, "logps/chosen": -344.2197265625, "logps/rejected": -244.26724243164062, "loss": 0.4596, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 3.3816330432891846, "rewards/margins": 1.7416467666625977, "rewards/rejected": 1.639986276626587, "step": 15 }, { "epoch": 0.7339449541284404, "grad_norm": 48.68084600724118, "learning_rate": 9.518467388186019e-07, "logits/chosen": -2.3610739707946777, "logits/rejected": -2.312293529510498, "logps/chosen": -348.7062072753906, "logps/rejected": -251.2678985595703, "loss": 0.4779, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.990943431854248, "rewards/margins": 2.4283714294433594, "rewards/rejected": 1.5625723600387573, "step": 20 }, { "epoch": 0.9174311926605505, "grad_norm": 43.576923617976306, "learning_rate": 8.938539866588592e-07, "logits/chosen": -2.161365509033203, "logits/rejected": -2.169524669647217, "logps/chosen": -337.71240234375, "logps/rejected": -223.12796020507812, "loss": 0.4429, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.8454651832580566, "rewards/margins": 2.679399013519287, "rewards/rejected": 1.16606605052948, "step": 25 }, { "epoch": 1.1009174311926606, "grad_norm": 15.165089558011688, "learning_rate": 8.166619015240235e-07, "logits/chosen": -2.103114366531372, "logits/rejected": -2.0902843475341797, "logps/chosen": -333.57958984375, "logps/rejected": -255.9213409423828, "loss": 0.2402, "rewards/accuracies": 0.90625, "rewards/chosen": 4.685088157653809, "rewards/margins": 3.5494351387023926, "rewards/rejected": 1.135652780532837, "step": 30 }, { "epoch": 1.2844036697247707, "grad_norm": 16.2176567972852, "learning_rate": 7.240333919937892e-07, "logits/chosen": -2.1111814975738525, "logits/rejected": -2.107719898223877, "logps/chosen": -336.36669921875, "logps/rejected": -258.0069885253906, "loss": 0.0996, "rewards/accuracies": 0.96875, "rewards/chosen": 4.797186374664307, "rewards/margins": 4.8145318031311035, "rewards/rejected": -0.01734566129744053, "step": 35 }, { "epoch": 1.4678899082568808, "grad_norm": 23.35951964253066, "learning_rate": 6.204838512283071e-07, "logits/chosen": -2.1685638427734375, "logits/rejected": -2.134986400604248, "logps/chosen": -321.1785583496094, "logps/rejected": -274.7965087890625, "loss": 0.1046, "rewards/accuracies": 0.96875, "rewards/chosen": 4.402096271514893, "rewards/margins": 4.899113178253174, "rewards/rejected": -0.4970162808895111, "step": 40 }, { "epoch": 1.4678899082568808, "eval_logits/chosen": -2.157338857650757, "eval_logits/rejected": -2.1405954360961914, "eval_logps/chosen": -318.1454772949219, "eval_logps/rejected": -250.12286376953125, "eval_loss": 0.41041094064712524, "eval_rewards/accuracies": 0.8557692170143127, "eval_rewards/chosen": 3.198086977005005, "eval_rewards/margins": 2.8902511596679688, "eval_rewards/rejected": 0.30783578753471375, "eval_runtime": 54.4376, "eval_samples_per_second": 14.126, "eval_steps_per_second": 0.239, "step": 40 }, { "epoch": 1.6513761467889907, "grad_norm": 15.026762247253014, "learning_rate": 5.110610435765934e-07, "logits/chosen": -2.1868062019348145, "logits/rejected": -2.1655430793762207, "logps/chosen": -328.25323486328125, "logps/rejected": -256.1419677734375, "loss": 0.1147, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 5.091620445251465, "rewards/margins": 5.604047775268555, "rewards/rejected": -0.5124271512031555, "step": 45 }, { "epoch": 1.834862385321101, "grad_norm": 20.207846152861585, "learning_rate": 4.010990395072413e-07, "logits/chosen": -2.1973319053649902, "logits/rejected": -2.182528018951416, "logps/chosen": -318.63800048828125, "logps/rejected": -282.4230041503906, "loss": 0.1138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.645673751831055, "rewards/margins": 5.373934268951416, "rewards/rejected": -0.7282606959342957, "step": 50 }, { "epoch": 2.018348623853211, "grad_norm": 11.21625290323588, "learning_rate": 2.9595819387826747e-07, "logits/chosen": -2.2084720134735107, "logits/rejected": -2.195996046066284, "logps/chosen": -319.896484375, "logps/rejected": -269.0523986816406, "loss": 0.0766, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 5.313421726226807, "rewards/margins": 5.936707496643066, "rewards/rejected": -0.6232857704162598, "step": 55 }, { "epoch": 2.2018348623853212, "grad_norm": 12.136370494834754, "learning_rate": 2.0076384291297133e-07, "logits/chosen": -2.221501588821411, "logits/rejected": -2.2167458534240723, "logps/chosen": -319.49652099609375, "logps/rejected": -252.19302368164062, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 5.2415971755981445, "rewards/margins": 5.633230686187744, "rewards/rejected": -0.39163410663604736, "step": 60 }, { "epoch": 2.385321100917431, "grad_norm": 12.64835328707971, "learning_rate": 1.2015645770835764e-07, "logits/chosen": -2.235281467437744, "logits/rejected": -2.1987690925598145, "logps/chosen": -336.44158935546875, "logps/rejected": -248.538330078125, "loss": 0.0343, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 5.285120487213135, "rewards/margins": 5.766313552856445, "rewards/rejected": -0.48119330406188965, "step": 65 }, { "epoch": 2.5688073394495414, "grad_norm": 9.209595589190794, "learning_rate": 5.806543362721944e-08, "logits/chosen": -2.2242534160614014, "logits/rejected": -2.2229883670806885, "logps/chosen": -326.62701416015625, "logps/rejected": -266.9122619628906, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 5.088566780090332, "rewards/margins": 5.991884708404541, "rewards/rejected": -0.9033180475234985, "step": 70 }, { "epoch": 2.7522935779816513, "grad_norm": 6.119217367247741, "learning_rate": 1.751754273859507e-08, "logits/chosen": -2.2509617805480957, "logits/rejected": -2.230938673019409, "logps/chosen": -326.35186767578125, "logps/rejected": -290.31903076171875, "loss": 0.0466, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 5.807272911071777, "rewards/margins": 6.667996883392334, "rewards/rejected": -0.8607242703437805, "step": 75 }, { "epoch": 2.9357798165137616, "grad_norm": 13.26496056644377, "learning_rate": 4.893867400131979e-10, "logits/chosen": -2.244044780731201, "logits/rejected": -2.239701986312866, "logps/chosen": -341.17584228515625, "logps/rejected": -274.3210144042969, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 5.888747215270996, "rewards/margins": 6.674208641052246, "rewards/rejected": -0.7854617238044739, "step": 80 }, { "epoch": 2.9357798165137616, "eval_logits/chosen": -2.2088987827301025, "eval_logits/rejected": -2.196852445602417, "eval_logps/chosen": -315.8323669433594, "eval_logps/rejected": -252.4415740966797, "eval_loss": 0.4136447310447693, "eval_rewards/accuracies": 0.8557692170143127, "eval_rewards/chosen": 3.4293932914733887, "eval_rewards/margins": 3.3534319400787354, "eval_rewards/rejected": 0.07596174627542496, "eval_runtime": 54.1579, "eval_samples_per_second": 14.199, "eval_steps_per_second": 0.24, "step": 80 }, { "epoch": 2.9724770642201834, "step": 81, "total_flos": 954757539692544.0, "train_loss": 0.21779433532077588, "train_runtime": 2944.3137, "train_samples_per_second": 7.049, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 81, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 954757539692544.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }