{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15991471215351813, "eval_steps": 50, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010660980810234541, "grad_norm": 0.051327500492334366, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.755006790161133, "logits/rejected": 14.735244750976562, "logps/chosen": -0.29377540946006775, "logps/rejected": -0.30969956517219543, "loss": 0.952, "rewards/accuracies": 0.4375, "rewards/chosen": -0.44066309928894043, "rewards/margins": 0.023886267095804214, "rewards/rejected": -0.46454939246177673, "step": 10 }, { "epoch": 0.021321961620469083, "grad_norm": 0.04346882924437523, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.513801574707031, "logits/rejected": 14.946454048156738, "logps/chosen": -0.27995699644088745, "logps/rejected": -0.30138006806373596, "loss": 0.9726, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.4199354648590088, "rewards/margins": 0.03213457390666008, "rewards/rejected": -0.45207005739212036, "step": 20 }, { "epoch": 0.031982942430703626, "grad_norm": 0.05228634551167488, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.266324043273926, "logits/rejected": 14.423965454101562, "logps/chosen": -0.2919609546661377, "logps/rejected": -0.32358455657958984, "loss": 0.9622, "rewards/accuracies": 0.5, "rewards/chosen": -0.43794146180152893, "rewards/margins": 0.047435395419597626, "rewards/rejected": -0.48537683486938477, "step": 30 }, { "epoch": 0.042643923240938165, "grad_norm": 0.05487598106265068, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.965211868286133, "logits/rejected": 15.058088302612305, "logps/chosen": -0.277716726064682, "logps/rejected": -0.3055034577846527, "loss": 0.9403, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.4165751039981842, "rewards/margins": 0.04168009012937546, "rewards/rejected": -0.4582551419734955, "step": 40 }, { "epoch": 0.053304904051172705, "grad_norm": 0.057255037128925323, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.539288520812988, "logits/rejected": 15.174041748046875, "logps/chosen": -0.26362231373786926, "logps/rejected": -0.3325727581977844, "loss": 0.9588, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.3954334557056427, "rewards/margins": 0.10342560708522797, "rewards/rejected": -0.49885907769203186, "step": 50 }, { "epoch": 0.053304904051172705, "eval_logits/chosen": 14.618952751159668, "eval_logits/rejected": 15.176809310913086, "eval_logps/chosen": -0.2685677409172058, "eval_logps/rejected": -0.3283654451370239, "eval_loss": 0.9551004767417908, "eval_rewards/accuracies": 0.5131579041481018, "eval_rewards/chosen": -0.4028516113758087, "eval_rewards/margins": 0.08969658613204956, "eval_rewards/rejected": -0.4925481975078583, "eval_runtime": 21.4453, "eval_samples_per_second": 28.305, "eval_steps_per_second": 3.544, "step": 50 }, { "epoch": 0.06396588486140725, "grad_norm": 0.05227242782711983, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.787714004516602, "logits/rejected": 15.379422187805176, "logps/chosen": -0.3143109679222107, "logps/rejected": -0.3425135612487793, "loss": 0.9636, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4714665412902832, "rewards/margins": 0.042303841561079025, "rewards/rejected": -0.513770341873169, "step": 60 }, { "epoch": 0.07462686567164178, "grad_norm": 0.0658508762717247, "learning_rate": 4.973180832407471e-06, "logits/chosen": 15.149365425109863, "logits/rejected": 15.115835189819336, "logps/chosen": -0.31501108407974243, "logps/rejected": -0.2854115962982178, "loss": 0.9677, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.47251659631729126, "rewards/margins": -0.04439922422170639, "rewards/rejected": -0.4281173646450043, "step": 70 }, { "epoch": 0.08528784648187633, "grad_norm": 0.06567618995904922, "learning_rate": 4.964990092676263e-06, "logits/chosen": 15.393908500671387, "logits/rejected": 15.454248428344727, "logps/chosen": -0.31166282296180725, "logps/rejected": -0.3178747594356537, "loss": 0.9609, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.4674941897392273, "rewards/margins": 0.009317949414253235, "rewards/rejected": -0.47681212425231934, "step": 80 }, { "epoch": 0.09594882729211088, "grad_norm": 0.07566913962364197, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 15.229632377624512, "logits/rejected": 15.477168083190918, "logps/chosen": -0.3294064998626709, "logps/rejected": -0.3528878390789032, "loss": 0.9587, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.49410971999168396, "rewards/margins": 0.03522203490138054, "rewards/rejected": -0.5293318033218384, "step": 90 }, { "epoch": 0.10660980810234541, "grad_norm": 0.09082464128732681, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.481330871582031, "logits/rejected": 15.092982292175293, "logps/chosen": -0.2656436562538147, "logps/rejected": -0.33982905745506287, "loss": 0.9548, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39846545457839966, "rewards/margins": 0.11127817630767822, "rewards/rejected": -0.5097435712814331, "step": 100 }, { "epoch": 0.10660980810234541, "eval_logits/chosen": 14.7100830078125, "eval_logits/rejected": 15.274725914001465, "eval_logps/chosen": -0.26462864875793457, "eval_logps/rejected": -0.331702321767807, "eval_loss": 0.947841465473175, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.39694297313690186, "eval_rewards/margins": 0.10061051696538925, "eval_rewards/rejected": -0.4975534677505493, "eval_runtime": 21.4421, "eval_samples_per_second": 28.309, "eval_steps_per_second": 3.544, "step": 100 }, { "epoch": 0.11727078891257996, "grad_norm": 0.20198923349380493, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.932653427124023, "logits/rejected": 15.476409912109375, "logps/chosen": -0.27830976247787476, "logps/rejected": -0.34150317311286926, "loss": 0.9487, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.41746464371681213, "rewards/margins": 0.09479012340307236, "rewards/rejected": -0.5122548341751099, "step": 110 }, { "epoch": 0.1279317697228145, "grad_norm": 0.31938356161117554, "learning_rate": 4.921457902821578e-06, "logits/chosen": 15.280967712402344, "logits/rejected": 15.5416259765625, "logps/chosen": -0.2816022038459778, "logps/rejected": -0.3262938857078552, "loss": 0.9483, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.4224032461643219, "rewards/margins": 0.06703753769397736, "rewards/rejected": -0.48944082856178284, "step": 120 }, { "epoch": 0.13859275053304904, "grad_norm": 0.12567812204360962, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.967382431030273, "logits/rejected": 15.351877212524414, "logps/chosen": -0.3148510456085205, "logps/rejected": -0.3488944172859192, "loss": 0.957, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.47227659821510315, "rewards/margins": 0.05106503888964653, "rewards/rejected": -0.5233416557312012, "step": 130 }, { "epoch": 0.14925373134328357, "grad_norm": 0.09151162207126617, "learning_rate": 4.893298743830168e-06, "logits/chosen": 14.900466918945312, "logits/rejected": 15.075350761413574, "logps/chosen": -0.2766302227973938, "logps/rejected": -0.312236487865448, "loss": 0.9373, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.4149452745914459, "rewards/margins": 0.05340944975614548, "rewards/rejected": -0.4683547616004944, "step": 140 }, { "epoch": 0.15991471215351813, "grad_norm": 0.1259378045797348, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 14.528109550476074, "logits/rejected": 14.861102104187012, "logps/chosen": -0.2683579921722412, "logps/rejected": -0.33838269114494324, "loss": 0.9388, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.40253695845603943, "rewards/margins": 0.10503707826137543, "rewards/rejected": -0.5075740218162537, "step": 150 }, { "epoch": 0.15991471215351813, "eval_logits/chosen": 14.12246036529541, "eval_logits/rejected": 14.733266830444336, "eval_logps/chosen": -0.2611957788467407, "eval_logps/rejected": -0.3492279350757599, "eval_loss": 0.9302574396133423, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": -0.3917936384677887, "eval_rewards/margins": 0.13204820454120636, "eval_rewards/rejected": -0.5238418579101562, "eval_runtime": 21.4406, "eval_samples_per_second": 28.311, "eval_steps_per_second": 3.545, "step": 150 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.602224826757939e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }