{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13321492007104796, "eval_steps": 50, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008880994671403197, "grad_norm": 0.04571289196610451, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.56671142578125, "logits/rejected": 15.112574577331543, "logps/chosen": -0.26506316661834717, "logps/rejected": -0.3439488410949707, "loss": 0.9267, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.39759472012519836, "rewards/margins": 0.11832849681377411, "rewards/rejected": -0.5159232020378113, "step": 10 }, { "epoch": 0.017761989342806393, "grad_norm": 0.0512714721262455, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.376543045043945, "logits/rejected": 14.862703323364258, "logps/chosen": -0.2708089351654053, "logps/rejected": -0.32412824034690857, "loss": 0.936, "rewards/accuracies": 0.5, "rewards/chosen": -0.4062133729457855, "rewards/margins": 0.07997899502515793, "rewards/rejected": -0.4861923158168793, "step": 20 }, { "epoch": 0.02664298401420959, "grad_norm": 0.058383647352457047, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.208717346191406, "logits/rejected": 15.370651245117188, "logps/chosen": -0.28206294775009155, "logps/rejected": -0.38387423753738403, "loss": 0.9215, "rewards/accuracies": 0.625, "rewards/chosen": -0.42309442162513733, "rewards/margins": 0.15271687507629395, "rewards/rejected": -0.5758112668991089, "step": 30 }, { "epoch": 0.035523978685612786, "grad_norm": 0.06262075155973434, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.768765449523926, "logits/rejected": 15.169331550598145, "logps/chosen": -0.27857059240341187, "logps/rejected": -0.3388269543647766, "loss": 0.9386, "rewards/accuracies": 0.5, "rewards/chosen": -0.4178559184074402, "rewards/margins": 0.09038447588682175, "rewards/rejected": -0.5082404017448425, "step": 40 }, { "epoch": 0.04440497335701599, "grad_norm": 0.06259036809206009, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.950456619262695, "logits/rejected": 15.232122421264648, "logps/chosen": -0.2961367070674896, "logps/rejected": -0.3322262465953827, "loss": 0.9317, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.44420504570007324, "rewards/margins": 0.054134320467710495, "rewards/rejected": -0.4983394145965576, "step": 50 }, { "epoch": 0.04440497335701599, "eval_logits/chosen": 14.56529426574707, "eval_logits/rejected": 14.895020484924316, "eval_logps/chosen": -0.2806546986103058, "eval_logps/rejected": -0.3486972451210022, "eval_loss": 0.9381324052810669, "eval_rewards/accuracies": 0.5274725556373596, "eval_rewards/chosen": -0.4209820330142975, "eval_rewards/margins": 0.10206379741430283, "eval_rewards/rejected": -0.5230458974838257, "eval_runtime": 25.2574, "eval_samples_per_second": 28.823, "eval_steps_per_second": 3.603, "step": 50 }, { "epoch": 0.05328596802841918, "grad_norm": 0.07301533967256546, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.195574760437012, "logits/rejected": 15.173194885253906, "logps/chosen": -0.2693648636341095, "logps/rejected": -0.33997970819473267, "loss": 0.9319, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.40404725074768066, "rewards/margins": 0.10592226684093475, "rewards/rejected": -0.5099694728851318, "step": 60 }, { "epoch": 0.06216696269982238, "grad_norm": 0.0659889206290245, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.910173416137695, "logits/rejected": 15.361429214477539, "logps/chosen": -0.28456225991249084, "logps/rejected": -0.3702812194824219, "loss": 0.9185, "rewards/accuracies": 0.625, "rewards/chosen": -0.42684346437454224, "rewards/margins": 0.12857840955257416, "rewards/rejected": -0.5554218292236328, "step": 70 }, { "epoch": 0.07104795737122557, "grad_norm": 0.05815625935792923, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.407182693481445, "logits/rejected": 14.948204040527344, "logps/chosen": -0.292889267206192, "logps/rejected": -0.3381648063659668, "loss": 0.9388, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.43933385610580444, "rewards/margins": 0.06791339069604874, "rewards/rejected": -0.5072472095489502, "step": 80 }, { "epoch": 0.07992895204262877, "grad_norm": 0.06627190113067627, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.622471809387207, "logits/rejected": 15.167770385742188, "logps/chosen": -0.28155821561813354, "logps/rejected": -0.33633899688720703, "loss": 0.9256, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4223373532295227, "rewards/margins": 0.08217118680477142, "rewards/rejected": -0.5045084953308105, "step": 90 }, { "epoch": 0.08880994671403197, "grad_norm": 0.0724545568227768, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.289724349975586, "logits/rejected": 14.882037162780762, "logps/chosen": -0.2791440486907959, "logps/rejected": -0.35329627990722656, "loss": 0.9374, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.41871610283851624, "rewards/margins": 0.11122839152812958, "rewards/rejected": -0.5299445390701294, "step": 100 }, { "epoch": 0.08880994671403197, "eval_logits/chosen": 14.337930679321289, "eval_logits/rejected": 14.689269065856934, "eval_logps/chosen": -0.2726942300796509, "eval_logps/rejected": -0.34668418765068054, "eval_loss": 0.9302808046340942, "eval_rewards/accuracies": 0.5384615659713745, "eval_rewards/chosen": -0.40904131531715393, "eval_rewards/margins": 0.11098497360944748, "eval_rewards/rejected": -0.5200263261795044, "eval_runtime": 25.2585, "eval_samples_per_second": 28.822, "eval_steps_per_second": 3.603, "step": 100 }, { "epoch": 0.09769094138543517, "grad_norm": 0.08156246691942215, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.499124526977539, "logits/rejected": 14.916313171386719, "logps/chosen": -0.2798352837562561, "logps/rejected": -0.3477734327316284, "loss": 0.9243, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4197530150413513, "rewards/margins": 0.10190720856189728, "rewards/rejected": -0.5216602087020874, "step": 110 }, { "epoch": 0.10657193605683836, "grad_norm": 0.08161844313144684, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.595013618469238, "logits/rejected": 14.390353202819824, "logps/chosen": -0.26682502031326294, "logps/rejected": -0.3336995542049408, "loss": 0.9123, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.400237500667572, "rewards/margins": 0.10031183809041977, "rewards/rejected": -0.5005493760108948, "step": 120 }, { "epoch": 0.11545293072824156, "grad_norm": 0.28624778985977173, "learning_rate": 4.907906416994146e-06, "logits/chosen": 13.711044311523438, "logits/rejected": 14.558542251586914, "logps/chosen": -0.27874043583869934, "logps/rejected": -0.3582325279712677, "loss": 0.9163, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.41811060905456543, "rewards/margins": 0.11923813819885254, "rewards/rejected": -0.537348747253418, "step": 130 }, { "epoch": 0.12433392539964476, "grad_norm": 0.10971464216709137, "learning_rate": 4.893298743830168e-06, "logits/chosen": 14.18798828125, "logits/rejected": 14.993026733398438, "logps/chosen": -0.2750400900840759, "logps/rejected": -0.39451608061790466, "loss": 0.9098, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4125601351261139, "rewards/margins": 0.17921395599842072, "rewards/rejected": -0.5917741060256958, "step": 140 }, { "epoch": 0.13321492007104796, "grad_norm": 0.09321591258049011, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.775139808654785, "logits/rejected": 13.751996994018555, "logps/chosen": -0.28446996212005615, "logps/rejected": -0.36404967308044434, "loss": 0.9104, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.42670494318008423, "rewards/margins": 0.11936960369348526, "rewards/rejected": -0.5460745096206665, "step": 150 }, { "epoch": 0.13321492007104796, "eval_logits/chosen": 12.97266960144043, "eval_logits/rejected": 13.47339916229248, "eval_logps/chosen": -0.27297571301460266, "eval_logps/rejected": -0.36854612827301025, "eval_loss": 0.9143257737159729, "eval_rewards/accuracies": 0.5824176073074341, "eval_rewards/chosen": -0.4094635546207428, "eval_rewards/margins": 0.14335563778877258, "eval_rewards/rejected": -0.5528191924095154, "eval_runtime": 25.2406, "eval_samples_per_second": 28.842, "eval_steps_per_second": 3.605, "step": 150 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.560389740760924e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }