diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,27034 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 36000, + "epoch": 0.07189847934716181, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 24, + "epoch": 4.793231956477454e-05, + "loss/policy_avg": -0.013041017577052116, + "lr": 3e-06, + "objective/entropy": 96.43083190917969, + "objective/kl": 0.9096126556396484, + "objective/non_score_reward": -0.0454806312918663, + "objective/rlhf_reward": -0.27288377098739147, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6026546955108643, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4345703125, + "step": 0, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016520023345947 + }, + { + "episode": 48, + "epoch": 9.586463912954908e-05, + "loss/policy_avg": 0.010270234197378159, + "lr": 2.9997124233128835e-06, + "objective/entropy": 90.00291442871094, + "objective/kl": 1.4358863830566406, + "objective/non_score_reward": -0.07179431617259979, + "objective/rlhf_reward": 2.569234097842127, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.944652557373047, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.38671875, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005974769592285 + }, + { + "episode": 72, + "epoch": 0.00014379695869432362, + "loss/policy_avg": 0.022733237594366074, + "lr": 2.999424846625767e-06, + "objective/entropy": 78.91166687011719, + "objective/kl": 2.956300735473633, + "objective/non_score_reward": -0.1478150486946106, + "objective/rlhf_reward": -0.8868902213871479, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.658272743225098, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4052734375, + "step": 2, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998960018157959 + }, + { + "episode": 96, + "epoch": 0.00019172927825909816, + "loss/policy_avg": 0.007084307726472616, + "lr": 2.9991372699386504e-06, + "objective/entropy": 77.19866180419922, + "objective/kl": 1.919674277305603, + "objective/non_score_reward": -0.09598371386528015, + "objective/rlhf_reward": 2.4240977559238672, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.6697306632995605, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3466796875, + "step": 3, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995243549346924 + }, + { + "episode": 120, + "epoch": 0.0002396615978238727, + "loss/policy_avg": 0.015304439701139927, + "lr": 2.9988496932515338e-06, + "objective/entropy": 121.04771423339844, + "objective/kl": 1.123799204826355, + "objective/non_score_reward": -0.05618995428085327, + "objective/rlhf_reward": -0.33713974617421627, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.481837749481201, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.529296875, + "step": 4, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002387046813965 + }, + { + "episode": 144, + "epoch": 0.00028759391738864725, + "loss/policy_avg": 0.039275527000427246, + "lr": 2.998562116564417e-06, + "objective/entropy": 92.93133544921875, + "objective/kl": 1.780539870262146, + "objective/non_score_reward": -0.08902700245380402, + "objective/rlhf_reward": -0.5341619625687599, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.650364637374878, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 5, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987859725952148 + }, + { + "episode": 168, + "epoch": 0.0003355262369534218, + "loss/policy_avg": 0.013892962597310543, + "lr": 2.9982745398773006e-06, + "objective/entropy": 122.23117065429688, + "objective/kl": 0.5922457575798035, + "objective/non_score_reward": -0.029612286016345024, + "objective/rlhf_reward": 1.7151155427536573, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.10055673122406, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.578125, + "step": 6, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000465154647827 + }, + { + "episode": 192, + "epoch": 0.0003834585565181963, + "loss/policy_avg": 0.02055109664797783, + "lr": 2.9979869631901845e-06, + "objective/entropy": 115.41616821289062, + "objective/kl": 0.5446109175682068, + "objective/non_score_reward": -0.027230549603700638, + "objective/rlhf_reward": 1.973859845515026, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.757310390472412, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5, + "step": 7, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989192485809326 + }, + { + "episode": 216, + "epoch": 0.00043139087608297085, + "loss/policy_avg": -0.0146332997828722, + "lr": 2.9976993865030675e-06, + "objective/entropy": 116.53536224365234, + "objective/kl": 1.0814995765686035, + "objective/non_score_reward": -0.054074980318546295, + "objective/rlhf_reward": -0.32444986142218113, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.292798042297363, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.515625, + "step": 8, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002993583679199 + }, + { + "episode": 240, + "epoch": 0.0004793231956477454, + "loss/policy_avg": 0.0008333213627338409, + "lr": 2.997411809815951e-06, + "objective/entropy": 99.98373413085938, + "objective/kl": 1.5731289386749268, + "objective/non_score_reward": -0.07865644991397858, + "objective/rlhf_reward": -0.4719386510550976, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.496209144592285, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4599609375, + "step": 9, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984419345855713 + }, + { + "episode": 264, + "epoch": 0.0005272555152125199, + "loss/policy_avg": 0.02827448770403862, + "lr": 2.9971242331288343e-06, + "objective/entropy": 84.26155090332031, + "objective/kl": 0.8945560455322266, + "objective/non_score_reward": -0.04472780227661133, + "objective/rlhf_reward": -0.26836680620908737, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.9589601159095764, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.369140625, + "step": 10, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0027925968170166 + }, + { + "episode": 288, + "epoch": 0.0005751878347772945, + "loss/policy_avg": -0.002038330305367708, + "lr": 2.9968366564417178e-06, + "objective/entropy": 59.743736267089844, + "objective/kl": 1.4409098625183105, + "objective/non_score_reward": -0.07204549759626389, + "objective/rlhf_reward": 2.5677270144224167, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.32855224609375, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 11, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0015673637390137 + }, + { + "episode": 312, + "epoch": 0.000623120154342069, + "loss/policy_avg": 0.001968122087419033, + "lr": 2.996549079754601e-06, + "objective/entropy": 87.41310119628906, + "objective/kl": 1.9559903144836426, + "objective/non_score_reward": -0.09779952466487885, + "objective/rlhf_reward": 1.734319725220299, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.4970078468322754, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.412109375, + "step": 12, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987527132034302 + }, + { + "episode": 336, + "epoch": 0.0006710524739068436, + "loss/policy_avg": 0.03522108495235443, + "lr": 2.9962615030674846e-06, + "objective/entropy": 108.93758392333984, + "objective/kl": 2.577143669128418, + "objective/non_score_reward": -0.12885719537734985, + "objective/rlhf_reward": 1.1196461517802085, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.357511520385742, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.435546875, + "step": 13, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997353553771973 + }, + { + "episode": 360, + "epoch": 0.000718984793471618, + "loss/policy_avg": -0.02092193439602852, + "lr": 2.995973926380368e-06, + "objective/entropy": 101.91810607910156, + "objective/kl": 1.7172842025756836, + "objective/non_score_reward": -0.08586421608924866, + "objective/rlhf_reward": -0.5151853114366531, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.758852005004883, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.47265625, + "step": 14, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002822875976562 + }, + { + "episode": 384, + "epoch": 0.0007669171130363926, + "loss/policy_avg": 0.052892543375492096, + "lr": 2.9956863496932515e-06, + "objective/entropy": 98.4802474975586, + "objective/kl": 0.05314210057258606, + "objective/non_score_reward": -0.002657103817909956, + "objective/rlhf_reward": -0.01594262197613716, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.809172630310059, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.404296875, + "step": 15, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018439292907715 + }, + { + "episode": 408, + "epoch": 0.0008148494326011671, + "loss/policy_avg": -0.009771937504410744, + "lr": 2.995398773006135e-06, + "objective/entropy": 97.52227783203125, + "objective/kl": 1.2108625173568726, + "objective/non_score_reward": -0.060543131083250046, + "objective/rlhf_reward": 1.9578580680836204, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.177501201629639, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.427734375, + "step": 16, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004074573516846 + }, + { + "episode": 432, + "epoch": 0.0008627817521659417, + "loss/policy_avg": 0.004377430770546198, + "lr": 2.9951111963190187e-06, + "objective/entropy": 90.71334075927734, + "objective/kl": 2.2206411361694336, + "objective/non_score_reward": -0.11103205382823944, + "objective/rlhf_reward": 1.0681966293499552, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.2645750045776367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41796875, + "step": 17, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994053840637207 + }, + { + "episode": 456, + "epoch": 0.0009107140717307163, + "loss/policy_avg": 0.040060993283987045, + "lr": 2.994823619631902e-06, + "objective/entropy": 86.31655883789062, + "objective/kl": 1.4693106412887573, + "objective/non_score_reward": -0.07346553355455399, + "objective/rlhf_reward": -0.4407931864261627, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.526700496673584, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4052734375, + "step": 18, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998227596282959 + }, + { + "episode": 480, + "epoch": 0.0009586463912954908, + "loss/policy_avg": 0.023717161267995834, + "lr": 2.9945360429447856e-06, + "objective/entropy": 112.423828125, + "objective/kl": 2.577183246612549, + "objective/non_score_reward": -0.12885916233062744, + "objective/rlhf_reward": 3.0124234841150455, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.664585113525391, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.498046875, + "step": 19, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988532066345215 + }, + { + "episode": 504, + "epoch": 0.0010065787108602653, + "loss/policy_avg": 0.009363665245473385, + "lr": 2.994248466257669e-06, + "objective/entropy": 118.11949157714844, + "objective/kl": 0.3790988624095917, + "objective/non_score_reward": -0.018954943865537643, + "objective/rlhf_reward": 2.886270336806774, + "objective/scores": 0.5, + "policy/approxkl_avg": 0.6107279658317566, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.615234375, + "step": 20, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0039258003234863 + }, + { + "episode": 528, + "epoch": 0.0010545110304250398, + "loss/policy_avg": -0.004461180418729782, + "lr": 2.9939608895705524e-06, + "objective/entropy": 108.08396911621094, + "objective/kl": 2.7581233978271484, + "objective/non_score_reward": -0.13790617883205414, + "objective/rlhf_reward": -0.8274370171129704, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2817142009735107, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.46875, + "step": 21, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0034942626953125 + }, + { + "episode": 552, + "epoch": 0.0011024433499898143, + "loss/policy_avg": 0.03343503549695015, + "lr": 2.993673312883436e-06, + "objective/entropy": 102.45246887207031, + "objective/kl": 1.738468885421753, + "objective/non_score_reward": -0.08692345023155212, + "objective/rlhf_reward": -0.5215406883507967, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.634500503540039, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.453125, + "step": 22, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018749237060547 + }, + { + "episode": 576, + "epoch": 0.001150375669554589, + "loss/policy_avg": 0.06928712874650955, + "lr": 2.993385736196319e-06, + "objective/entropy": 80.66422271728516, + "objective/kl": 1.6413224935531616, + "objective/non_score_reward": -0.0820661410689354, + "objective/rlhf_reward": -0.4923968277871609, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6620593070983887, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4013671875, + "step": 23, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0036168098449707 + }, + { + "episode": 600, + "epoch": 0.0011983079891193635, + "loss/policy_avg": -0.03531235456466675, + "lr": 2.9930981595092023e-06, + "objective/entropy": 111.01676940917969, + "objective/kl": 1.518481731414795, + "objective/non_score_reward": -0.0759240984916687, + "objective/rlhf_reward": -0.4555445574223995, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7691116333007812, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4677734375, + "step": 24, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001146078109741 + }, + { + "episode": 624, + "epoch": 0.001246240308684138, + "loss/policy_avg": 0.08211535960435867, + "lr": 2.9928105828220857e-06, + "objective/entropy": 129.5863037109375, + "objective/kl": -0.12661974132061005, + "objective/non_score_reward": 0.006330984644591808, + "objective/rlhf_reward": 0.03798590041697025, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9213603734970093, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.56640625, + "step": 25, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0044949054718018 + }, + { + "episode": 648, + "epoch": 0.0012941726282489124, + "loss/policy_avg": 0.030280165374279022, + "lr": 2.992523006134969e-06, + "objective/entropy": 86.26570892333984, + "objective/kl": 2.4319276809692383, + "objective/non_score_reward": -0.12159638106822968, + "objective/rlhf_reward": -0.7295782342553139, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.084470272064209, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3916015625, + "step": 26, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980859756469727 + }, + { + "episode": 672, + "epoch": 0.0013421049478136871, + "loss/policy_avg": 0.011679768562316895, + "lr": 2.9922354294478526e-06, + "objective/entropy": 110.69743347167969, + "objective/kl": 1.8585236072540283, + "objective/non_score_reward": -0.09292618930339813, + "objective/rlhf_reward": 2.026502242422292, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.7785848379135132, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.478515625, + "step": 27, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011019706726074 + }, + { + "episode": 696, + "epoch": 0.0013900372673784616, + "loss/policy_avg": 0.07599033415317535, + "lr": 2.9919478527607364e-06, + "objective/entropy": 77.32418060302734, + "objective/kl": 2.5337939262390137, + "objective/non_score_reward": -0.12668970227241516, + "objective/rlhf_reward": 1.239861786365509, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.775485038757324, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.43359375, + "step": 28, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960627555847168 + }, + { + "episode": 720, + "epoch": 0.001437969586943236, + "loss/policy_avg": 0.04309334233403206, + "lr": 2.99166027607362e-06, + "objective/entropy": 113.55000305175781, + "objective/kl": -0.30347657203674316, + "objective/non_score_reward": 0.015173825435340405, + "objective/rlhf_reward": 0.09104295447468758, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3768930435180664, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.47265625, + "step": 29, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002551078796387 + }, + { + "episode": 744, + "epoch": 0.0014859019065080108, + "loss/policy_avg": -0.0015950440429151058, + "lr": 2.9913726993865033e-06, + "objective/entropy": 87.54519653320312, + "objective/kl": 1.4883391857147217, + "objective/non_score_reward": -0.07441696524620056, + "objective/rlhf_reward": -0.44650180265307426, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6208293437957764, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4169921875, + "step": 30, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994879961013794 + }, + { + "episode": 768, + "epoch": 0.0015338342260727853, + "loss/policy_avg": -0.0030770781449973583, + "lr": 2.9910851226993867e-06, + "objective/entropy": 80.45647430419922, + "objective/kl": 2.122265577316284, + "objective/non_score_reward": -0.10611327737569809, + "objective/rlhf_reward": -0.6366796642541885, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.708068370819092, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3857421875, + "step": 31, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001967668533325 + }, + { + "episode": 792, + "epoch": 0.0015817665456375597, + "loss/policy_avg": 0.06681559979915619, + "lr": 2.99079754601227e-06, + "objective/entropy": 97.40707397460938, + "objective/kl": 2.401008367538452, + "objective/non_score_reward": -0.12005043029785156, + "objective/rlhf_reward": -0.7203025612980127, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.73689603805542, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.423828125, + "step": 32, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9948208332061768 + }, + { + "episode": 816, + "epoch": 0.0016296988652023342, + "loss/policy_avg": -0.001742723397910595, + "lr": 2.9905099693251536e-06, + "objective/entropy": 106.9861068725586, + "objective/kl": 2.0796632766723633, + "objective/non_score_reward": -0.10398316383361816, + "objective/rlhf_reward": 5.376101037487388, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.6362433433532715, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.453125, + "step": 33, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998645782470703 + }, + { + "episode": 840, + "epoch": 0.001677631184767109, + "loss/policy_avg": 0.06996232271194458, + "lr": 2.990222392638037e-06, + "objective/entropy": 132.39630126953125, + "objective/kl": 1.8880865573883057, + "objective/non_score_reward": -0.09440433979034424, + "objective/rlhf_reward": -0.5664259977638721, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.8089118003845215, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.583984375, + "step": 34, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000971555709839 + }, + { + "episode": 864, + "epoch": 0.0017255635043318834, + "loss/policy_avg": -0.03146294876933098, + "lr": 2.9899348159509204e-06, + "objective/entropy": 86.75841522216797, + "objective/kl": 1.7685987949371338, + "objective/non_score_reward": -0.08842994272708893, + "objective/rlhf_reward": -0.5305796042084694, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.046278476715088, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3994140625, + "step": 35, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001410484313965 + }, + { + "episode": 888, + "epoch": 0.0017734958238966579, + "loss/policy_avg": 0.14591673016548157, + "lr": 2.989647239263804e-06, + "objective/entropy": 101.65887451171875, + "objective/kl": 0.947630763053894, + "objective/non_score_reward": -0.047381531447172165, + "objective/rlhf_reward": -0.284289188683033, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.750121831893921, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.474609375, + "step": 36, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0045502185821533 + }, + { + "episode": 912, + "epoch": 0.0018214281434614326, + "loss/policy_avg": 0.029154781252145767, + "lr": 2.9893596625766873e-06, + "objective/entropy": 83.12118530273438, + "objective/kl": 1.340437889099121, + "objective/non_score_reward": -0.06702189892530441, + "objective/rlhf_reward": 5.597868639975786, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.6840639114379883, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.373046875, + "step": 37, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001082181930542 + }, + { + "episode": 936, + "epoch": 0.001869360463026207, + "loss/policy_avg": -0.01927729696035385, + "lr": 2.9890720858895707e-06, + "objective/entropy": 99.88578796386719, + "objective/kl": 2.280059814453125, + "objective/non_score_reward": -0.11400298774242401, + "objective/rlhf_reward": 1.208771401315054, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.2143168449401855, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4443359375, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025439262390137 + }, + { + "episode": 960, + "epoch": 0.0019172927825909815, + "loss/policy_avg": 0.03183969110250473, + "lr": 2.988784509202454e-06, + "objective/entropy": 101.01339721679688, + "objective/kl": 2.3351173400878906, + "objective/non_score_reward": -0.1167558878660202, + "objective/rlhf_reward": 1.2994647435843945, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.214784622192383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44140625, + "step": 39, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975650310516357 + }, + { + "episode": 984, + "epoch": 0.001965225102155756, + "loss/policy_avg": 0.09327712655067444, + "lr": 2.9884969325153375e-06, + "objective/entropy": 109.26126098632812, + "objective/kl": 3.5423667430877686, + "objective/non_score_reward": -0.17711836099624634, + "objective/rlhf_reward": -1.062710128724575, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.617654800415039, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4814453125, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981582164764404 + }, + { + "episode": 1008, + "epoch": 0.0020131574217205307, + "loss/policy_avg": 0.008852366358041763, + "lr": 2.988209355828221e-06, + "objective/entropy": 130.198486328125, + "objective/kl": 0.7239395976066589, + "objective/non_score_reward": -0.036196980625391006, + "objective/rlhf_reward": 1.78281812928617, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.6649408340454102, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0044517517089844 + }, + { + "episode": 1032, + "epoch": 0.002061089741285305, + "loss/policy_avg": -0.014182470738887787, + "lr": 2.9879217791411044e-06, + "objective/entropy": 126.1592025756836, + "objective/kl": 1.4269241094589233, + "objective/non_score_reward": -0.07134620100259781, + "objective/rlhf_reward": -0.4280772153288126, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5852861404418945, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.533203125, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0034825801849365 + }, + { + "episode": 1056, + "epoch": 0.0021090220608500796, + "loss/policy_avg": 0.045917633920907974, + "lr": 2.987634202453988e-06, + "objective/entropy": 110.22311401367188, + "objective/kl": 2.330240488052368, + "objective/non_score_reward": -0.11651202291250229, + "objective/rlhf_reward": -0.6990721449255943, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.5963006019592285, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4580078125, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988900423049927 + }, + { + "episode": 1080, + "epoch": 0.002156954380414854, + "loss/policy_avg": -0.00011269142851233482, + "lr": 2.9873466257668712e-06, + "objective/entropy": 113.4417724609375, + "objective/kl": 2.0038790702819824, + "objective/non_score_reward": -0.10019394010305405, + "objective/rlhf_reward": -0.6011636406183243, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.943531036376953, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4658203125, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018107891082764 + }, + { + "episode": 1104, + "epoch": 0.0022048866999796286, + "loss/policy_avg": 0.1533784121274948, + "lr": 2.9870590490797547e-06, + "objective/entropy": 115.51705169677734, + "objective/kl": 3.176539897918701, + "objective/non_score_reward": -0.15882699191570282, + "objective/rlhf_reward": 1.3681549366165164, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.0104780197143555, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4951171875, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003603935241699 + }, + { + "episode": 1128, + "epoch": 0.0022528190195444035, + "loss/policy_avg": 0.03211810439825058, + "lr": 2.986771472392638e-06, + "objective/entropy": 97.28558349609375, + "objective/kl": 2.0158324241638184, + "objective/non_score_reward": -0.10079163312911987, + "objective/rlhf_reward": -0.6047497857362032, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.161699295043945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 46, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967930316925049 + }, + { + "episode": 1152, + "epoch": 0.002300751339109178, + "loss/policy_avg": -0.007760944776237011, + "lr": 2.9864838957055215e-06, + "objective/entropy": 104.68029022216797, + "objective/kl": 1.9852409362792969, + "objective/non_score_reward": -0.0992620438337326, + "objective/rlhf_reward": -0.5955722518265247, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6401946544647217, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.505859375, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021700859069824 + }, + { + "episode": 1176, + "epoch": 0.0023486836586739525, + "loss/policy_avg": 0.014630027115345001, + "lr": 2.986196319018405e-06, + "objective/entropy": 92.4415283203125, + "objective/kl": 1.856212854385376, + "objective/non_score_reward": -0.09281064569950104, + "objective/rlhf_reward": 2.027195485419223, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.8408725261688232, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.462890625, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000603199005127 + }, + { + "episode": 1200, + "epoch": 0.002396615978238727, + "loss/policy_avg": -0.007463025860488415, + "lr": 2.9859087423312884e-06, + "objective/entropy": 135.1586151123047, + "objective/kl": 1.350188970565796, + "objective/non_score_reward": -0.06750945746898651, + "objective/rlhf_reward": -0.4050567075610161, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3395063877105713, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.6171875, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006229877471924 + }, + { + "episode": 1224, + "epoch": 0.0024445482978035014, + "loss/policy_avg": -0.0012800309341400862, + "lr": 2.985621165644172e-06, + "objective/entropy": 124.57469177246094, + "objective/kl": 1.3836421966552734, + "objective/non_score_reward": -0.06918211281299591, + "objective/rlhf_reward": 2.584907352924347, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.105938911437988, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.544921875, + "step": 50, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999441146850586 + }, + { + "episode": 1248, + "epoch": 0.002492480617368276, + "loss/policy_avg": 0.03625974431633949, + "lr": 2.9853335889570556e-06, + "objective/entropy": 103.56734466552734, + "objective/kl": 1.2284643650054932, + "objective/non_score_reward": -0.061423223465681076, + "objective/rlhf_reward": -0.36853931099176407, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7215933799743652, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4677734375, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010008811950684 + }, + { + "episode": 1272, + "epoch": 0.0025404129369330504, + "loss/policy_avg": 0.03962823003530502, + "lr": 2.9850460122699387e-06, + "objective/entropy": 111.87564086914062, + "objective/kl": 2.174031972885132, + "objective/non_score_reward": -0.10870160907506943, + "objective/rlhf_reward": -0.652209646999836, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7371536493301392, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0022969245910645 + }, + { + "episode": 1296, + "epoch": 0.002588345256497825, + "loss/policy_avg": 0.006499301642179489, + "lr": 2.984758435582822e-06, + "objective/entropy": 81.53741455078125, + "objective/kl": 4.5559587478637695, + "objective/non_score_reward": -0.2277979552745819, + "objective/rlhf_reward": 0.367601315666803, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 8.04394817352295, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.365234375, + "step": 53, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973303079605103 + }, + { + "episode": 1320, + "epoch": 0.0026362775760625998, + "loss/policy_avg": 0.024123912677168846, + "lr": 2.9844708588957055e-06, + "objective/entropy": 112.91667938232422, + "objective/kl": 0.8611132502555847, + "objective/non_score_reward": -0.043055661022663116, + "objective/rlhf_reward": -0.2583339363336563, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.597766399383545, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4765625, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000613212585449 + }, + { + "episode": 1344, + "epoch": 0.0026842098956273742, + "loss/policy_avg": -0.021352242678403854, + "lr": 2.984183282208589e-06, + "objective/entropy": 91.80636596679688, + "objective/kl": 2.2884745597839355, + "objective/non_score_reward": -0.1144237369298935, + "objective/rlhf_reward": -0.6865423545241356, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7493300437927246, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.435546875, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999685287475586 + }, + { + "episode": 1368, + "epoch": 0.0027321422151921487, + "loss/policy_avg": -0.015930399298667908, + "lr": 2.9838957055214724e-06, + "objective/entropy": 127.53816986083984, + "objective/kl": 1.0015300512313843, + "objective/non_score_reward": -0.05007650703191757, + "objective/rlhf_reward": 1.4339299380675636, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.3522987365722656, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5625, + "step": 56, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001187324523926 + }, + { + "episode": 1392, + "epoch": 0.002780074534756923, + "loss/policy_avg": 0.0818919986486435, + "lr": 2.9836081288343558e-06, + "objective/entropy": 105.96234130859375, + "objective/kl": 2.0821733474731445, + "objective/non_score_reward": -0.10410867631435394, + "objective/rlhf_reward": 5.37534798309207, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.8340463638305664, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.50390625, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0005693435668945 + }, + { + "episode": 1416, + "epoch": 0.0028280068543216977, + "loss/policy_avg": 0.00405135378241539, + "lr": 2.983320552147239e-06, + "objective/entropy": 101.6596908569336, + "objective/kl": 2.182565212249756, + "objective/non_score_reward": -0.10912825912237167, + "objective/rlhf_reward": -0.6547695100307465, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.017915725708008, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.447265625, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000117063522339 + }, + { + "episode": 1440, + "epoch": 0.002875939173886472, + "loss/policy_avg": 0.03558550775051117, + "lr": 2.9830329754601226e-06, + "objective/entropy": 89.91297912597656, + "objective/kl": 2.4323856830596924, + "objective/non_score_reward": -0.12161927670240402, + "objective/rlhf_reward": -0.7297156751155853, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1262941360473633, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.40234375, + "step": 59, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990912675857544 + }, + { + "episode": 1464, + "epoch": 0.0029238714934512466, + "loss/policy_avg": -0.026221033185720444, + "lr": 2.982745398773006e-06, + "objective/entropy": 89.79360961914062, + "objective/kl": 1.544703722000122, + "objective/non_score_reward": -0.07723518460988998, + "objective/rlhf_reward": -0.46341110207140446, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.772399663925171, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.39453125, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0020394325256348 + }, + { + "episode": 1488, + "epoch": 0.0029718038130160216, + "loss/policy_avg": -0.002859845757484436, + "lr": 2.9824578220858895e-06, + "objective/entropy": 101.88462829589844, + "objective/kl": 2.8430614471435547, + "objective/non_score_reward": -0.1421530842781067, + "objective/rlhf_reward": -0.8529184609651566, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.381649971008301, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.443359375, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00014591217041 + }, + { + "episode": 1512, + "epoch": 0.003019736132580796, + "loss/policy_avg": 0.037695880979299545, + "lr": 2.9821702453987733e-06, + "objective/entropy": 122.28804016113281, + "objective/kl": 1.2046512365341187, + "objective/non_score_reward": -0.06023256108164787, + "objective/rlhf_reward": -0.36139537021517754, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.061966896057129, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.6640625, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998297929763794 + }, + { + "episode": 1536, + "epoch": 0.0030676684521455705, + "loss/policy_avg": 0.01118068303912878, + "lr": 2.9818826687116568e-06, + "objective/entropy": 75.64812469482422, + "objective/kl": 2.9311397075653076, + "objective/non_score_reward": -0.14655698835849762, + "objective/rlhf_reward": -0.8793419227004051, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.1292877197265625, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3486328125, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968452453613281 + }, + { + "episode": 1560, + "epoch": 0.003115600771710345, + "loss/policy_avg": 0.027278564870357513, + "lr": 2.98159509202454e-06, + "objective/entropy": 82.08084869384766, + "objective/kl": 1.2625969648361206, + "objective/non_score_reward": -0.06312984228134155, + "objective/rlhf_reward": 2.2052802910270186, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.924602508544922, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44140625, + "step": 64, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000145673751831 + }, + { + "episode": 1584, + "epoch": 0.0031635330912751195, + "loss/policy_avg": -0.016411226242780685, + "lr": 2.9813075153374236e-06, + "objective/entropy": 94.44204711914062, + "objective/kl": 2.554050922393799, + "objective/non_score_reward": -0.127702534198761, + "objective/rlhf_reward": 1.3710279249061479, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.9047609567642212, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4140625, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002592086791992 + }, + { + "episode": 1608, + "epoch": 0.003211465410839894, + "loss/policy_avg": 0.001919352449476719, + "lr": 2.981019938650307e-06, + "objective/entropy": 103.5399398803711, + "objective/kl": 0.8462371826171875, + "objective/non_score_reward": -0.042311858385801315, + "objective/rlhf_reward": -0.2538711577653885, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3145625591278076, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4462890625, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999384880065918 + }, + { + "episode": 1632, + "epoch": 0.0032593977304046684, + "loss/policy_avg": 0.010066835209727287, + "lr": 2.98073236196319e-06, + "objective/entropy": 77.00733947753906, + "objective/kl": 2.2377119064331055, + "objective/non_score_reward": -0.11188560724258423, + "objective/rlhf_reward": -0.671313613653183, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.251017451286316, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40234375, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001082181930542 + }, + { + "episode": 1656, + "epoch": 0.0033073300499694433, + "loss/policy_avg": -0.01637902483344078, + "lr": 2.9804447852760735e-06, + "objective/entropy": 84.3310546875, + "objective/kl": 2.1696829795837402, + "objective/non_score_reward": -0.10848414152860641, + "objective/rlhf_reward": 5.349095143377781, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.275608777999878, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3955078125, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024964809417725 + }, + { + "episode": 1680, + "epoch": 0.003355262369534218, + "loss/policy_avg": 0.020744197070598602, + "lr": 2.980157208588957e-06, + "objective/entropy": 102.61216735839844, + "objective/kl": 2.5076379776000977, + "objective/non_score_reward": -0.12538188695907593, + "objective/rlhf_reward": -0.7522913031280041, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9914926290512085, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4521484375, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001898765563965 + }, + { + "episode": 1704, + "epoch": 0.0034031946890989923, + "loss/policy_avg": 0.053751811385154724, + "lr": 2.9798696319018403e-06, + "objective/entropy": 136.97921752929688, + "objective/kl": 1.779276728630066, + "objective/non_score_reward": -0.0889638289809227, + "objective/rlhf_reward": 2.466217018663883, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.333052635192871, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.6171875, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000499725341797 + }, + { + "episode": 1728, + "epoch": 0.0034511270086637668, + "loss/policy_avg": 0.015965720638632774, + "lr": 2.9795820552147237e-06, + "objective/entropy": 109.212890625, + "objective/kl": 3.335113048553467, + "objective/non_score_reward": -0.16675564646720886, + "objective/rlhf_reward": 2.785044694779556, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.5428056716918945, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4765625, + "step": 71, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998443126678467 + }, + { + "episode": 1752, + "epoch": 0.0034990593282285412, + "loss/policy_avg": 0.040991850197315216, + "lr": 2.9792944785276076e-06, + "objective/entropy": 123.80986785888672, + "objective/kl": 3.368000030517578, + "objective/non_score_reward": -0.16840001940727234, + "objective/rlhf_reward": -1.0103999972343445, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.042376518249512, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5234375, + "step": 72, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967386722564697 + }, + { + "episode": 1776, + "epoch": 0.0035469916477933157, + "loss/policy_avg": -0.05000199377536774, + "lr": 2.979006901840491e-06, + "objective/entropy": 104.7856674194336, + "objective/kl": 3.610349655151367, + "objective/non_score_reward": -0.18051749467849731, + "objective/rlhf_reward": -1.0831049289554358, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.466495990753174, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.4912109375, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983489513397217 + }, + { + "episode": 1800, + "epoch": 0.00359492396735809, + "loss/policy_avg": 0.02436569705605507, + "lr": 2.9787193251533744e-06, + "objective/entropy": 119.0743637084961, + "objective/kl": 0.3175499141216278, + "objective/non_score_reward": -0.015877505764365196, + "objective/rlhf_reward": 2.4887943231673932, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.799847602844238, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.505859375, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998891353607178 + }, + { + "episode": 1824, + "epoch": 0.003642856286922865, + "loss/policy_avg": 0.01721281185746193, + "lr": 2.978431748466258e-06, + "objective/entropy": 107.76780700683594, + "objective/kl": 1.2089662551879883, + "objective/non_score_reward": -0.060448311269283295, + "objective/rlhf_reward": 1.5300994154504146, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.245328187942505, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.48828125, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977939128875732 + }, + { + "episode": 1848, + "epoch": 0.0036907886064876396, + "loss/policy_avg": 0.08342660963535309, + "lr": 2.9781441717791413e-06, + "objective/entropy": 105.63685607910156, + "objective/kl": 2.596284866333008, + "objective/non_score_reward": -0.1298142373561859, + "objective/rlhf_reward": -0.7788853570818901, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.291785717010498, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.427734375, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985402822494507 + }, + { + "episode": 1872, + "epoch": 0.003738720926052414, + "loss/policy_avg": 0.007005157880485058, + "lr": 2.9778565950920247e-06, + "objective/entropy": 95.5741195678711, + "objective/kl": 2.6469011306762695, + "objective/non_score_reward": -0.1323450803756714, + "objective/rlhf_reward": 5.20592962577939, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.797138214111328, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.453125, + "step": 77, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972796440124512 + }, + { + "episode": 1896, + "epoch": 0.0037866532456171885, + "loss/policy_avg": 0.009619046002626419, + "lr": 2.977569018404908e-06, + "objective/entropy": 116.42654418945312, + "objective/kl": 2.789475202560425, + "objective/non_score_reward": -0.13947376608848572, + "objective/rlhf_reward": 1.4842742804039482, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.990802526473999, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 78, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000570297241211 + }, + { + "episode": 1920, + "epoch": 0.003834585565181963, + "loss/policy_avg": 0.11157067865133286, + "lr": 2.9772814417177916e-06, + "objective/entropy": 97.83265686035156, + "objective/kl": 3.069126844406128, + "objective/non_score_reward": -0.15345636010169983, + "objective/rlhf_reward": -0.9207380786538124, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.101073145866394, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.43359375, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0020620822906494 + }, + { + "episode": 1944, + "epoch": 0.0038825178847467375, + "loss/policy_avg": 0.04139215499162674, + "lr": 2.976993865030675e-06, + "objective/entropy": 120.03080749511719, + "objective/kl": 1.5484662055969238, + "objective/non_score_reward": -0.07742331922054291, + "objective/rlhf_reward": -0.4645399060100317, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9850070476531982, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986765384674072 + }, + { + "episode": 1968, + "epoch": 0.003930450204311512, + "loss/policy_avg": 0.05352923646569252, + "lr": 2.9767062883435584e-06, + "objective/entropy": 95.07389831542969, + "objective/kl": 1.793378233909607, + "objective/non_score_reward": -0.08966891467571259, + "objective/rlhf_reward": -0.5380134172737598, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5347137451171875, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4091796875, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014615058898926 + }, + { + "episode": 1992, + "epoch": 0.003978382523876287, + "loss/policy_avg": 0.013508323580026627, + "lr": 2.9764187116564414e-06, + "objective/entropy": 94.24886322021484, + "objective/kl": 3.0578458309173584, + "objective/non_score_reward": -0.1528923064470291, + "objective/rlhf_reward": -0.9173537567257881, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.590622901916504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4208984375, + "step": 82, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997955322265625 + }, + { + "episode": 2016, + "epoch": 0.004026314843441061, + "loss/policy_avg": -0.02853899821639061, + "lr": 2.9761311349693253e-06, + "objective/entropy": 106.18245697021484, + "objective/kl": 3.122627019882202, + "objective/non_score_reward": -0.15613135695457458, + "objective/rlhf_reward": -0.9367880821228027, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.230337142944336, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4326171875, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997738599777222 + }, + { + "episode": 2040, + "epoch": 0.004074247163005836, + "loss/policy_avg": 0.023475507274270058, + "lr": 2.9758435582822087e-06, + "objective/entropy": 106.70518493652344, + "objective/kl": 3.7937560081481934, + "objective/non_score_reward": -0.1896878182888031, + "objective/rlhf_reward": 0.8618731498718261, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.81584095954895, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.470703125, + "step": 84, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9960724115371704 + }, + { + "episode": 2064, + "epoch": 0.00412217948257061, + "loss/policy_avg": 0.0033472366631031036, + "lr": 2.975555981595092e-06, + "objective/entropy": 78.1265869140625, + "objective/kl": 2.3197784423828125, + "objective/non_score_reward": -0.11598893254995346, + "objective/rlhf_reward": -0.6959335878491402, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9514119625091553, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.431640625, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014963150024414 + }, + { + "episode": 2088, + "epoch": 0.004170111802135385, + "loss/policy_avg": 0.042981959879398346, + "lr": 2.9752684049079756e-06, + "objective/entropy": 92.20894622802734, + "objective/kl": 2.407242774963379, + "objective/non_score_reward": -0.12036213278770447, + "objective/rlhf_reward": -0.7221727818250656, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.617156505584717, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.400390625, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964863061904907 + }, + { + "episode": 2112, + "epoch": 0.004218044121700159, + "loss/policy_avg": 0.03690001741051674, + "lr": 2.974980828220859e-06, + "objective/entropy": 92.78817749023438, + "objective/kl": 0.4987431466579437, + "objective/non_score_reward": -0.024937158450484276, + "objective/rlhf_reward": 1.987620183121098, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.091235637664795, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.380859375, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003743171691895 + }, + { + "episode": 2136, + "epoch": 0.004265976441264934, + "loss/policy_avg": 0.08167079091072083, + "lr": 2.9746932515337424e-06, + "objective/entropy": 87.13999938964844, + "objective/kl": 2.3516900539398193, + "objective/non_score_reward": -0.11758449673652649, + "objective/rlhf_reward": 1.431736101250781, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.2190849781036377, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.42578125, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030479431152344 + }, + { + "episode": 2160, + "epoch": 0.004313908760829708, + "loss/policy_avg": 0.03783843666315079, + "lr": 2.974405674846626e-06, + "objective/entropy": 127.44854736328125, + "objective/kl": 2.025526523590088, + "objective/non_score_reward": -0.10127630829811096, + "objective/rlhf_reward": -0.6076578870415688, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1914784908294678, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5703125, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977067708969116 + }, + { + "episode": 2184, + "epoch": 0.004361841080394483, + "loss/policy_avg": -0.00790494680404663, + "lr": 2.9741180981595093e-06, + "objective/entropy": 91.36763763427734, + "objective/kl": 0.832406222820282, + "objective/non_score_reward": -0.041620321571826935, + "objective/rlhf_reward": 1.6430673452532496, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.4380722045898438, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4140625, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013461112976074 + }, + { + "episode": 2208, + "epoch": 0.004409773399959257, + "loss/policy_avg": 0.04621221870183945, + "lr": 2.9738305214723927e-06, + "objective/entropy": 118.92340087890625, + "objective/kl": 2.0420455932617188, + "objective/non_score_reward": -0.10210227966308594, + "objective/rlhf_reward": -0.612613670527935, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9362707138061523, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5078125, + "step": 91, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000077724456787 + }, + { + "episode": 2232, + "epoch": 0.004457705719524032, + "loss/policy_avg": 0.029883865267038345, + "lr": 2.973542944785276e-06, + "objective/entropy": 105.30355072021484, + "objective/kl": 0.4042707085609436, + "objective/non_score_reward": -0.020213529467582703, + "objective/rlhf_reward": -0.12128117680549622, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.728107452392578, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46484375, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012707710266113 + }, + { + "episode": 2256, + "epoch": 0.004505638039088807, + "loss/policy_avg": -0.008897566236555576, + "lr": 2.9732553680981595e-06, + "objective/entropy": 91.919677734375, + "objective/kl": 1.430572271347046, + "objective/non_score_reward": -0.0715285986661911, + "objective/rlhf_reward": -0.4291716106235981, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.643986701965332, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.392578125, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976251125335693 + }, + { + "episode": 2280, + "epoch": 0.0045535703586535815, + "loss/policy_avg": 0.020692484453320503, + "lr": 2.972967791411043e-06, + "objective/entropy": 64.9782943725586, + "objective/kl": 1.972738265991211, + "objective/non_score_reward": -0.0986369177699089, + "objective/rlhf_reward": -0.5918214991688728, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.020811080932617, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3623046875, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995330572128296 + }, + { + "episode": 2304, + "epoch": 0.004601502678218356, + "loss/policy_avg": 0.008337275125086308, + "lr": 2.9726802147239264e-06, + "objective/entropy": 85.09711456298828, + "objective/kl": 2.6009159088134766, + "objective/non_score_reward": -0.1300458014011383, + "objective/rlhf_reward": 1.5408420871544841, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.1648292541503906, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.404296875, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000575304031372 + }, + { + "episode": 2328, + "epoch": 0.0046494349977831305, + "loss/policy_avg": -0.004021456465125084, + "lr": 2.9723926380368102e-06, + "objective/entropy": 86.31969451904297, + "objective/kl": 3.004448413848877, + "objective/non_score_reward": -0.15022242069244385, + "objective/rlhf_reward": 1.4197823565054897, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.9986915588378906, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.3798828125, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019335746765137 + }, + { + "episode": 2352, + "epoch": 0.004697367317347905, + "loss/policy_avg": 0.01552601158618927, + "lr": 2.9721050613496932e-06, + "objective/entropy": 81.71778106689453, + "objective/kl": 1.586479902267456, + "objective/non_score_reward": -0.07932399213314056, + "objective/rlhf_reward": -0.4759439080953598, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3105335235595703, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 97, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003232479095459 + }, + { + "episode": 2376, + "epoch": 0.004745299636912679, + "loss/policy_avg": 0.5144245624542236, + "lr": 2.9718174846625767e-06, + "objective/entropy": 116.70284271240234, + "objective/kl": 3.161834716796875, + "objective/non_score_reward": -0.1580917239189148, + "objective/rlhf_reward": 2.837028148112934, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.081441879272461, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4990234375, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003156661987305 + }, + { + "episode": 2400, + "epoch": 0.004793231956477454, + "loss/policy_avg": -0.010393455624580383, + "lr": 2.97152990797546e-06, + "objective/entropy": 73.36244201660156, + "objective/kl": 3.1763648986816406, + "objective/non_score_reward": -0.15881824493408203, + "objective/rlhf_reward": 2.832669074175995, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.9224071502685547, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4443359375, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003247261047363 + }, + { + "episode": 2424, + "epoch": 0.004841164276042228, + "loss/policy_avg": 0.015790797770023346, + "lr": 2.9712423312883435e-06, + "objective/entropy": 74.77686309814453, + "objective/kl": 1.7198359966278076, + "objective/non_score_reward": -0.08599179983139038, + "objective/rlhf_reward": -0.5159508343786001, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.674623727798462, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3447265625, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000511646270752 + }, + { + "episode": 2448, + "epoch": 0.004889096595607003, + "loss/policy_avg": 0.11094634979963303, + "lr": 2.970954754601227e-06, + "objective/entropy": 89.0152816772461, + "objective/kl": 3.2250144481658936, + "objective/non_score_reward": -0.16125072538852692, + "objective/rlhf_reward": 0.9252849232843723, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.034808158874512, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.400390625, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984198808670044 + }, + { + "episode": 2472, + "epoch": 0.004937028915171777, + "loss/policy_avg": 0.04626832902431488, + "lr": 2.9706671779141104e-06, + "objective/entropy": 92.89080810546875, + "objective/kl": 3.2350356578826904, + "objective/non_score_reward": -0.16175177693367004, + "objective/rlhf_reward": -0.9705106616020203, + "objective/scores": 0.0, + "policy/approxkl_avg": 15.815196990966797, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.427734375, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99477219581604 + }, + { + "episode": 2496, + "epoch": 0.004984961234736552, + "loss/policy_avg": 0.039788223803043365, + "lr": 2.970379601226994e-06, + "objective/entropy": 96.42967987060547, + "objective/kl": 3.4554505348205566, + "objective/non_score_reward": -0.17277252674102783, + "objective/rlhf_reward": -1.0366351641714573, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5951766967773438, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41796875, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008797645568848 + }, + { + "episode": 2520, + "epoch": 0.005032893554301326, + "loss/policy_avg": -0.02835722267627716, + "lr": 2.9700920245398772e-06, + "objective/entropy": 89.68648529052734, + "objective/kl": 0.20460037887096405, + "objective/non_score_reward": -0.01023002527654171, + "objective/rlhf_reward": 2.25973669919858, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.5472068786621094, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3994140625, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001133680343628 + }, + { + "episode": 2544, + "epoch": 0.005080825873866101, + "loss/policy_avg": 0.011577663943171501, + "lr": 2.9698044478527607e-06, + "objective/entropy": 104.98710632324219, + "objective/kl": 2.2366867065429688, + "objective/non_score_reward": -0.1118343323469162, + "objective/rlhf_reward": -0.6710060108453035, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.71445631980896, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.44921875, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987014532089233 + }, + { + "episode": 2568, + "epoch": 0.005128758193430875, + "loss/policy_avg": -0.01193756889551878, + "lr": 2.9695168711656445e-06, + "objective/entropy": 151.76824951171875, + "objective/kl": 2.7941370010375977, + "objective/non_score_reward": -0.13970685005187988, + "objective/rlhf_reward": -0.8382411040365696, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1941494941711426, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.6796875, + "step": 106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998120069503784 + }, + { + "episode": 2592, + "epoch": 0.00517669051299565, + "loss/policy_avg": 0.04677446559071541, + "lr": 2.969229294478528e-06, + "objective/entropy": 99.01734924316406, + "objective/kl": 2.224540948867798, + "objective/non_score_reward": -0.11122703552246094, + "objective/rlhf_reward": -0.6673622205853462, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.9934004545211792, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.564453125, + "step": 107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002680778503418 + }, + { + "episode": 2616, + "epoch": 0.005224622832560425, + "loss/policy_avg": 0.05939783155918121, + "lr": 2.9689417177914114e-06, + "objective/entropy": 110.12539672851562, + "objective/kl": 2.120410442352295, + "objective/non_score_reward": -0.10602051019668579, + "objective/rlhf_reward": -0.6361230779439211, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8223364353179932, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.615234375, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0003457069396973 + }, + { + "episode": 2640, + "epoch": 0.0052725551521251995, + "loss/policy_avg": 0.010696610435843468, + "lr": 2.9686541411042948e-06, + "objective/entropy": 75.35289764404297, + "objective/kl": 4.2061381340026855, + "objective/non_score_reward": -0.21030691266059875, + "objective/rlhf_reward": 2.5237371199709586, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.109445095062256, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4267578125, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9952830076217651 + }, + { + "episode": 2664, + "epoch": 0.005320487471689974, + "loss/policy_avg": 0.06283558905124664, + "lr": 2.968366564417178e-06, + "objective/entropy": 85.45372009277344, + "objective/kl": 2.91688871383667, + "objective/non_score_reward": -0.1458444446325302, + "objective/rlhf_reward": 0.9311133359910283, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.6572680473327637, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4150390625, + "step": 110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994902610778809 + }, + { + "episode": 2688, + "epoch": 0.0053684197912547485, + "loss/policy_avg": 0.057504087686538696, + "lr": 2.9680789877300616e-06, + "objective/entropy": 109.15007019042969, + "objective/kl": 2.3043277263641357, + "objective/non_score_reward": -0.11521639674901962, + "objective/rlhf_reward": -0.6912983246147633, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3089663982391357, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.45703125, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977518320083618 + }, + { + "episode": 2712, + "epoch": 0.005416352110819523, + "loss/policy_avg": 0.028755519539117813, + "lr": 2.9677914110429446e-06, + "objective/entropy": 83.31957244873047, + "objective/kl": 2.9333150386810303, + "objective/non_score_reward": -0.1466657519340515, + "objective/rlhf_reward": 1.704064784681985, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.097446918487549, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3779296875, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999340295791626 + }, + { + "episode": 2736, + "epoch": 0.0054642844303842975, + "loss/policy_avg": -0.004807117860764265, + "lr": 2.967503834355828e-06, + "objective/entropy": 108.69938659667969, + "objective/kl": 2.6207327842712402, + "objective/non_score_reward": -0.131036639213562, + "objective/rlhf_reward": 1.019960164779547, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.50050163269043, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5234375, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996802806854248 + }, + { + "episode": 2760, + "epoch": 0.005512216749949072, + "loss/policy_avg": 0.023727577179670334, + "lr": 2.9672162576687115e-06, + "objective/entropy": 92.37568664550781, + "objective/kl": 3.591444730758667, + "objective/non_score_reward": -0.1795722246170044, + "objective/rlhf_reward": 1.5066259672107192, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.471770286560059, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.396484375, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001786708831787 + }, + { + "episode": 2784, + "epoch": 0.005560149069513846, + "loss/policy_avg": -0.0030838537495583296, + "lr": 2.966928680981595e-06, + "objective/entropy": 132.52212524414062, + "objective/kl": 1.8538861274719238, + "objective/non_score_reward": -0.09269431233406067, + "objective/rlhf_reward": -0.5561658591032028, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.6720802783966064, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62109375, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001795768737793 + }, + { + "episode": 2808, + "epoch": 0.005608081389078621, + "loss/policy_avg": 0.041441939771175385, + "lr": 2.9666411042944783e-06, + "objective/entropy": 66.52481079101562, + "objective/kl": 1.795864462852478, + "objective/non_score_reward": -0.08979322761297226, + "objective/rlhf_reward": 5.461240652948618, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.1926597356796265, + "policy/clipfrac_avg": 0.1666666716337204, + "policy/entropy_avg": 0.328125, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014500617980957 + }, + { + "episode": 2832, + "epoch": 0.005656013708643395, + "loss/policy_avg": 0.035618532449007034, + "lr": 2.966353527607362e-06, + "objective/entropy": 82.55915832519531, + "objective/kl": 2.4185874462127686, + "objective/non_score_reward": -0.12092937529087067, + "objective/rlhf_reward": 1.8584831413986183, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 5.106599807739258, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3681640625, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982081651687622 + }, + { + "episode": 2856, + "epoch": 0.00570394602820817, + "loss/policy_avg": 0.034708861261606216, + "lr": 2.9660659509202456e-06, + "objective/entropy": 92.83428955078125, + "objective/kl": 3.7101073265075684, + "objective/non_score_reward": -0.18550537526607513, + "objective/rlhf_reward": 1.4710271191956492, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.8645012378692627, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4658203125, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9967145919799805 + }, + { + "episode": 2880, + "epoch": 0.005751878347772944, + "loss/policy_avg": 0.042165808379650116, + "lr": 2.965778374233129e-06, + "objective/entropy": 88.99725341796875, + "objective/kl": 3.481161594390869, + "objective/non_score_reward": -0.17405806481838226, + "objective/rlhf_reward": -1.0443483591079712, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.332367420196533, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4189453125, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9940729141235352 + }, + { + "episode": 2904, + "epoch": 0.005799810667337719, + "loss/policy_avg": -0.0020185885950922966, + "lr": 2.9654907975460125e-06, + "objective/entropy": 86.50545501708984, + "objective/kl": 2.8491549491882324, + "objective/non_score_reward": -0.1424577534198761, + "objective/rlhf_reward": -0.8547464869916439, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.421205997467041, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.380859375, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011210441589355 + }, + { + "episode": 2928, + "epoch": 0.005847742986902493, + "loss/policy_avg": 0.003570199478417635, + "lr": 2.965203220858896e-06, + "objective/entropy": 108.00729370117188, + "objective/kl": 0.9019589424133301, + "objective/non_score_reward": -0.045097947120666504, + "objective/rlhf_reward": 1.7294123247265816, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.512500762939453, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988384246826172 + }, + { + "episode": 2952, + "epoch": 0.005895675306467269, + "loss/policy_avg": 0.06972122192382812, + "lr": 2.9649156441717793e-06, + "objective/entropy": 79.64564514160156, + "objective/kl": 4.181931972503662, + "objective/non_score_reward": -0.20909659564495087, + "objective/rlhf_reward": 0.47980943619168626, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.783114433288574, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.373046875, + "step": 122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998268961906433 + }, + { + "episode": 2976, + "epoch": 0.005943607626032043, + "loss/policy_avg": 0.023752108216285706, + "lr": 2.9646280674846627e-06, + "objective/entropy": 114.72842407226562, + "objective/kl": 1.663236379623413, + "objective/non_score_reward": -0.08316181600093842, + "objective/rlhf_reward": 5.501029096543789, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.9570660591125488, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.541015625, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014548301696777 + }, + { + "episode": 3000, + "epoch": 0.005991539945596818, + "loss/policy_avg": 0.028327319771051407, + "lr": 2.964340490797546e-06, + "objective/entropy": 116.76904296875, + "objective/kl": 2.7088029384613037, + "objective/non_score_reward": -0.1354401558637619, + "objective/rlhf_reward": -0.8126408718526363, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9975041151046753, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.59375, + "step": 124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992276430130005 + }, + { + "episode": 3024, + "epoch": 0.006039472265161592, + "loss/policy_avg": 0.04839755967259407, + "lr": 2.9640529141104296e-06, + "objective/entropy": 79.92448425292969, + "objective/kl": 0.4934648871421814, + "objective/non_score_reward": -0.02467324212193489, + "objective/rlhf_reward": 3.6375390742850713, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 0.8837705254554749, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.375, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0031301975250244 + }, + { + "episode": 3048, + "epoch": 0.0060874045847263665, + "loss/policy_avg": 0.008615978062152863, + "lr": 2.9637653374233126e-06, + "objective/entropy": 96.88383483886719, + "objective/kl": 4.236352920532227, + "objective/non_score_reward": -0.21181762218475342, + "objective/rlhf_reward": -1.2709058001637459, + "objective/scores": 0.0, + "policy/approxkl_avg": 14.248335838317871, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4677734375, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001118183135986 + }, + { + "episode": 3072, + "epoch": 0.006135336904291141, + "loss/policy_avg": 0.0364036038517952, + "lr": 2.9634777607361964e-06, + "objective/entropy": 94.80747985839844, + "objective/kl": 1.827704668045044, + "objective/non_score_reward": -0.09138523042201996, + "objective/rlhf_reward": -0.54831138625741, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.417388916015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.380859375, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999357461929321 + }, + { + "episode": 3096, + "epoch": 0.0061832692238559155, + "loss/policy_avg": 0.009975227527320385, + "lr": 2.96319018404908e-06, + "objective/entropy": 99.26768493652344, + "objective/kl": 2.533238172531128, + "objective/non_score_reward": -0.12666192650794983, + "objective/rlhf_reward": 1.37727163438095, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.9337663650512695, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.470703125, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998363971710205 + }, + { + "episode": 3120, + "epoch": 0.00623120154342069, + "loss/policy_avg": 0.0010055415332317352, + "lr": 2.9629026073619633e-06, + "objective/entropy": 73.36211395263672, + "objective/kl": 3.567695379257202, + "objective/non_score_reward": -0.17838476598262787, + "objective/rlhf_reward": 0.8224806983462181, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.058878421783447, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3671875, + "step": 129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992671012878418 + }, + { + "episode": 3144, + "epoch": 0.0062791338629854644, + "loss/policy_avg": 0.08540584146976471, + "lr": 2.9626150306748467e-06, + "objective/entropy": 100.85992431640625, + "objective/kl": 3.6413960456848145, + "objective/non_score_reward": -0.1820698082447052, + "objective/rlhf_reward": 1.044824347685708, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.9973220825195312, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4326171875, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985313415527344 + }, + { + "episode": 3168, + "epoch": 0.006327066182550239, + "loss/policy_avg": -0.025395512580871582, + "lr": 2.96232745398773e-06, + "objective/entropy": 89.28819274902344, + "objective/kl": 3.8203017711639404, + "objective/non_score_reward": -0.1910150945186615, + "objective/rlhf_reward": 2.639488028822582, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.501034259796143, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.412109375, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998079538345337 + }, + { + "episode": 3192, + "epoch": 0.006374998502115013, + "loss/policy_avg": -0.005867543164640665, + "lr": 2.9620398773006136e-06, + "objective/entropy": 68.26222229003906, + "objective/kl": 1.8740122318267822, + "objective/non_score_reward": -0.09370061010122299, + "objective/rlhf_reward": 1.3305856001070346, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 0.8239774703979492, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.318359375, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0033674240112305 + }, + { + "episode": 3216, + "epoch": 0.006422930821679788, + "loss/policy_avg": 0.0696270763874054, + "lr": 2.961752300613497e-06, + "objective/entropy": 97.19991302490234, + "objective/kl": 4.19876766204834, + "objective/non_score_reward": -0.20993834733963013, + "objective/rlhf_reward": 0.6331591431489791, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 8.167587280273438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9949493408203125 + }, + { + "episode": 3240, + "epoch": 0.006470863141244562, + "loss/policy_avg": -0.018206734210252762, + "lr": 2.9614647239263804e-06, + "objective/entropy": 94.9989242553711, + "objective/kl": 2.0642008781433105, + "objective/non_score_reward": -0.10321004688739777, + "objective/rlhf_reward": 1.273529016642889, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.7244369983673096, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4130859375, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.005453586578369 + }, + { + "episode": 3264, + "epoch": 0.006518795460809337, + "loss/policy_avg": 0.019078437238931656, + "lr": 2.961177147239264e-06, + "objective/entropy": 121.76716613769531, + "objective/kl": 3.5134658813476562, + "objective/non_score_reward": -0.17567329108715057, + "objective/rlhf_reward": 0.9459602460265158, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.777169704437256, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.49609375, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978556632995605 + }, + { + "episode": 3288, + "epoch": 0.006566727780374112, + "loss/policy_avg": -0.009853353723883629, + "lr": 2.9608895705521473e-06, + "objective/entropy": 82.51039123535156, + "objective/kl": 2.13692045211792, + "objective/non_score_reward": -0.10684603452682495, + "objective/rlhf_reward": -0.6410762034356594, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7726022005081177, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3798828125, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021941661834717 + }, + { + "episode": 3312, + "epoch": 0.006614660099938887, + "loss/policy_avg": 0.005692376289516687, + "lr": 2.9606019938650307e-06, + "objective/entropy": 98.13601684570312, + "objective/kl": 1.3870224952697754, + "objective/non_score_reward": -0.06935112178325653, + "objective/rlhf_reward": -0.41610670927911997, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8871004581451416, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41796875, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001054286956787 + }, + { + "episode": 3336, + "epoch": 0.006662592419503661, + "loss/policy_avg": 0.03439762443304062, + "lr": 2.960314417177914e-06, + "objective/entropy": 99.6649398803711, + "objective/kl": 1.8266404867172241, + "objective/non_score_reward": -0.09133201837539673, + "objective/rlhf_reward": -0.547992117702961, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.54854679107666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44921875, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974285364151 + }, + { + "episode": 3360, + "epoch": 0.006710524739068436, + "loss/policy_avg": 0.035291895270347595, + "lr": 2.9600268404907976e-06, + "objective/entropy": 98.41807556152344, + "objective/kl": 1.7121431827545166, + "objective/non_score_reward": -0.08560715615749359, + "objective/rlhf_reward": -0.5136429183185101, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.381733417510986, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4248046875, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998841404914856 + }, + { + "episode": 3384, + "epoch": 0.00675845705863321, + "loss/policy_avg": 0.013827784918248653, + "lr": 2.9597392638036814e-06, + "objective/entropy": 113.13424682617188, + "objective/kl": 3.708888053894043, + "objective/non_score_reward": -0.1854443997144699, + "objective/rlhf_reward": -1.1126664280891418, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.854903221130371, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.494140625, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013322830200195 + }, + { + "episode": 3408, + "epoch": 0.006806389378197985, + "loss/policy_avg": -0.030264653265476227, + "lr": 2.9594516871165644e-06, + "objective/entropy": 94.34181213378906, + "objective/kl": 4.2759199142456055, + "objective/non_score_reward": -0.2137959599494934, + "objective/rlhf_reward": 0.5234041993857655, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.057184219360352, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.40625, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022776126861572 + }, + { + "episode": 3432, + "epoch": 0.006854321697762759, + "loss/policy_avg": 0.0027130991220474243, + "lr": 2.959164110429448e-06, + "objective/entropy": 102.87808227539062, + "objective/kl": 3.251103162765503, + "objective/non_score_reward": -0.16255515813827515, + "objective/rlhf_reward": 1.6087283996107078, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 8.038439750671387, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.458984375, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974896907806396 + }, + { + "episode": 3456, + "epoch": 0.0069022540173275335, + "loss/policy_avg": -0.004272802267223597, + "lr": 2.9588765337423313e-06, + "objective/entropy": 86.80547332763672, + "objective/kl": 3.4430394172668457, + "objective/non_score_reward": -0.17215196788311005, + "objective/rlhf_reward": 2.752666762558859, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.9540340900421143, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3779296875, + "step": 143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995760917663574 + }, + { + "episode": 3480, + "epoch": 0.006950186336892308, + "loss/policy_avg": 0.032740674912929535, + "lr": 2.9585889570552147e-06, + "objective/entropy": 105.7743148803711, + "objective/kl": 2.3428237438201904, + "objective/non_score_reward": -0.11714118719100952, + "objective/rlhf_reward": -0.7028470821678638, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1152796745300293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4736328125, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999332427978516 + }, + { + "episode": 3504, + "epoch": 0.0069981186564570825, + "loss/policy_avg": 0.07414033263921738, + "lr": 2.958301380368098e-06, + "objective/entropy": 107.20855712890625, + "objective/kl": 2.0231757164001465, + "objective/non_score_reward": -0.10115879029035568, + "objective/rlhf_reward": -0.606952715665102, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6780776977539062, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4892578125, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974359273910522 + }, + { + "episode": 3528, + "epoch": 0.007046050976021857, + "loss/policy_avg": 0.0006100209429860115, + "lr": 2.9580138036809815e-06, + "objective/entropy": 120.5149154663086, + "objective/kl": 0.8261145353317261, + "objective/non_score_reward": -0.04130573198199272, + "objective/rlhf_reward": 3.537744146300595, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.7252533435821533, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.541015625, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991565942764282 + }, + { + "episode": 3552, + "epoch": 0.0070939832955866314, + "loss/policy_avg": 0.02252659574151039, + "lr": 2.957726226993865e-06, + "objective/entropy": 110.61607360839844, + "objective/kl": 2.5127158164978027, + "objective/non_score_reward": -0.1256357729434967, + "objective/rlhf_reward": 2.2461853697896004, + "objective/scores": 0.5, + "policy/approxkl_avg": 6.023884296417236, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4951171875, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993090629577637 + }, + { + "episode": 3576, + "epoch": 0.007141915615151406, + "loss/policy_avg": 0.0941631868481636, + "lr": 2.9574386503067484e-06, + "objective/entropy": 97.95420837402344, + "objective/kl": 1.7816667556762695, + "objective/non_score_reward": -0.08908335864543915, + "objective/rlhf_reward": -0.5345001071691513, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3279268741607666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4716796875, + "step": 148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001986503601074 + }, + { + "episode": 3600, + "epoch": 0.00718984793471618, + "loss/policy_avg": -0.014646870084106922, + "lr": 2.957151073619632e-06, + "objective/entropy": 102.42832946777344, + "objective/kl": 3.9100356101989746, + "objective/non_score_reward": -0.19550180435180664, + "objective/rlhf_reward": -1.1730107590556145, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.219538688659668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4638671875, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99825918674469 + }, + { + "episode": 3624, + "epoch": 0.007237780254280955, + "loss/policy_avg": 0.003555338829755783, + "lr": 2.9568634969325152e-06, + "objective/entropy": 116.14208984375, + "objective/kl": 3.1522469520568848, + "objective/non_score_reward": -0.15761233866214752, + "objective/rlhf_reward": -0.9456739947199821, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.453514099121094, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.521484375, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997190237045288 + }, + { + "episode": 3648, + "epoch": 0.00728571257384573, + "loss/policy_avg": -0.007403637282550335, + "lr": 2.956575920245399e-06, + "objective/entropy": 78.631591796875, + "objective/kl": 4.22671365737915, + "objective/non_score_reward": -0.2113357037305832, + "objective/rlhf_reward": 0.6247750867596473, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.373077869415283, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3740234375, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999770164489746 + }, + { + "episode": 3672, + "epoch": 0.007333644893410505, + "loss/policy_avg": -0.00014569982886314392, + "lr": 2.9562883435582825e-06, + "objective/entropy": 82.0683822631836, + "objective/kl": 2.368504047393799, + "objective/non_score_reward": -0.11842521280050278, + "objective/rlhf_reward": -0.7105511948466301, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.596038341522217, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3994140625, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985384941101074 + }, + { + "episode": 3696, + "epoch": 0.007381577212975279, + "loss/policy_avg": -0.019364282488822937, + "lr": 2.956000766871166e-06, + "objective/entropy": 94.45833587646484, + "objective/kl": 1.380356788635254, + "objective/non_score_reward": -0.06901785731315613, + "objective/rlhf_reward": 2.585892900824547, + "objective/scores": 0.5, + "policy/approxkl_avg": 6.881087303161621, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.451171875, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978630542755127 + }, + { + "episode": 3720, + "epoch": 0.007429509532540054, + "loss/policy_avg": 0.008754800073802471, + "lr": 2.9557131901840494e-06, + "objective/entropy": 80.05279541015625, + "objective/kl": 4.150510787963867, + "objective/non_score_reward": -0.20752553641796112, + "objective/rlhf_reward": 1.7548467516899109, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.920881748199463, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3662109375, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992568492889404 + }, + { + "episode": 3744, + "epoch": 0.007477441852104828, + "loss/policy_avg": 0.048283614218235016, + "lr": 2.955425613496933e-06, + "objective/entropy": 83.7545394897461, + "objective/kl": 2.769890308380127, + "objective/non_score_reward": -0.13849452137947083, + "objective/rlhf_reward": 1.3062760390747918, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.2954061031341553, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.37890625, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993659257888794 + }, + { + "episode": 3768, + "epoch": 0.007525374171669603, + "loss/policy_avg": -0.008645785972476006, + "lr": 2.955138036809816e-06, + "objective/entropy": 114.5855941772461, + "objective/kl": 1.7211155891418457, + "objective/non_score_reward": -0.08605578541755676, + "objective/rlhf_reward": -0.5163346920162439, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4327452182769775, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008654594421387 + }, + { + "episode": 3792, + "epoch": 0.007573306491234377, + "loss/policy_avg": 0.005367459263652563, + "lr": 2.9548504601226992e-06, + "objective/entropy": 136.01806640625, + "objective/kl": 1.2361669540405273, + "objective/non_score_reward": -0.06180834397673607, + "objective/rlhf_reward": 3.4147284538430385, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.365799903869629, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.587890625, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994893074035645 + }, + { + "episode": 3816, + "epoch": 0.007621238810799152, + "loss/policy_avg": 0.05374927446246147, + "lr": 2.9545628834355827e-06, + "objective/entropy": 111.06832122802734, + "objective/kl": 3.125086545944214, + "objective/non_score_reward": -0.15625432133674622, + "objective/rlhf_reward": 2.8480526530129127, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.463106155395508, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4765625, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998197317123413 + }, + { + "episode": 3840, + "epoch": 0.007669171130363926, + "loss/policy_avg": -0.00819938350468874, + "lr": 2.954275306748466e-06, + "objective/entropy": 120.80804443359375, + "objective/kl": 0.8327538371086121, + "objective/non_score_reward": -0.04163769632577896, + "objective/rlhf_reward": 2.3342331853868457, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.127401351928711, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.51953125, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999913215637207 + }, + { + "episode": 3864, + "epoch": 0.0077171034499287005, + "loss/policy_avg": 0.010007216595113277, + "lr": 2.9539877300613495e-06, + "objective/entropy": 95.33415985107422, + "objective/kl": 2.7522780895233154, + "objective/non_score_reward": -0.1376139223575592, + "objective/rlhf_reward": 2.1743165515363216, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.405463218688965, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.43359375, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987680912017822 + }, + { + "episode": 3888, + "epoch": 0.007765035769493475, + "loss/policy_avg": -0.012971251271665096, + "lr": 2.9537001533742334e-06, + "objective/entropy": 99.96492767333984, + "objective/kl": 2.322258472442627, + "objective/non_score_reward": -0.11611293256282806, + "objective/rlhf_reward": -0.6966775134205818, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.1049418449401855, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4755859375, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979031085968018 + }, + { + "episode": 3912, + "epoch": 0.00781296808905825, + "loss/policy_avg": -0.025686660781502724, + "lr": 2.9534125766871168e-06, + "objective/entropy": 78.95204162597656, + "objective/kl": 4.09630012512207, + "objective/non_score_reward": -0.20481500029563904, + "objective/rlhf_reward": -1.22888994961977, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.167758941650391, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3701171875, + "step": 162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969532489776611 + }, + { + "episode": 3936, + "epoch": 0.007860900408623025, + "loss/policy_avg": 0.04312625154852867, + "lr": 2.953125e-06, + "objective/entropy": 83.40502166748047, + "objective/kl": 2.593156576156616, + "objective/non_score_reward": -0.1296578347682953, + "objective/rlhf_reward": -0.7779469415545464, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5280444622039795, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4619140625, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998535394668579 + }, + { + "episode": 3960, + "epoch": 0.0079088327281878, + "loss/policy_avg": -0.0026622358709573746, + "lr": 2.9528374233128836e-06, + "objective/entropy": 78.30680847167969, + "objective/kl": 3.6898269653320312, + "objective/non_score_reward": -0.184491366147995, + "objective/rlhf_reward": -1.1069481372833252, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.509169578552246, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4990234375, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.996406078338623 + }, + { + "episode": 3984, + "epoch": 0.007956765047752574, + "loss/policy_avg": -0.014012967236340046, + "lr": 2.952549846625767e-06, + "objective/entropy": 79.10055541992188, + "objective/kl": 2.7429778575897217, + "objective/non_score_reward": -0.1371489018201828, + "objective/rlhf_reward": -0.8228934183716774, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4424331188201904, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.330078125, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998531699180603 + }, + { + "episode": 4008, + "epoch": 0.008004697367317348, + "loss/policy_avg": 0.15797269344329834, + "lr": 2.9522622699386505e-06, + "objective/entropy": 90.83220672607422, + "objective/kl": 3.6862807273864746, + "objective/non_score_reward": -0.18431401252746582, + "objective/rlhf_reward": -1.1058840937912464, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.749197483062744, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.404296875, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998784065246582 + }, + { + "episode": 4032, + "epoch": 0.008052629686882123, + "loss/policy_avg": 0.06913164258003235, + "lr": 2.951974693251534e-06, + "objective/entropy": 115.73429870605469, + "objective/kl": 1.077301025390625, + "objective/non_score_reward": -0.05386505275964737, + "objective/rlhf_reward": -0.3231903091073036, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7101439237594604, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039100646972656 + }, + { + "episode": 4056, + "epoch": 0.008100562006446897, + "loss/policy_avg": -0.02192726731300354, + "lr": 2.9516871165644173e-06, + "objective/entropy": 95.98412322998047, + "objective/kl": 2.261876344680786, + "objective/non_score_reward": -0.11309381574392319, + "objective/rlhf_reward": 1.9054964763285613, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.353433609008789, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4462890625, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999911785125732 + }, + { + "episode": 4080, + "epoch": 0.008148494326011672, + "loss/policy_avg": 0.13195769488811493, + "lr": 2.9513995398773008e-06, + "objective/entropy": 100.91707611083984, + "objective/kl": 1.6543936729431152, + "objective/non_score_reward": -0.08271969109773636, + "objective/rlhf_reward": -0.49631809815764427, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.574903726577759, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.439453125, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993833303451538 + }, + { + "episode": 4104, + "epoch": 0.008196426645576446, + "loss/policy_avg": 0.03052515722811222, + "lr": 2.951111963190184e-06, + "objective/entropy": 103.54852294921875, + "objective/kl": 2.2816290855407715, + "objective/non_score_reward": -0.11408144235610962, + "objective/rlhf_reward": 1.121691290044907, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.325289726257324, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.462890625, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998672008514404 + }, + { + "episode": 4128, + "epoch": 0.00824435896514122, + "loss/policy_avg": 0.02215631864964962, + "lr": 2.9508243865030676e-06, + "objective/entropy": 98.87413024902344, + "objective/kl": 1.944648265838623, + "objective/non_score_reward": -0.09723242372274399, + "objective/rlhf_reward": 1.5538485747237338, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.168482303619385, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3955078125, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993770122528076 + }, + { + "episode": 4152, + "epoch": 0.008292291284705995, + "loss/policy_avg": 0.0014503588899970055, + "lr": 2.950536809815951e-06, + "objective/entropy": 111.60205078125, + "objective/kl": 2.5749354362487793, + "objective/non_score_reward": -0.12874677777290344, + "objective/rlhf_reward": 1.033699327835563, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.381861686706543, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.51171875, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000516891479492 + }, + { + "episode": 4176, + "epoch": 0.00834022360427077, + "loss/policy_avg": 0.04625057056546211, + "lr": 2.9502492331288345e-06, + "objective/entropy": 131.60983276367188, + "objective/kl": 2.3548214435577393, + "objective/non_score_reward": -0.11774107068777084, + "objective/rlhf_reward": 1.1863428589394893, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 7.6308274269104, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6796875, + "step": 173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971277713775635 + }, + { + "episode": 4200, + "epoch": 0.008388155923835544, + "loss/policy_avg": -0.0020451643504202366, + "lr": 2.949961656441718e-06, + "objective/entropy": 74.44924926757812, + "objective/kl": 1.943938136100769, + "objective/non_score_reward": -0.09719689935445786, + "objective/rlhf_reward": 1.4168185889720917, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.046441078186035, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.357421875, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995932579040527 + }, + { + "episode": 4224, + "epoch": 0.008436088243400319, + "loss/policy_avg": 0.03397057205438614, + "lr": 2.9496740797546013e-06, + "objective/entropy": 81.3310546875, + "objective/kl": 3.0391221046447754, + "objective/non_score_reward": -0.15195612609386444, + "objective/rlhf_reward": 2.088263288140297, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.109105348587036, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4501953125, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994224309921265 + }, + { + "episode": 4248, + "epoch": 0.008484020562965093, + "loss/policy_avg": 0.013863434083759785, + "lr": 2.9493865030674847e-06, + "objective/entropy": 130.79148864746094, + "objective/kl": 3.822122097015381, + "objective/non_score_reward": -0.19110608100891113, + "objective/rlhf_reward": 4.853363424539566, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.8759713172912598, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.556640625, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999608993530273 + }, + { + "episode": 4272, + "epoch": 0.008531952882529868, + "loss/policy_avg": 0.018201462924480438, + "lr": 2.949098926380368e-06, + "objective/entropy": 99.84855651855469, + "objective/kl": 2.9463653564453125, + "objective/non_score_reward": -0.1473182737827301, + "objective/rlhf_reward": 0.9222703759909902, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.760159969329834, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4462890625, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981515407562256 + }, + { + "episode": 4296, + "epoch": 0.008579885202094642, + "loss/policy_avg": 0.07175146043300629, + "lr": 2.9488113496932516e-06, + "objective/entropy": 92.46806335449219, + "objective/kl": 2.2200350761413574, + "objective/non_score_reward": -0.11100175231695175, + "objective/rlhf_reward": 1.1401694563568863, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.010188579559326, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.400390625, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0035808086395264 + }, + { + "episode": 4320, + "epoch": 0.008627817521659416, + "loss/policy_avg": 0.007675966713577509, + "lr": 2.948523773006135e-06, + "objective/entropy": 78.8734130859375, + "objective/kl": 3.0995230674743652, + "objective/non_score_reward": -0.15497615933418274, + "objective/rlhf_reward": -0.9298568814992905, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.705993175506592, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3857421875, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990061521530151 + }, + { + "episode": 4344, + "epoch": 0.008675749841224191, + "loss/policy_avg": 0.04107556492090225, + "lr": 2.9482361963190184e-06, + "objective/entropy": 107.2726058959961, + "objective/kl": 4.423355579376221, + "objective/non_score_reward": -0.22116778790950775, + "objective/rlhf_reward": 4.672993302345276, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.791415214538574, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4658203125, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003271102905273 + }, + { + "episode": 4368, + "epoch": 0.008723682160788965, + "loss/policy_avg": 0.06325562298297882, + "lr": 2.947948619631902e-06, + "objective/entropy": 92.09099578857422, + "objective/kl": 3.0468149185180664, + "objective/non_score_reward": -0.15234076976776123, + "objective/rlhf_reward": 0.9787446868112888, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.4912400245666504, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.421875, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980504512786865 + }, + { + "episode": 4392, + "epoch": 0.00877161448035374, + "loss/policy_avg": 0.011448364704847336, + "lr": 2.9476610429447853e-06, + "objective/entropy": 121.05316162109375, + "objective/kl": 1.7364263534545898, + "objective/non_score_reward": -0.08682131767272949, + "objective/rlhf_reward": -0.5209279283881187, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5513181686401367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.50390625, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001152515411377 + }, + { + "episode": 4416, + "epoch": 0.008819546799918514, + "loss/policy_avg": -0.011562827974557877, + "lr": 2.9473734662576687e-06, + "objective/entropy": 111.20529174804688, + "objective/kl": 1.483881950378418, + "objective/non_score_reward": -0.07419410347938538, + "objective/rlhf_reward": 1.5548354145139456, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.9162025451660156, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5234375, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024704933166504 + }, + { + "episode": 4440, + "epoch": 0.008867479119483289, + "loss/policy_avg": -0.06913043558597565, + "lr": 2.947085889570552e-06, + "objective/entropy": 104.00546264648438, + "objective/kl": 2.555075168609619, + "objective/non_score_reward": -0.12775374948978424, + "objective/rlhf_reward": -0.7665224988013506, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8956241607666016, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0050532817840576 + }, + { + "episode": 4464, + "epoch": 0.008915411439048063, + "loss/policy_avg": -0.015017647296190262, + "lr": 2.946798312883436e-06, + "objective/entropy": 89.32418823242188, + "objective/kl": 3.777963638305664, + "objective/non_score_reward": -0.18889819085597992, + "objective/rlhf_reward": 1.8666108772158623, + "objective/scores": 0.5, + "policy/approxkl_avg": 7.5635666847229, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4423828125, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980634450912476 + }, + { + "episode": 4488, + "epoch": 0.008963343758612838, + "loss/policy_avg": 0.01807108148932457, + "lr": 2.946510736196319e-06, + "objective/entropy": 70.30358123779297, + "objective/kl": 4.923575401306152, + "objective/non_score_reward": -0.2461787760257721, + "objective/rlhf_reward": 2.3085060515386275, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 6.517728805541992, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3193359375, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992297887802124 + }, + { + "episode": 4512, + "epoch": 0.009011276078177614, + "loss/policy_avg": 0.048264261335134506, + "lr": 2.9462231595092024e-06, + "objective/entropy": 89.6055908203125, + "objective/kl": 2.6018030643463135, + "objective/non_score_reward": -0.1300901472568512, + "objective/rlhf_reward": -0.7805408239364624, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.367836952209473, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4033203125, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970029592514038 + }, + { + "episode": 4536, + "epoch": 0.009059208397742389, + "loss/policy_avg": -0.018038026988506317, + "lr": 2.945935582822086e-06, + "objective/entropy": 111.50608825683594, + "objective/kl": 3.1104469299316406, + "objective/non_score_reward": -0.15552234649658203, + "objective/rlhf_reward": 0.9596551817348804, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 6.256875038146973, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4921875, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998650074005127 + }, + { + "episode": 4560, + "epoch": 0.009107140717307163, + "loss/policy_avg": 0.0629761666059494, + "lr": 2.9456480061349693e-06, + "objective/entropy": 94.15446472167969, + "objective/kl": 7.072273254394531, + "objective/non_score_reward": -0.3536137044429779, + "objective/rlhf_reward": 0.19943478066215548, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 5.015763759613037, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4111328125, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977893829345703 + }, + { + "episode": 4584, + "epoch": 0.009155073036871937, + "loss/policy_avg": 0.09882394969463348, + "lr": 2.9453604294478527e-06, + "objective/entropy": 110.31282806396484, + "objective/kl": 3.7707881927490234, + "objective/non_score_reward": -0.18853941559791565, + "objective/rlhf_reward": 1.006006673764123, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.8660739660263062, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003352165222168 + }, + { + "episode": 4608, + "epoch": 0.009203005356436712, + "loss/policy_avg": 0.05224372074007988, + "lr": 2.945072852760736e-06, + "objective/entropy": 77.56993103027344, + "objective/kl": 2.3601417541503906, + "objective/non_score_reward": -0.11800706386566162, + "objective/rlhf_reward": -0.7080423720180988, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3037220239639282, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4619140625, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002467632293701 + }, + { + "episode": 4632, + "epoch": 0.009250937676001486, + "loss/policy_avg": 0.025189347565174103, + "lr": 2.9447852760736196e-06, + "objective/entropy": 94.3932113647461, + "objective/kl": 3.729519844055176, + "objective/non_score_reward": -0.1864759922027588, + "objective/rlhf_reward": 4.8811439499258995, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.248610973358154, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970158338546753 + }, + { + "episode": 4656, + "epoch": 0.009298869995566261, + "loss/policy_avg": 0.08707498013973236, + "lr": 2.944497699386503e-06, + "objective/entropy": 83.36654663085938, + "objective/kl": 1.8804826736450195, + "objective/non_score_reward": -0.09402414411306381, + "objective/rlhf_reward": 1.573098265420331, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.446077585220337, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5703125, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9964580535888672 + }, + { + "episode": 4680, + "epoch": 0.009346802315131035, + "loss/policy_avg": 0.2270449846982956, + "lr": 2.9442101226993864e-06, + "objective/entropy": 79.55392456054688, + "objective/kl": 2.850280284881592, + "objective/non_score_reward": -0.14251400530338287, + "objective/rlhf_reward": 1.7289753762247058, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.7298240661621094, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.36328125, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002413272857666 + }, + { + "episode": 4704, + "epoch": 0.00939473463469581, + "loss/policy_avg": 0.031376324594020844, + "lr": 2.9439225460122703e-06, + "objective/entropy": 109.0032958984375, + "objective/kl": 2.929868459701538, + "objective/non_score_reward": -0.14649342000484467, + "objective/rlhf_reward": -0.8789605014026165, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.109142780303955, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.53515625, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997551679611206 + }, + { + "episode": 4728, + "epoch": 0.009442666954260584, + "loss/policy_avg": -0.010848847217857838, + "lr": 2.9436349693251537e-06, + "objective/entropy": 112.34270477294922, + "objective/kl": 2.358966588973999, + "objective/non_score_reward": -0.11794832348823547, + "objective/rlhf_reward": -0.7076899390667677, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8509020805358887, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.533203125, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003365993499756 + }, + { + "episode": 4752, + "epoch": 0.009490599273825359, + "loss/policy_avg": 0.006295612081885338, + "lr": 2.943347392638037e-06, + "objective/entropy": 79.82736206054688, + "objective/kl": 2.0001227855682373, + "objective/non_score_reward": -0.10000614821910858, + "objective/rlhf_reward": 2.3999631367623806, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.3670144081115723, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3701171875, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000711441040039 + }, + { + "episode": 4776, + "epoch": 0.009538531593390133, + "loss/policy_avg": 0.023274961858987808, + "lr": 2.9430598159509205e-06, + "objective/entropy": 143.14405822753906, + "objective/kl": 2.683361530303955, + "objective/non_score_reward": -0.1341680884361267, + "objective/rlhf_reward": -0.8050084561109543, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6977474689483643, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.6484375, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967350959777832 + }, + { + "episode": 4800, + "epoch": 0.009586463912954908, + "loss/policy_avg": 0.0484347827732563, + "lr": 2.942772239263804e-06, + "objective/entropy": 70.44638061523438, + "objective/kl": 2.6708407402038574, + "objective/non_score_reward": -0.1335420310497284, + "objective/rlhf_reward": -0.8012522086501122, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.359506368637085, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.2998046875, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001941680908203 + }, + { + "episode": 4824, + "epoch": 0.009634396232519682, + "loss/policy_avg": 0.0490962453186512, + "lr": 2.9424846625766874e-06, + "objective/entropy": 138.9716339111328, + "objective/kl": 1.4425926208496094, + "objective/non_score_reward": -0.07212963700294495, + "objective/rlhf_reward": -0.4327778033912182, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.90745210647583, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.712890625, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985578060150146 + }, + { + "episode": 4848, + "epoch": 0.009682328552084457, + "loss/policy_avg": 0.011111119762063026, + "lr": 2.9421970858895704e-06, + "objective/entropy": 94.70375061035156, + "objective/kl": 2.026273727416992, + "objective/non_score_reward": -0.10131368041038513, + "objective/rlhf_reward": 1.1982979101480278, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.106295585632324, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.48046875, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998087882995605 + }, + { + "episode": 4872, + "epoch": 0.009730260871649231, + "loss/policy_avg": 0.14467109739780426, + "lr": 2.941909509202454e-06, + "objective/entropy": 132.40115356445312, + "objective/kl": 2.6548614501953125, + "objective/non_score_reward": -0.13274309039115906, + "objective/rlhf_reward": -0.7964584790170193, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.2515690326690674, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.583984375, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024476051330566 + }, + { + "episode": 4896, + "epoch": 0.009778193191214006, + "loss/policy_avg": -0.015946131199598312, + "lr": 2.9416219325153372e-06, + "objective/entropy": 89.10612487792969, + "objective/kl": 3.743453025817871, + "objective/non_score_reward": -0.18717265129089355, + "objective/rlhf_reward": 1.0142072074521913, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.4171063899993896, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4140625, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999828815460205 + }, + { + "episode": 4920, + "epoch": 0.00982612551077878, + "loss/policy_avg": -0.004729601554572582, + "lr": 2.9413343558282207e-06, + "objective/entropy": 90.44398498535156, + "objective/kl": 2.668938398361206, + "objective/non_score_reward": -0.13344691693782806, + "objective/rlhf_reward": 1.336561611707939, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.1168293952941895, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4140625, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030312538146973 + }, + { + "episode": 4944, + "epoch": 0.009874057830343555, + "loss/policy_avg": 0.009045930579304695, + "lr": 2.9410467791411045e-06, + "objective/entropy": 91.077880859375, + "objective/kl": 4.464130401611328, + "objective/non_score_reward": -0.2232065498828888, + "objective/rlhf_reward": -1.339239191263914, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.489830017089844, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3955078125, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992249011993408 + }, + { + "episode": 4968, + "epoch": 0.009921990149908329, + "loss/policy_avg": 0.03462841734290123, + "lr": 2.940759202453988e-06, + "objective/entropy": 80.1475601196289, + "objective/kl": 3.1364684104919434, + "objective/non_score_reward": -0.15682342648506165, + "objective/rlhf_reward": 1.1963025898147954, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.8880414962768555, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4736328125, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999406099319458 + }, + { + "episode": 4992, + "epoch": 0.009969922469473104, + "loss/policy_avg": 0.04384815692901611, + "lr": 2.9404716257668714e-06, + "objective/entropy": 143.44097900390625, + "objective/kl": 4.342629909515381, + "objective/non_score_reward": -0.21713152527809143, + "objective/rlhf_reward": 0.5033908735380802, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.68842887878418, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.599609375, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998138189315796 + }, + { + "episode": 5016, + "epoch": 0.010017854789037878, + "loss/policy_avg": 0.0068609099835157394, + "lr": 2.940184049079755e-06, + "objective/entropy": 95.34283447265625, + "objective/kl": 0.6369936466217041, + "objective/non_score_reward": -0.031849682331085205, + "objective/rlhf_reward": -0.19109809957444668, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6118857860565186, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4248046875, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000188112258911 + }, + { + "episode": 5040, + "epoch": 0.010065787108602653, + "loss/policy_avg": 0.020278628915548325, + "lr": 2.9398964723926382e-06, + "objective/entropy": 104.02425384521484, + "objective/kl": 2.4724924564361572, + "objective/non_score_reward": -0.12362462282180786, + "objective/rlhf_reward": -0.7417477183043957, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.406173825263977, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4384765625, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024924278259277 + }, + { + "episode": 5064, + "epoch": 0.010113719428167427, + "loss/policy_avg": 0.0054527949541807175, + "lr": 2.9396088957055217e-06, + "objective/entropy": 79.85182189941406, + "objective/kl": 3.1846823692321777, + "objective/non_score_reward": -0.15923413634300232, + "objective/rlhf_reward": -0.9554047547280788, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.934493064880371, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3837890625, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988563060760498 + }, + { + "episode": 5088, + "epoch": 0.010161651747732202, + "loss/policy_avg": 0.026497410610318184, + "lr": 2.939321319018405e-06, + "objective/entropy": 95.25618743896484, + "objective/kl": 5.95845890045166, + "objective/non_score_reward": -0.29792293906211853, + "objective/rlhf_reward": -1.7875376045703888, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.662364482879639, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4482421875, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997788429260254 + }, + { + "episode": 5112, + "epoch": 0.010209584067296976, + "loss/policy_avg": -0.0156431645154953, + "lr": 2.9390337423312885e-06, + "objective/entropy": 93.28901672363281, + "objective/kl": 2.0243005752563477, + "objective/non_score_reward": -0.10121503472328186, + "objective/rlhf_reward": 3.1782883056384734, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 6.154999732971191, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.486328125, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996551513671875 + }, + { + "episode": 5136, + "epoch": 0.01025751638686175, + "loss/policy_avg": 0.11566158384084702, + "lr": 2.938746165644172e-06, + "objective/entropy": 84.64888000488281, + "objective/kl": 3.7028586864471436, + "objective/non_score_reward": -0.18514293432235718, + "objective/rlhf_reward": 4.889142356812954, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.0737574100494385, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3779296875, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002824068069458 + }, + { + "episode": 5160, + "epoch": 0.010305448706426525, + "loss/policy_avg": 0.03255853429436684, + "lr": 2.9384585889570554e-06, + "objective/entropy": 93.01973724365234, + "objective/kl": 3.0276973247528076, + "objective/non_score_reward": -0.1513848900794983, + "objective/rlhf_reward": -0.9083093330264091, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.688408851623535, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4326171875, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980957508087158 + }, + { + "episode": 5184, + "epoch": 0.0103533810259913, + "loss/policy_avg": 0.010334126651287079, + "lr": 2.9381710122699384e-06, + "objective/entropy": 96.66116333007812, + "objective/kl": 1.7254347801208496, + "objective/non_score_reward": -0.0862717479467392, + "objective/rlhf_reward": 1.4823695756495, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.9884190559387207, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.451171875, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018539428710938 + }, + { + "episode": 5208, + "epoch": 0.010401313345556076, + "loss/policy_avg": 0.03999362140893936, + "lr": 2.937883435582822e-06, + "objective/entropy": 106.20416259765625, + "objective/kl": 4.020906448364258, + "objective/non_score_reward": -0.20104531943798065, + "objective/rlhf_reward": -1.2062720246613026, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.893670082092285, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4794921875, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967663288116455 + }, + { + "episode": 5232, + "epoch": 0.01044924566512085, + "loss/policy_avg": 0.013381986878812313, + "lr": 2.9375958588957056e-06, + "objective/entropy": 90.11424255371094, + "objective/kl": 2.356520652770996, + "objective/non_score_reward": -0.11782602965831757, + "objective/rlhf_reward": -0.7069562170654535, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.7618801593780518, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3935546875, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025219917297363 + }, + { + "episode": 5256, + "epoch": 0.010497177984685625, + "loss/policy_avg": 0.024674871936440468, + "lr": 2.937308282208589e-06, + "objective/entropy": 93.43382263183594, + "objective/kl": 5.286586761474609, + "objective/non_score_reward": -0.26432931423187256, + "objective/rlhf_reward": -1.5859759449958801, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.952975273132324, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4375, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9958469867706299 + }, + { + "episode": 5280, + "epoch": 0.010545110304250399, + "loss/policy_avg": 0.11505468934774399, + "lr": 2.9370207055214725e-06, + "objective/entropy": 82.6742172241211, + "objective/kl": 1.9254010915756226, + "objective/non_score_reward": -0.09627006947994232, + "objective/rlhf_reward": -0.5776203647255898, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.854060173034668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3759765625, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002312660217285 + }, + { + "episode": 5304, + "epoch": 0.010593042623815174, + "loss/policy_avg": 0.03429210186004639, + "lr": 2.936733128834356e-06, + "objective/entropy": 84.30593872070312, + "objective/kl": 3.3239188194274902, + "objective/non_score_reward": -0.16619592905044556, + "objective/rlhf_reward": -0.997175544500351, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.019024848937988, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3857421875, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975643157958984 + }, + { + "episode": 5328, + "epoch": 0.010640974943379948, + "loss/policy_avg": 0.05632080137729645, + "lr": 2.9364455521472393e-06, + "objective/entropy": 87.685546875, + "objective/kl": 4.226175308227539, + "objective/non_score_reward": -0.2113087773323059, + "objective/rlhf_reward": -1.2678526304662228, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.492901802062988, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4873046875, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981372356414795 + }, + { + "episode": 5352, + "epoch": 0.010688907262944723, + "loss/policy_avg": 0.05630398541688919, + "lr": 2.9361579754601228e-06, + "objective/entropy": 93.71208190917969, + "objective/kl": 2.805910110473633, + "objective/non_score_reward": -0.14029550552368164, + "objective/rlhf_reward": -0.841773010790348, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.403786659240723, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4150390625, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993641376495361 + }, + { + "episode": 5376, + "epoch": 0.010736839582509497, + "loss/policy_avg": 0.058135662227869034, + "lr": 2.935870398773006e-06, + "objective/entropy": 97.3525390625, + "objective/kl": 3.797070264816284, + "objective/non_score_reward": -0.1898535192012787, + "objective/rlhf_reward": -1.1391210034489632, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6912543773651123, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.421875, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999557733535767 + }, + { + "episode": 5400, + "epoch": 0.010784771902074271, + "loss/policy_avg": 0.09301318228244781, + "lr": 2.9355828220858896e-06, + "objective/entropy": 85.90758514404297, + "objective/kl": 1.0830625295639038, + "objective/non_score_reward": -0.05415312573313713, + "objective/rlhf_reward": -0.32491875626146793, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.186797142028809, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3759765625, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986255168914795 + }, + { + "episode": 5424, + "epoch": 0.010832704221639046, + "loss/policy_avg": 0.1373138725757599, + "lr": 2.935295245398773e-06, + "objective/entropy": 82.23826599121094, + "objective/kl": 2.4237821102142334, + "objective/non_score_reward": -0.12118911743164062, + "objective/rlhf_reward": 1.5939821276415351, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.9385035037994385, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3564453125, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998719692230225 + }, + { + "episode": 5448, + "epoch": 0.01088063654120382, + "loss/policy_avg": 0.005655727814882994, + "lr": 2.9350076687116565e-06, + "objective/entropy": 92.49668884277344, + "objective/kl": 3.041985511779785, + "objective/non_score_reward": -0.15209928154945374, + "objective/rlhf_reward": 1.0874043926596642, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.270963668823242, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4423828125, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977118968963623 + }, + { + "episode": 5472, + "epoch": 0.010928568860768595, + "loss/policy_avg": -0.001242516329512, + "lr": 2.93472009202454e-06, + "objective/entropy": 85.49160766601562, + "objective/kl": 3.974921941757202, + "objective/non_score_reward": -0.19874611496925354, + "objective/rlhf_reward": 2.5931018912178687, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.586650848388672, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4287109375, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998878240585327 + }, + { + "episode": 5496, + "epoch": 0.01097650118033337, + "loss/policy_avg": 0.056750498712062836, + "lr": 2.9344325153374233e-06, + "objective/entropy": 98.63762664794922, + "objective/kl": 5.498817443847656, + "objective/non_score_reward": -0.2749408781528473, + "objective/rlhf_reward": -1.6496451944112778, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.921656847000122, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4130859375, + "step": 228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001258134841919 + }, + { + "episode": 5520, + "epoch": 0.011024433499898144, + "loss/policy_avg": -0.019674280658364296, + "lr": 2.934144938650307e-06, + "objective/entropy": 79.04399871826172, + "objective/kl": 3.4161670207977295, + "objective/non_score_reward": -0.1708083599805832, + "objective/rlhf_reward": -1.0248500891029835, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.534562110900879, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.408203125, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021302700042725 + }, + { + "episode": 5544, + "epoch": 0.011072365819462918, + "loss/policy_avg": 0.020816290751099586, + "lr": 2.93385736196319e-06, + "objective/entropy": 77.131591796875, + "objective/kl": 2.6513423919677734, + "objective/non_score_reward": -0.1325671374797821, + "objective/rlhf_reward": 1.0973864656380024, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.8550195693969727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4072265625, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0004072189331055 + }, + { + "episode": 5568, + "epoch": 0.011120298139027693, + "loss/policy_avg": 0.012486208230257034, + "lr": 2.9335697852760736e-06, + "objective/entropy": 118.64522552490234, + "objective/kl": 6.101411819458008, + "objective/non_score_reward": -0.3050706088542938, + "objective/rlhf_reward": 0.1695764362812041, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 6.955193519592285, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.53515625, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976553916931152 + }, + { + "episode": 5592, + "epoch": 0.011168230458592467, + "loss/policy_avg": 0.12744972109794617, + "lr": 2.933282208588957e-06, + "objective/entropy": 101.43260192871094, + "objective/kl": 5.118263244628906, + "objective/non_score_reward": -0.2559131979942322, + "objective/rlhf_reward": -1.5354789718985558, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7828927040100098, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.474609375, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0029213428497314 + }, + { + "episode": 5616, + "epoch": 0.011216162778157242, + "loss/policy_avg": 0.037266749888658524, + "lr": 2.9329946319018405e-06, + "objective/entropy": 81.0868148803711, + "objective/kl": 3.3699440956115723, + "objective/non_score_reward": -0.1684972047805786, + "objective/rlhf_reward": 0.7234057832405649, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.356492042541504, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3955078125, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997218132019043 + }, + { + "episode": 5640, + "epoch": 0.011264095097722016, + "loss/policy_avg": 0.08187304437160492, + "lr": 2.932707055214724e-06, + "objective/entropy": 108.38780212402344, + "objective/kl": 2.629844903945923, + "objective/non_score_reward": -0.1314922422170639, + "objective/rlhf_reward": 1.2110465709120035, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.097379446029663, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4990234375, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007314682006836 + }, + { + "episode": 5664, + "epoch": 0.01131202741728679, + "loss/policy_avg": 0.10844515264034271, + "lr": 2.9324194785276073e-06, + "objective/entropy": 119.04045104980469, + "objective/kl": 2.6839022636413574, + "objective/non_score_reward": -0.13419508934020996, + "objective/rlhf_reward": -0.8051705546677113, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.353799819946289, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.501953125, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981358051300049 + }, + { + "episode": 5688, + "epoch": 0.011359959736851565, + "loss/policy_avg": 0.04195040464401245, + "lr": 2.9321319018404907e-06, + "objective/entropy": 87.08504486083984, + "objective/kl": 3.4860987663269043, + "objective/non_score_reward": -0.17430493235588074, + "objective/rlhf_reward": -1.045829564332962, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9313338994979858, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.400390625, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003347396850586 + }, + { + "episode": 5712, + "epoch": 0.01140789205641634, + "loss/policy_avg": 0.01498046051710844, + "lr": 2.931844325153374e-06, + "objective/entropy": 125.07292175292969, + "objective/kl": 3.1752548217773438, + "objective/non_score_reward": -0.15876273810863495, + "objective/rlhf_reward": -0.9525764416903257, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9016900062561035, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988805055618286 + }, + { + "episode": 5736, + "epoch": 0.011455824375981114, + "loss/policy_avg": 0.02493833564221859, + "lr": 2.9315567484662576e-06, + "objective/entropy": 101.29133605957031, + "objective/kl": 2.8423898220062256, + "objective/non_score_reward": -0.14211949706077576, + "objective/rlhf_reward": 1.7313423660757037, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.1906590461730957, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.58203125, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998305082321167 + }, + { + "episode": 5760, + "epoch": 0.011503756695545889, + "loss/policy_avg": 0.10181448608636856, + "lr": 2.9312691717791414e-06, + "objective/entropy": 97.26934814453125, + "objective/kl": 2.780778646469116, + "objective/non_score_reward": -0.13903895020484924, + "objective/rlhf_reward": -0.8342336155474186, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.297544002532959, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.427734375, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022928714752197 + }, + { + "episode": 5784, + "epoch": 0.011551689015110663, + "loss/policy_avg": -0.008298151195049286, + "lr": 2.930981595092025e-06, + "objective/entropy": 113.0555419921875, + "objective/kl": 3.998429775238037, + "objective/non_score_reward": -0.19992151856422424, + "objective/rlhf_reward": 1.8004709482192993, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.073122978210449, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4833984375, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985439777374268 + }, + { + "episode": 5808, + "epoch": 0.011599621334675438, + "loss/policy_avg": 0.02471723034977913, + "lr": 2.9306940184049083e-06, + "objective/entropy": 104.7915267944336, + "objective/kl": 1.6734800338745117, + "objective/non_score_reward": -0.08367400616407394, + "objective/rlhf_reward": -0.5020440332591534, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.923748016357422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995842456817627 + }, + { + "episode": 5832, + "epoch": 0.011647553654240212, + "loss/policy_avg": 0.014984328299760818, + "lr": 2.9304064417177917e-06, + "objective/entropy": 98.41435241699219, + "objective/kl": 2.7530505657196045, + "objective/non_score_reward": -0.13765253126621246, + "objective/rlhf_reward": -0.825915178284049, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.041522026062012, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4208984375, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003490447998047 + }, + { + "episode": 5856, + "epoch": 0.011695485973804987, + "loss/policy_avg": 0.05915838107466698, + "lr": 2.930118865030675e-06, + "objective/entropy": 85.16780090332031, + "objective/kl": 2.717658758163452, + "objective/non_score_reward": -0.1358829289674759, + "objective/rlhf_reward": 2.184702467173338, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.279115676879883, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4111328125, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966355562210083 + }, + { + "episode": 5880, + "epoch": 0.011743418293369761, + "loss/policy_avg": -0.012033217586576939, + "lr": 2.9298312883435586e-06, + "objective/entropy": 82.0631103515625, + "objective/kl": 3.747727870941162, + "objective/non_score_reward": -0.1873863786458969, + "objective/rlhf_reward": -1.1243183091282845, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.415389895439148, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3583984375, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022215843200684 + }, + { + "episode": 5904, + "epoch": 0.011791350612934537, + "loss/policy_avg": 0.019205952063202858, + "lr": 2.9295437116564416e-06, + "objective/entropy": 76.45556640625, + "objective/kl": 0.7838114500045776, + "objective/non_score_reward": -0.03919057548046112, + "objective/rlhf_reward": -0.23514343798160553, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4736899137496948, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3564453125, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003293991088867 + }, + { + "episode": 5928, + "epoch": 0.011839282932499312, + "loss/policy_avg": 0.007923373021185398, + "lr": 2.929256134969325e-06, + "objective/entropy": 93.68626403808594, + "objective/kl": 2.3337550163269043, + "objective/non_score_reward": -0.11668774485588074, + "objective/rlhf_reward": -0.7001264840364456, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.434812068939209, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43359375, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965038299560547 + }, + { + "episode": 5952, + "epoch": 0.011887215252064086, + "loss/policy_avg": 0.06653881818056107, + "lr": 2.9289685582822084e-06, + "objective/entropy": 76.97587585449219, + "objective/kl": 3.655611753463745, + "objective/non_score_reward": -0.18278059363365173, + "objective/rlhf_reward": -1.0966835021972656, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8228418827056885, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3681640625, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00066876411438 + }, + { + "episode": 5976, + "epoch": 0.01193514757162886, + "loss/policy_avg": 0.11899162828922272, + "lr": 2.928680981595092e-06, + "objective/entropy": 110.79756164550781, + "objective/kl": 3.855869770050049, + "objective/non_score_reward": -0.19279348850250244, + "objective/rlhf_reward": 0.84323912858963, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.1643248796463013, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.494140625, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0040478706359863 + }, + { + "episode": 6000, + "epoch": 0.011983079891193635, + "loss/policy_avg": -0.02898324280977249, + "lr": 2.9283934049079753e-06, + "objective/entropy": 83.2615966796875, + "objective/kl": 1.5065553188323975, + "objective/non_score_reward": -0.07532777637243271, + "objective/rlhf_reward": 1.8691502187002662, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.771254777908325, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.361328125, + "step": 249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000098705291748 + }, + { + "episode": 6024, + "epoch": 0.01203101221075841, + "loss/policy_avg": -0.023785054683685303, + "lr": 2.928105828220859e-06, + "objective/entropy": 108.30790710449219, + "objective/kl": 1.027402639389038, + "objective/non_score_reward": -0.05137012526392937, + "objective/rlhf_reward": -0.3082207590341568, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.8280510902404785, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.48046875, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001099109649658 + }, + { + "episode": 6048, + "epoch": 0.012078944530323184, + "loss/policy_avg": 0.06121671944856644, + "lr": 2.9278182515337425e-06, + "objective/entropy": 104.1197509765625, + "objective/kl": 3.1020126342773438, + "objective/non_score_reward": -0.15510064363479614, + "objective/rlhf_reward": 2.069396197795868, + "objective/scores": 0.5, + "policy/approxkl_avg": 6.174240589141846, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.478515625, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970431327819824 + }, + { + "episode": 6072, + "epoch": 0.012126876849887959, + "loss/policy_avg": 0.023399360477924347, + "lr": 2.927530674846626e-06, + "objective/entropy": 86.40084075927734, + "objective/kl": 4.951199531555176, + "objective/non_score_reward": -0.24755996465682983, + "objective/rlhf_reward": 4.514640212059021, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.389209508895874, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.455078125, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994969367980957 + }, + { + "episode": 6096, + "epoch": 0.012174809169452733, + "loss/policy_avg": 0.012841073796153069, + "lr": 2.9272430981595094e-06, + "objective/entropy": 133.25868225097656, + "objective/kl": 2.0328173637390137, + "objective/non_score_reward": -0.10164086520671844, + "objective/rlhf_reward": 3.17573340096895, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.0844686031341553, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.580078125, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999415636062622 + }, + { + "episode": 6120, + "epoch": 0.012222741489017508, + "loss/policy_avg": 0.11766739934682846, + "lr": 2.926955521472393e-06, + "objective/entropy": 81.72367858886719, + "objective/kl": 3.480441093444824, + "objective/non_score_reward": -0.17402204871177673, + "objective/rlhf_reward": -1.044132323935628, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8451051712036133, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.365234375, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002162218093872 + }, + { + "episode": 6144, + "epoch": 0.012270673808582282, + "loss/policy_avg": 0.018339507281780243, + "lr": 2.9266679447852762e-06, + "objective/entropy": 105.31643676757812, + "objective/kl": 3.7081727981567383, + "objective/non_score_reward": -0.18540863692760468, + "objective/rlhf_reward": -1.112451858818531, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8820916414260864, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.474609375, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002873659133911 + }, + { + "episode": 6168, + "epoch": 0.012318606128147057, + "loss/policy_avg": 0.08617917448282242, + "lr": 2.9263803680981597e-06, + "objective/entropy": 89.80501556396484, + "objective/kl": 2.6696174144744873, + "objective/non_score_reward": -0.13348087668418884, + "objective/rlhf_reward": -0.8008852154016495, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6820005178451538, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.431640625, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030927658081055 + }, + { + "episode": 6192, + "epoch": 0.012366538447711831, + "loss/policy_avg": -0.011819163337349892, + "lr": 2.926092791411043e-06, + "objective/entropy": 87.37083435058594, + "objective/kl": 4.450558662414551, + "objective/non_score_reward": -0.22252792119979858, + "objective/rlhf_reward": 0.6648325026035308, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.1071999073028564, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.404296875, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999183177947998 + }, + { + "episode": 6216, + "epoch": 0.012414470767276605, + "loss/policy_avg": 0.05334334820508957, + "lr": 2.9258052147239265e-06, + "objective/entropy": 124.79606628417969, + "objective/kl": 0.8793936967849731, + "objective/non_score_reward": -0.043969687074422836, + "objective/rlhf_reward": -0.26381811685860157, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3504815101623535, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.55859375, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000791072845459 + }, + { + "episode": 6240, + "epoch": 0.01246240308684138, + "loss/policy_avg": 0.028328947722911835, + "lr": 2.92551763803681e-06, + "objective/entropy": 90.93742370605469, + "objective/kl": 4.4687819480896, + "objective/non_score_reward": -0.22343909740447998, + "objective/rlhf_reward": 1.6593655049800873, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.166851997375488, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.435546875, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977554082870483 + }, + { + "episode": 6264, + "epoch": 0.012510335406406154, + "loss/policy_avg": 0.045545145869255066, + "lr": 2.9252300613496934e-06, + "objective/entropy": 90.69743347167969, + "objective/kl": 1.4929875135421753, + "objective/non_score_reward": -0.074649378657341, + "objective/rlhf_reward": 2.5521037578582764, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.441746473312378, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4208984375, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009889602661133 + }, + { + "episode": 6288, + "epoch": 0.012558267725970929, + "loss/policy_avg": -0.009454969316720963, + "lr": 2.924942484662577e-06, + "objective/entropy": 75.949951171875, + "objective/kl": 2.166445016860962, + "objective/non_score_reward": -0.10832224786281586, + "objective/rlhf_reward": 2.3500665444880724, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.1431922912597656, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.361328125, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022785663604736 + }, + { + "episode": 6312, + "epoch": 0.012606200045535703, + "loss/policy_avg": 0.030427278950810432, + "lr": 2.9246549079754602e-06, + "objective/entropy": 94.03545379638672, + "objective/kl": 3.3064582347869873, + "objective/non_score_reward": -0.1653229147195816, + "objective/rlhf_reward": 0.900851772396883, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.270882606506348, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.41015625, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997680425643921 + }, + { + "episode": 6336, + "epoch": 0.012654132365100478, + "loss/policy_avg": 0.020906079560518265, + "lr": 2.9243673312883437e-06, + "objective/entropy": 88.37602996826172, + "objective/kl": 1.9585927724838257, + "objective/non_score_reward": -0.09792964160442352, + "objective/rlhf_reward": 1.996481483912656, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.340959072113037, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.375, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011086463928223 + }, + { + "episode": 6360, + "epoch": 0.012702064684665252, + "loss/policy_avg": 0.1495688408613205, + "lr": 2.924079754601227e-06, + "objective/entropy": 90.33613586425781, + "objective/kl": 2.2338414192199707, + "objective/non_score_reward": -0.11169207096099854, + "objective/rlhf_reward": -0.6701523959636688, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4653091430664062, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3857421875, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0059993267059326 + }, + { + "episode": 6384, + "epoch": 0.012749997004230027, + "loss/policy_avg": 0.057970039546489716, + "lr": 2.9237921779141105e-06, + "objective/entropy": 96.65043640136719, + "objective/kl": 3.9964311122894287, + "objective/non_score_reward": -0.1998215615749359, + "objective/rlhf_reward": -1.1989293303340673, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.647674560546875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993882179260254 + }, + { + "episode": 6408, + "epoch": 0.012797929323794801, + "loss/policy_avg": -0.05694024637341499, + "lr": 2.923504601226994e-06, + "objective/entropy": 88.165283203125, + "objective/kl": 0.8564585447311401, + "objective/non_score_reward": -0.04282292723655701, + "objective/rlhf_reward": -0.25693754479289055, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.079523801803589, + "policy/clipfrac_avg": 2.0, + "policy/entropy_avg": 0.4052734375, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000948429107666 + }, + { + "episode": 6432, + "epoch": 0.012845861643359576, + "loss/policy_avg": -0.015139162540435791, + "lr": 2.9232170245398774e-06, + "objective/entropy": 94.7279281616211, + "objective/kl": 0.5732266902923584, + "objective/non_score_reward": -0.02866133116185665, + "objective/rlhf_reward": -0.1719679832458496, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7629446983337402, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.423828125, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004628896713257 + }, + { + "episode": 6456, + "epoch": 0.01289379396292435, + "loss/policy_avg": -0.054962895810604095, + "lr": 2.9229294478527608e-06, + "objective/entropy": 86.30790710449219, + "objective/kl": 3.2080466747283936, + "objective/non_score_reward": -0.1604023277759552, + "objective/rlhf_reward": 1.358702943806744, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.9537370204925537, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.3828125, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000812530517578 + }, + { + "episode": 6480, + "epoch": 0.012941726282489125, + "loss/policy_avg": 0.023103434592485428, + "lr": 2.922641871165644e-06, + "objective/entropy": 86.23143768310547, + "objective/kl": 5.41572904586792, + "objective/non_score_reward": -0.27078646421432495, + "objective/rlhf_reward": 0.10967026202834473, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.276633262634277, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3798828125, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984586238861084 + }, + { + "episode": 6504, + "epoch": 0.0129896586020539, + "loss/policy_avg": -0.011162176728248596, + "lr": 2.9223542944785276e-06, + "objective/entropy": 90.3081283569336, + "objective/kl": 3.932896375656128, + "objective/non_score_reward": -0.1966448277235031, + "objective/rlhf_reward": -1.1798689179122448, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.119757175445557, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.423828125, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967050552368164 + }, + { + "episode": 6528, + "epoch": 0.013037590921618674, + "loss/policy_avg": 0.04585479944944382, + "lr": 2.922066717791411e-06, + "objective/entropy": 99.12153625488281, + "objective/kl": 2.7620973587036133, + "objective/non_score_reward": -0.1381048858165741, + "objective/rlhf_reward": 5.171370702795684, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.762786626815796, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.494140625, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968159198760986 + }, + { + "episode": 6552, + "epoch": 0.013085523241183448, + "loss/policy_avg": -0.016109932214021683, + "lr": 2.9217791411042945e-06, + "objective/entropy": 101.37968444824219, + "objective/kl": 3.8695180416107178, + "objective/non_score_reward": -0.1934759020805359, + "objective/rlhf_reward": 2.6247231201214007, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.8788094520568848, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4453125, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010104179382324 + }, + { + "episode": 6576, + "epoch": 0.013133455560748224, + "loss/policy_avg": 0.006009046919643879, + "lr": 2.9214915644171783e-06, + "objective/entropy": 84.63890075683594, + "objective/kl": 3.135849714279175, + "objective/non_score_reward": -0.15679249167442322, + "objective/rlhf_reward": -0.9407549053430557, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3323421478271484, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.353515625, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993886947631836 + }, + { + "episode": 6600, + "epoch": 0.013181387880312999, + "loss/policy_avg": 0.03798563778400421, + "lr": 2.9212039877300618e-06, + "objective/entropy": 93.93736267089844, + "objective/kl": 5.964257717132568, + "objective/non_score_reward": -0.29821285605430603, + "objective/rlhf_reward": -1.7892771661281586, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.465618133544922, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4404296875, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003280639648438 + }, + { + "episode": 6624, + "epoch": 0.013229320199877773, + "loss/policy_avg": 0.10159832239151001, + "lr": 2.9209164110429448e-06, + "objective/entropy": 96.040283203125, + "objective/kl": 1.8347787857055664, + "objective/non_score_reward": -0.09173893183469772, + "objective/rlhf_reward": -0.5504335854202509, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.418647050857544, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.42578125, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000245571136475 + }, + { + "episode": 6648, + "epoch": 0.013277252519442548, + "loss/policy_avg": 0.021702734753489494, + "lr": 2.920628834355828e-06, + "objective/entropy": 90.51353454589844, + "objective/kl": 6.418416976928711, + "objective/non_score_reward": -0.3209208846092224, + "objective/rlhf_reward": 0.3955915953565601, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.013636112213135, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4013671875, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997532606124878 + }, + { + "episode": 6672, + "epoch": 0.013325184839007322, + "loss/policy_avg": 0.14618083834648132, + "lr": 2.9203412576687116e-06, + "objective/entropy": 145.59263610839844, + "objective/kl": 4.6974616050720215, + "objective/non_score_reward": -0.23487308621406555, + "objective/rlhf_reward": 0.39694150140297746, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.452897548675537, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023467540740967 + }, + { + "episode": 6696, + "epoch": 0.013373117158572097, + "loss/policy_avg": -0.008299178443849087, + "lr": 2.920053680981595e-06, + "objective/entropy": 87.92326354980469, + "objective/kl": 5.179488182067871, + "objective/non_score_reward": -0.2589744031429291, + "objective/rlhf_reward": 0.446153588593006, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.663969039916992, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999903440475464 + }, + { + "episode": 6720, + "epoch": 0.013421049478136871, + "loss/policy_avg": -0.008848732337355614, + "lr": 2.9197661042944785e-06, + "objective/entropy": 81.57136535644531, + "objective/kl": 6.620213985443115, + "objective/non_score_reward": -0.33101069927215576, + "objective/rlhf_reward": -1.9860640615224838, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.5465898513793945, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3583984375, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972364902496338 + }, + { + "episode": 6744, + "epoch": 0.013468981797701646, + "loss/policy_avg": -0.017478052526712418, + "lr": 2.919478527607362e-06, + "objective/entropy": 100.71482849121094, + "objective/kl": 2.6994457244873047, + "objective/non_score_reward": -0.13497227430343628, + "objective/rlhf_reward": -0.8098336905241013, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4589900970458984, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4541015625, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000110626220703 + }, + { + "episode": 6768, + "epoch": 0.01351691411726642, + "loss/policy_avg": 0.06539896130561829, + "lr": 2.9191909509202453e-06, + "objective/entropy": 93.04888916015625, + "objective/kl": 2.8248977661132812, + "objective/non_score_reward": -0.14124490320682526, + "objective/rlhf_reward": -0.8474694117903709, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.4623918533325195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3994140625, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99784255027771 + }, + { + "episode": 6792, + "epoch": 0.013564846436831195, + "loss/policy_avg": -0.021046733483672142, + "lr": 2.9189033742331287e-06, + "objective/entropy": 92.93148040771484, + "objective/kl": 2.560904026031494, + "objective/non_score_reward": -0.1280452013015747, + "objective/rlhf_reward": 1.2317288517951965, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.754068374633789, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4033203125, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997408390045166 + }, + { + "episode": 6816, + "epoch": 0.01361277875639597, + "loss/policy_avg": -0.001279283780604601, + "lr": 2.918615797546012e-06, + "objective/entropy": 106.5361328125, + "objective/kl": 3.6714725494384766, + "objective/non_score_reward": -0.18357360363006592, + "objective/rlhf_reward": -1.1014416888356209, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.245214462280273, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4775390625, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018224716186523 + }, + { + "episode": 6840, + "epoch": 0.013660711075960744, + "loss/policy_avg": 0.0217137411236763, + "lr": 2.918328220858896e-06, + "objective/entropy": 108.07386779785156, + "objective/kl": 3.123398542404175, + "objective/non_score_reward": -0.15616995096206665, + "objective/rlhf_reward": -0.9370196424424648, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.864530086517334, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4697265625, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008575916290283 + }, + { + "episode": 6864, + "epoch": 0.013708643395525518, + "loss/policy_avg": 0.03368239104747772, + "lr": 2.9180406441717794e-06, + "objective/entropy": 97.82600402832031, + "objective/kl": 3.7092957496643066, + "objective/non_score_reward": -0.1854647845029831, + "objective/rlhf_reward": -1.1127887815237045, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.468292236328125, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4150390625, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9941192865371704 + }, + { + "episode": 6888, + "epoch": 0.013756575715090293, + "loss/policy_avg": 0.09430566430091858, + "lr": 2.917753067484663e-06, + "objective/entropy": 126.55015563964844, + "objective/kl": 3.5415759086608887, + "objective/non_score_reward": -0.17707878351211548, + "objective/rlhf_reward": 0.7437072505594525, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.0083212852478027, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.580078125, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998415231704712 + }, + { + "episode": 6912, + "epoch": 0.013804508034655067, + "loss/policy_avg": -0.033856041729450226, + "lr": 2.9174654907975463e-06, + "objective/entropy": 102.55089569091797, + "objective/kl": 2.207653522491455, + "objective/non_score_reward": -0.11038267612457275, + "objective/rlhf_reward": 1.1438839246870312, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.0778777599334717, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4345703125, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024948120117188 + }, + { + "episode": 6936, + "epoch": 0.013852440354219842, + "loss/policy_avg": 0.06326739490032196, + "lr": 2.9171779141104297e-06, + "objective/entropy": 98.32320404052734, + "objective/kl": 3.676898956298828, + "objective/non_score_reward": -0.18384495377540588, + "objective/rlhf_reward": 1.896930292248726, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.9141819477081299, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.427734375, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019731521606445 + }, + { + "episode": 6960, + "epoch": 0.013900372673784616, + "loss/policy_avg": 0.010744905099272728, + "lr": 2.9168903374233127e-06, + "objective/entropy": 133.8339080810547, + "objective/kl": 2.4336583614349365, + "objective/non_score_reward": -0.12168292701244354, + "objective/rlhf_reward": 1.1626917358926143, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.8940364122390747, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.56640625, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009493827819824 + }, + { + "episode": 6984, + "epoch": 0.01394830499334939, + "loss/policy_avg": 0.021932030096650124, + "lr": 2.916602760736196e-06, + "objective/entropy": 138.59405517578125, + "objective/kl": 3.122079849243164, + "objective/non_score_reward": -0.15610399842262268, + "objective/rlhf_reward": 0.9561652850797977, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.614981770515442, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.61328125, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000474214553833 + }, + { + "episode": 7008, + "epoch": 0.013996237312914165, + "loss/policy_avg": 0.013710329309105873, + "lr": 2.9163151840490796e-06, + "objective/entropy": 125.12199401855469, + "objective/kl": 4.6168437004089355, + "objective/non_score_reward": -0.2308422178030014, + "objective/rlhf_reward": -1.385053239762783, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.361660003662109, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55859375, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000274658203125 + }, + { + "episode": 7032, + "epoch": 0.01404416963247894, + "loss/policy_avg": 0.07498032599687576, + "lr": 2.916027607361963e-06, + "objective/entropy": 90.966064453125, + "objective/kl": 5.488894939422607, + "objective/non_score_reward": -0.2744447588920593, + "objective/rlhf_reward": -1.6466684341430664, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.339931011199951, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4169921875, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999441385269165 + }, + { + "episode": 7056, + "epoch": 0.014092101952043714, + "loss/policy_avg": 0.01669841818511486, + "lr": 2.9157400306748464e-06, + "objective/entropy": 136.5240020751953, + "objective/kl": 0.5940343141555786, + "objective/non_score_reward": -0.02970171719789505, + "objective/rlhf_reward": -0.1782103143632412, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.136094093322754, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.61328125, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014166831970215 + }, + { + "episode": 7080, + "epoch": 0.014140034271608488, + "loss/policy_avg": -0.003980956040322781, + "lr": 2.9154524539877303e-06, + "objective/entropy": 78.958251953125, + "objective/kl": 3.3246102333068848, + "objective/non_score_reward": -0.16623049974441528, + "objective/rlhf_reward": 2.0026169791817665, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.685295343399048, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.412109375, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0039148330688477 + }, + { + "episode": 7104, + "epoch": 0.014187966591173263, + "loss/policy_avg": 0.013244928792119026, + "lr": 2.9151648773006137e-06, + "objective/entropy": 122.66305541992188, + "objective/kl": 3.111565113067627, + "objective/non_score_reward": -0.15557828545570374, + "objective/rlhf_reward": 1.0665303841233253, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.7815370559692383, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.541015625, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002041816711426 + }, + { + "episode": 7128, + "epoch": 0.014235898910738037, + "loss/policy_avg": 0.10367128252983093, + "lr": 2.914877300613497e-06, + "objective/entropy": 85.17204284667969, + "objective/kl": 2.7471847534179688, + "objective/non_score_reward": -0.13735926151275635, + "objective/rlhf_reward": 1.1758444905281065, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.3763587474823, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.43359375, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984318017959595 + }, + { + "episode": 7152, + "epoch": 0.014283831230302812, + "loss/policy_avg": 0.07810795307159424, + "lr": 2.9145897239263806e-06, + "objective/entropy": 106.44185638427734, + "objective/kl": 6.20510196685791, + "objective/non_score_reward": -0.3102550804615021, + "objective/rlhf_reward": 0.0312588673523273, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.8959155082702637, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4833984375, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001826286315918 + }, + { + "episode": 7176, + "epoch": 0.014331763549867586, + "loss/policy_avg": 0.05284015089273453, + "lr": 2.914302147239264e-06, + "objective/entropy": 97.61809539794922, + "objective/kl": 3.4935054779052734, + "objective/non_score_reward": -0.17467527091503143, + "objective/rlhf_reward": -1.0480516254901886, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.512946605682373, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.416015625, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0034971237182617 + }, + { + "episode": 7200, + "epoch": 0.01437969586943236, + "loss/policy_avg": 0.07915400713682175, + "lr": 2.9140145705521474e-06, + "objective/entropy": 105.74505615234375, + "objective/kl": 4.009130477905273, + "objective/non_score_reward": -0.20045652985572815, + "objective/rlhf_reward": -1.2027391493320465, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.725015640258789, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4677734375, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972293376922607 + }, + { + "episode": 7224, + "epoch": 0.014427628188997135, + "loss/policy_avg": 0.3397621810436249, + "lr": 2.913726993865031e-06, + "objective/entropy": 88.99009704589844, + "objective/kl": 3.6943633556365967, + "objective/non_score_reward": -0.18471817672252655, + "objective/rlhf_reward": 0.8916909620165824, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.0104448795318604, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.451171875, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0041422843933105 + }, + { + "episode": 7248, + "epoch": 0.01447556050856191, + "loss/policy_avg": 0.29519858956336975, + "lr": 2.9134394171779143e-06, + "objective/entropy": 132.9694366455078, + "objective/kl": 8.177940368652344, + "objective/non_score_reward": -0.4088970124721527, + "objective/rlhf_reward": -2.4533818028867245, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.353280544281006, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.544921875, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004878044128418 + }, + { + "episode": 7272, + "epoch": 0.014523492828126686, + "loss/policy_avg": 0.03422471880912781, + "lr": 2.9131518404907977e-06, + "objective/entropy": 108.39591979980469, + "objective/kl": 4.001317977905273, + "objective/non_score_reward": -0.20006594061851501, + "objective/rlhf_reward": -1.2003955319523811, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.640683174133301, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46875, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999559998512268 + }, + { + "episode": 7296, + "epoch": 0.01457142514769146, + "loss/policy_avg": 0.11649920046329498, + "lr": 2.912864263803681e-06, + "objective/entropy": 87.92279052734375, + "objective/kl": 6.11470365524292, + "objective/non_score_reward": -0.3057352304458618, + "objective/rlhf_reward": -1.8344111815094948, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.98049259185791, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4150390625, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998186469078064 + }, + { + "episode": 7320, + "epoch": 0.014619357467256235, + "loss/policy_avg": 0.049859173595905304, + "lr": 2.912576687116564e-06, + "objective/entropy": 51.48081588745117, + "objective/kl": 5.328309059143066, + "objective/non_score_reward": -0.26641544699668884, + "objective/rlhf_reward": 2.187085899053257, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.5725510120391846, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.2998046875, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993510246276855 + }, + { + "episode": 7344, + "epoch": 0.01466728978682101, + "loss/policy_avg": 0.03393293172121048, + "lr": 2.912289110429448e-06, + "objective/entropy": 130.74639892578125, + "objective/kl": 4.686945915222168, + "objective/non_score_reward": -0.23434728384017944, + "objective/rlhf_reward": 1.5939163267612457, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.41365122795105, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.62890625, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997294545173645 + }, + { + "episode": 7368, + "epoch": 0.014715222106385784, + "loss/policy_avg": 0.06443352997303009, + "lr": 2.9120015337423314e-06, + "objective/entropy": 143.297119140625, + "objective/kl": 3.5830447673797607, + "objective/non_score_reward": -0.179152250289917, + "objective/rlhf_reward": 2.710665059736114, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.428189754486084, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.615234375, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0032620429992676 + }, + { + "episode": 7392, + "epoch": 0.014763154425950558, + "loss/policy_avg": 0.05828144773840904, + "lr": 2.911713957055215e-06, + "objective/entropy": 96.24331665039062, + "objective/kl": 4.57220458984375, + "objective/non_score_reward": -0.22861024737358093, + "objective/rlhf_reward": 0.3627274513140999, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.018941879272461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003509521484375 + }, + { + "episode": 7416, + "epoch": 0.014811086745515333, + "loss/policy_avg": 0.07702429592609406, + "lr": 2.9114263803680982e-06, + "objective/entropy": 117.18355560302734, + "objective/kl": 3.9329171180725098, + "objective/non_score_reward": -0.1966458559036255, + "objective/rlhf_reward": 2.605703460512798, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.7093324661254883, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.583984375, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978505373001099 + }, + { + "episode": 7440, + "epoch": 0.014859019065080107, + "loss/policy_avg": 0.01833891123533249, + "lr": 2.9111388036809817e-06, + "objective/entropy": 75.61592864990234, + "objective/kl": 4.789632797241211, + "objective/non_score_reward": -0.2394816279411316, + "objective/rlhf_reward": -1.4368897899985313, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4046483039855957, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3525390625, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996024370193481 + }, + { + "episode": 7464, + "epoch": 0.014906951384644882, + "loss/policy_avg": 0.1536983996629715, + "lr": 2.910851226993865e-06, + "objective/entropy": 93.67634582519531, + "objective/kl": 6.5604681968688965, + "objective/non_score_reward": -0.32802343368530273, + "objective/rlhf_reward": -1.9681406021118164, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.891597747802734, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3779296875, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984748363494873 + }, + { + "episode": 7488, + "epoch": 0.014954883704209656, + "loss/policy_avg": 0.011402620002627373, + "lr": 2.9105636503067485e-06, + "objective/entropy": 123.21163940429688, + "objective/kl": 3.4037883281707764, + "objective/non_score_reward": -0.17018942534923553, + "objective/rlhf_reward": 2.7644420661908797, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.6413912773132324, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.537109375, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000805616378784 + }, + { + "episode": 7512, + "epoch": 0.01500281602377443, + "loss/policy_avg": -0.017781831324100494, + "lr": 2.910276073619632e-06, + "objective/entropy": 89.88966369628906, + "objective/kl": 4.689935207366943, + "objective/non_score_reward": -0.23449677228927612, + "objective/rlhf_reward": 0.3991993942649398, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 12.010757446289062, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3759765625, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964230060577393 + }, + { + "episode": 7536, + "epoch": 0.015050748343339205, + "loss/policy_avg": 0.397849440574646, + "lr": 2.9099884969325154e-06, + "objective/entropy": 133.92901611328125, + "objective/kl": 2.635261058807373, + "objective/non_score_reward": -0.13176307082176208, + "objective/rlhf_reward": -0.7905783653259277, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.357858657836914, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.587890625, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0038259029388428 + }, + { + "episode": 7560, + "epoch": 0.01509868066290398, + "loss/policy_avg": 0.009789666160941124, + "lr": 2.909700920245399e-06, + "objective/entropy": 85.20477294921875, + "objective/kl": 3.2531774044036865, + "objective/non_score_reward": -0.16265887022018433, + "objective/rlhf_reward": 2.024046815931797, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.6707112789154053, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4072265625, + "step": 314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000591993331909 + }, + { + "episode": 7584, + "epoch": 0.015146612982468754, + "loss/policy_avg": 0.07965235412120819, + "lr": 2.9094133435582822e-06, + "objective/entropy": 130.97622680664062, + "objective/kl": 1.001415491104126, + "objective/non_score_reward": -0.050070762634277344, + "objective/rlhf_reward": -0.30042457580566406, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1857898235321045, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.580078125, + "step": 315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004425048828125 + }, + { + "episode": 7608, + "epoch": 0.015194545302033529, + "loss/policy_avg": 0.13537311553955078, + "lr": 2.9091257668711657e-06, + "objective/entropy": 105.13624572753906, + "objective/kl": 3.2484593391418457, + "objective/non_score_reward": -0.16242295503616333, + "objective/rlhf_reward": -0.9745377786457539, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.052133560180664, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.525390625, + "step": 316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991886615753174 + }, + { + "episode": 7632, + "epoch": 0.015242477621598303, + "loss/policy_avg": 0.011165416799485683, + "lr": 2.908838190184049e-06, + "objective/entropy": 91.5357666015625, + "objective/kl": 4.68283748626709, + "objective/non_score_reward": -0.23414187133312225, + "objective/rlhf_reward": -1.4048511646687984, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.0650651454925537, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.423828125, + "step": 317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000990629196167 + }, + { + "episode": 7656, + "epoch": 0.015290409941163078, + "loss/policy_avg": -0.03712450712919235, + "lr": 2.908550613496933e-06, + "objective/entropy": 89.60869598388672, + "objective/kl": 4.538851737976074, + "objective/non_score_reward": -0.22694258391857147, + "objective/rlhf_reward": 0.9594613547969821, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.544269323348999, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4111328125, + "step": 318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021491050720215 + }, + { + "episode": 7680, + "epoch": 0.015338342260727852, + "loss/policy_avg": 0.010877646505832672, + "lr": 2.908263036809816e-06, + "objective/entropy": 132.98094177246094, + "objective/kl": 2.603564739227295, + "objective/non_score_reward": -0.13017825782299042, + "objective/rlhf_reward": -0.7810694873332977, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.394895553588867, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.564453125, + "step": 319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001263380050659 + }, + { + "episode": 7704, + "epoch": 0.015386274580292627, + "loss/policy_avg": 0.024066681042313576, + "lr": 2.9079754601226994e-06, + "objective/entropy": 94.44615173339844, + "objective/kl": 2.314455270767212, + "objective/non_score_reward": -0.11572276055812836, + "objective/rlhf_reward": 1.1118434255362781, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.7768396139144897, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.55078125, + "step": 320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00138521194458 + }, + { + "episode": 7728, + "epoch": 0.015434206899857401, + "loss/policy_avg": 0.05542634055018425, + "lr": 2.9076878834355828e-06, + "objective/entropy": 89.4962158203125, + "objective/kl": 4.772665977478027, + "objective/non_score_reward": -0.23863327503204346, + "objective/rlhf_reward": 4.568200355395675, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.764266014099121, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4140625, + "step": 321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988882541656494 + }, + { + "episode": 7752, + "epoch": 0.015482139219422176, + "loss/policy_avg": 0.05841389670968056, + "lr": 2.907400306748466e-06, + "objective/entropy": 109.31379699707031, + "objective/kl": 2.5222527980804443, + "objective/non_score_reward": -0.1261126548051834, + "objective/rlhf_reward": 1.1361133728614654, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.302779197692871, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.48828125, + "step": 322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997952938079834 + }, + { + "episode": 7776, + "epoch": 0.01553007153898695, + "loss/policy_avg": -0.011000058613717556, + "lr": 2.9071127300613496e-06, + "objective/entropy": 115.15431213378906, + "objective/kl": 3.8994197845458984, + "objective/non_score_reward": -0.1949710100889206, + "objective/rlhf_reward": 4.830174044705927, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.311718463897705, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.501953125, + "step": 323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003019332885742 + }, + { + "episode": 7800, + "epoch": 0.015578003858551724, + "loss/policy_avg": 0.09170163422822952, + "lr": 2.906825153374233e-06, + "objective/entropy": 106.21829223632812, + "objective/kl": 5.2838616371154785, + "objective/non_score_reward": -0.2641931176185608, + "objective/rlhf_reward": 0.4148414358496665, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 9.654085159301758, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.458984375, + "step": 324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978963136672974 + }, + { + "episode": 7824, + "epoch": 0.0156259361781165, + "loss/policy_avg": 0.06516622006893158, + "lr": 2.9065375766871165e-06, + "objective/entropy": 93.40861511230469, + "objective/kl": 3.863525152206421, + "objective/non_score_reward": -0.19317626953125, + "objective/rlhf_reward": 0.6471223567963872, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.339329719543457, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.408203125, + "step": 325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001438617706299 + }, + { + "episode": 7848, + "epoch": 0.015673868497681275, + "loss/policy_avg": 0.11551713943481445, + "lr": 2.90625e-06, + "objective/entropy": 87.46661376953125, + "objective/kl": 3.4951157569885254, + "objective/non_score_reward": -0.17475579679012299, + "objective/rlhf_reward": 2.7370437220615558, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.178346872329712, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.404296875, + "step": 326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000617504119873 + }, + { + "episode": 7872, + "epoch": 0.01572180081724605, + "loss/policy_avg": 0.019406702369451523, + "lr": 2.9059624233128833e-06, + "objective/entropy": 112.16633605957031, + "objective/kl": 4.762637138366699, + "objective/non_score_reward": -0.23813185095787048, + "objective/rlhf_reward": 4.571208879351616, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.068930149078369, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.470703125, + "step": 327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997810959815979 + }, + { + "episode": 7896, + "epoch": 0.015769733136810824, + "loss/policy_avg": 0.005042277742177248, + "lr": 2.905674846625767e-06, + "objective/entropy": 106.75770568847656, + "objective/kl": 3.7512598037719727, + "objective/non_score_reward": -0.1875630021095276, + "objective/rlhf_reward": -1.1253779828548431, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.429689407348633, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4921875, + "step": 328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002031326293945 + }, + { + "episode": 7920, + "epoch": 0.0158176654563756, + "loss/policy_avg": 0.14474651217460632, + "lr": 2.9053872699386506e-06, + "objective/entropy": 78.33731079101562, + "objective/kl": 4.1535139083862305, + "objective/non_score_reward": -0.20767569541931152, + "objective/rlhf_reward": 4.753945857286453, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.684666633605957, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.37109375, + "step": 329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985672235488892 + }, + { + "episode": 7944, + "epoch": 0.015865597775940373, + "loss/policy_avg": 0.088826984167099, + "lr": 2.905099693251534e-06, + "objective/entropy": 103.92185974121094, + "objective/kl": 6.392274379730225, + "objective/non_score_reward": -0.3196137547492981, + "objective/rlhf_reward": -1.9176823943853378, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.366364479064941, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4541015625, + "step": 330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966340065002441 + }, + { + "episode": 7968, + "epoch": 0.015913530095505148, + "loss/policy_avg": 0.06039275974035263, + "lr": 2.9048121165644175e-06, + "objective/entropy": 91.83940124511719, + "objective/kl": 6.214501857757568, + "objective/non_score_reward": -0.3107250928878784, + "objective/rlhf_reward": 0.4567662264753345, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.4539072513580322, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.41015625, + "step": 331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997373104095459 + }, + { + "episode": 7992, + "epoch": 0.015961462415069922, + "loss/policy_avg": 0.028454573825001717, + "lr": 2.904524539877301e-06, + "objective/entropy": 129.7343292236328, + "objective/kl": 3.495609998703003, + "objective/non_score_reward": -0.1747804880142212, + "objective/rlhf_reward": -1.0486829280853271, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.109973907470703, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.548828125, + "step": 332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993517398834229 + }, + { + "episode": 8016, + "epoch": 0.016009394734634697, + "loss/policy_avg": -0.0033001413103193045, + "lr": 2.9042369631901843e-06, + "objective/entropy": 96.37405395507812, + "objective/kl": 3.881673812866211, + "objective/non_score_reward": -0.19408369064331055, + "objective/rlhf_reward": 0.6416778748275075, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.4739038944244385, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.53125, + "step": 333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998296856880188 + }, + { + "episode": 8040, + "epoch": 0.01605732705419947, + "loss/policy_avg": 0.03063315525650978, + "lr": 2.9039493865030673e-06, + "objective/entropy": 112.6546630859375, + "objective/kl": 3.1909937858581543, + "objective/non_score_reward": -0.15954968333244324, + "objective/rlhf_reward": -0.9572981372475624, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6233439445495605, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4580078125, + "step": 334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989150762557983 + }, + { + "episode": 8064, + "epoch": 0.016105259373764245, + "loss/policy_avg": 0.05595889315009117, + "lr": 2.9036618098159507e-06, + "objective/entropy": 88.28289031982422, + "objective/kl": 1.5754408836364746, + "objective/non_score_reward": -0.07877204567193985, + "objective/rlhf_reward": 2.1114270818593, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.069863796234131, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.419921875, + "step": 335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002570390701294 + }, + { + "episode": 8088, + "epoch": 0.01615319169332902, + "loss/policy_avg": 0.039474762976169586, + "lr": 2.903374233128834e-06, + "objective/entropy": 69.96929931640625, + "objective/kl": 3.057511806488037, + "objective/non_score_reward": -0.15287558734416962, + "objective/rlhf_reward": 1.2199895967204704, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.785731315612793, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.37890625, + "step": 336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0022151470184326 + }, + { + "episode": 8112, + "epoch": 0.016201124012893794, + "loss/policy_avg": 0.021285872906446457, + "lr": 2.9030866564417176e-06, + "objective/entropy": 107.0462875366211, + "objective/kl": 2.9537525177001953, + "objective/non_score_reward": -0.1476876437664032, + "objective/rlhf_reward": 2.1138741821050644, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.5046746730804443, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4755859375, + "step": 337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999960660934448 + }, + { + "episode": 8136, + "epoch": 0.01624905633245857, + "loss/policy_avg": -0.012808440253138542, + "lr": 2.902799079754601e-06, + "objective/entropy": 86.64801025390625, + "objective/kl": 3.2933435440063477, + "objective/non_score_reward": -0.16466717422008514, + "objective/rlhf_reward": -0.9880030304193497, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.216010570526123, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4482421875, + "step": 338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988373517990112 + }, + { + "episode": 8160, + "epoch": 0.016296988652023343, + "loss/policy_avg": 0.03736317157745361, + "lr": 2.902511503067485e-06, + "objective/entropy": 79.99102020263672, + "objective/kl": 5.037384033203125, + "objective/non_score_reward": -0.25186920166015625, + "objective/rlhf_reward": -1.5112151503562927, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.631683349609375, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3447265625, + "step": 339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991588592529297 + }, + { + "episode": 8184, + "epoch": 0.016344920971588118, + "loss/policy_avg": 0.026965271681547165, + "lr": 2.9022239263803683e-06, + "objective/entropy": 142.47557067871094, + "objective/kl": 2.5932512283325195, + "objective/non_score_reward": -0.12966257333755493, + "objective/rlhf_reward": 1.543141425733662, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.4411702156066895, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.572265625, + "step": 340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9971544742584229 + }, + { + "episode": 8208, + "epoch": 0.016392853291152892, + "loss/policy_avg": 0.011613774113357067, + "lr": 2.9019363496932517e-06, + "objective/entropy": 110.20906829833984, + "objective/kl": 2.48465633392334, + "objective/non_score_reward": -0.12423282861709595, + "objective/rlhf_reward": 1.1473923188141193, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.7963228225708008, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4833984375, + "step": 341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011544227600098 + }, + { + "episode": 8232, + "epoch": 0.016440785610717667, + "loss/policy_avg": 0.021342316642403603, + "lr": 2.901648773006135e-06, + "objective/entropy": 121.18575286865234, + "objective/kl": 2.8719682693481445, + "objective/non_score_reward": -0.14359842240810394, + "objective/rlhf_reward": -0.8615905120968819, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8302711248397827, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5390625, + "step": 342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986821413040161 + }, + { + "episode": 8256, + "epoch": 0.01648871793028244, + "loss/policy_avg": 0.10014289617538452, + "lr": 2.9013611963190186e-06, + "objective/entropy": 105.21800231933594, + "objective/kl": 1.0545728206634521, + "objective/non_score_reward": -0.05272863432765007, + "objective/rlhf_reward": 1.4180171631172978, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.8870697021484375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.509765625, + "step": 343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003586769104004 + }, + { + "episode": 8280, + "epoch": 0.016536650249847216, + "loss/policy_avg": 0.04130128026008606, + "lr": 2.901073619631902e-06, + "objective/entropy": 119.07177734375, + "objective/kl": 2.6099183559417725, + "objective/non_score_reward": -0.13049593567848206, + "objective/rlhf_reward": -0.7829755321145058, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.981583595275879, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.55078125, + "step": 344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001418590545654 + }, + { + "episode": 8304, + "epoch": 0.01658458256941199, + "loss/policy_avg": -0.012797907926142216, + "lr": 2.9007860429447854e-06, + "objective/entropy": 107.04109191894531, + "objective/kl": 3.883317708969116, + "objective/non_score_reward": -0.1941659152507782, + "objective/rlhf_reward": 2.620583104429882, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.1005122661590576, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.494140625, + "step": 345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002415418624878 + }, + { + "episode": 8328, + "epoch": 0.016632514888976765, + "loss/policy_avg": -0.0014876285567879677, + "lr": 2.900498466257669e-06, + "objective/entropy": 95.5534896850586, + "objective/kl": 1.5589618682861328, + "objective/non_score_reward": -0.07794810086488724, + "objective/rlhf_reward": -0.46768856793642044, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.006092071533203, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4248046875, + "step": 346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001147747039795 + }, + { + "episode": 8352, + "epoch": 0.01668044720854154, + "loss/policy_avg": 0.023012813180685043, + "lr": 2.9002108895705523e-06, + "objective/entropy": 126.54058074951172, + "objective/kl": 2.834998607635498, + "objective/non_score_reward": -0.14174991846084595, + "objective/rlhf_reward": 2.149500496685505, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.828616142272949, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000307559967041 + }, + { + "episode": 8376, + "epoch": 0.016728379528106314, + "loss/policy_avg": -0.006602446548640728, + "lr": 2.8999233128834357e-06, + "objective/entropy": 139.91412353515625, + "objective/kl": 2.652053117752075, + "objective/non_score_reward": -0.13260266184806824, + "objective/rlhf_reward": -0.7956159338355064, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0981831550598145, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.65234375, + "step": 348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990885257720947 + }, + { + "episode": 8400, + "epoch": 0.016776311847671088, + "loss/policy_avg": 0.03380749002099037, + "lr": 2.899635736196319e-06, + "objective/entropy": 94.310791015625, + "objective/kl": 3.65126371383667, + "objective/non_score_reward": -0.1825631856918335, + "objective/rlhf_reward": -1.0953790694475174, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6422533988952637, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.455078125, + "step": 349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001056671142578 + }, + { + "episode": 8424, + "epoch": 0.016824244167235863, + "loss/policy_avg": 0.001228630542755127, + "lr": 2.8993481595092026e-06, + "objective/entropy": 105.7930908203125, + "objective/kl": 3.8849151134490967, + "objective/non_score_reward": -0.19424578547477722, + "objective/rlhf_reward": -1.165474645793438, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4026014804840088, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.47265625, + "step": 350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014753341674805 + }, + { + "episode": 8448, + "epoch": 0.016872176486800637, + "loss/policy_avg": -0.026495875790715218, + "lr": 2.899060582822086e-06, + "objective/entropy": 94.02154541015625, + "objective/kl": 6.62221097946167, + "objective/non_score_reward": -0.33111056685447693, + "objective/rlhf_reward": 0.15057987053288346, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.3631911277770996, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.439453125, + "step": 351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0030617713928223 + }, + { + "episode": 8472, + "epoch": 0.01692010880636541, + "loss/policy_avg": 0.02394876256585121, + "lr": 2.8987730061349694e-06, + "objective/entropy": 72.07473754882812, + "objective/kl": 4.173927307128906, + "objective/non_score_reward": -0.20869635045528412, + "objective/rlhf_reward": -1.2521780580282211, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.389779567718506, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.38671875, + "step": 352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988776445388794 + }, + { + "episode": 8496, + "epoch": 0.016968041125930186, + "loss/policy_avg": 0.0388445220887661, + "lr": 2.898485429447853e-06, + "objective/entropy": 111.483642578125, + "objective/kl": 3.5875024795532227, + "objective/non_score_reward": -0.17937514185905457, + "objective/rlhf_reward": 1.5078086015941596, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.1498693227767944, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.537109375, + "step": 353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0032825469970703 + }, + { + "episode": 8520, + "epoch": 0.01701597344549496, + "loss/policy_avg": 0.008699703961610794, + "lr": 2.8981978527607363e-06, + "objective/entropy": 83.11165618896484, + "objective/kl": 3.3721988201141357, + "objective/non_score_reward": -0.16860991716384888, + "objective/rlhf_reward": -1.0116595216095448, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7225263118743896, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000425338745117 + }, + { + "episode": 8544, + "epoch": 0.017063905765059735, + "loss/policy_avg": 0.012500934302806854, + "lr": 2.8979102760736197e-06, + "objective/entropy": 123.6297607421875, + "objective/kl": 2.3227438926696777, + "objective/non_score_reward": -0.11613719910383224, + "objective/rlhf_reward": 1.3031768053770065, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.5010164976119995, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.623046875, + "step": 355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997199773788452 + }, + { + "episode": 8568, + "epoch": 0.01711183808462451, + "loss/policy_avg": -0.017101367935538292, + "lr": 2.897622699386503e-06, + "objective/entropy": 77.74732971191406, + "objective/kl": 5.919728755950928, + "objective/non_score_reward": -0.2959864139556885, + "objective/rlhf_reward": -1.7759184688329697, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.415091037750244, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3544921875, + "step": 356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991121292114258 + }, + { + "episode": 8592, + "epoch": 0.017159770404189284, + "loss/policy_avg": -0.004866352304816246, + "lr": 2.8973351226993865e-06, + "objective/entropy": 98.21825408935547, + "objective/kl": 0.2391621321439743, + "objective/non_score_reward": -0.011958092451095581, + "objective/rlhf_reward": -0.07174856215715408, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.5579047203063965, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.43359375, + "step": 357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0064215660095215 + }, + { + "episode": 8616, + "epoch": 0.01720770272375406, + "loss/policy_avg": 0.01591409742832184, + "lr": 2.89704754601227e-06, + "objective/entropy": 81.64492797851562, + "objective/kl": 4.802123546600342, + "objective/non_score_reward": -0.24010616540908813, + "objective/rlhf_reward": 0.8804798584033016, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.820075988769531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3701171875, + "step": 358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998070240020752 + }, + { + "episode": 8640, + "epoch": 0.017255635043318833, + "loss/policy_avg": 0.03866538032889366, + "lr": 2.8967599693251534e-06, + "objective/entropy": 113.9254379272461, + "objective/kl": 3.6541125774383545, + "objective/non_score_reward": -0.18270564079284668, + "objective/rlhf_reward": 0.6381551206008278, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.9079604148864746, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.517578125, + "step": 359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000646114349365 + }, + { + "episode": 8664, + "epoch": 0.017303567362883607, + "loss/policy_avg": 0.006064162589609623, + "lr": 2.896472392638037e-06, + "objective/entropy": 100.04442596435547, + "objective/kl": 4.263726711273193, + "objective/non_score_reward": -0.2131863236427307, + "objective/rlhf_reward": 1.304941430798361, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.562147378921509, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.443359375, + "step": 360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019257068634033 + }, + { + "episode": 8688, + "epoch": 0.017351499682448382, + "loss/policy_avg": -0.015803400427103043, + "lr": 2.8961848159509202e-06, + "objective/entropy": 83.79300689697266, + "objective/kl": 3.4599246978759766, + "objective/non_score_reward": -0.17299622297286987, + "objective/rlhf_reward": 0.6964115772292696, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.8374733924865723, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002300262451172 + }, + { + "episode": 8712, + "epoch": 0.017399432002013156, + "loss/policy_avg": 0.019441112875938416, + "lr": 2.895897239263804e-06, + "objective/entropy": 95.31941986083984, + "objective/kl": 2.574490547180176, + "objective/non_score_reward": -0.12872454524040222, + "objective/rlhf_reward": 0.9620417423442684, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.384813666343689, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.556640625, + "step": 362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993846416473389 + }, + { + "episode": 8736, + "epoch": 0.01744736432157793, + "loss/policy_avg": -0.025898966938257217, + "lr": 2.8956096625766875e-06, + "objective/entropy": 133.70968627929688, + "objective/kl": 3.22401762008667, + "objective/non_score_reward": -0.1612008810043335, + "objective/rlhf_reward": 2.0327947568148375, + "objective/scores": 0.5, + "policy/approxkl_avg": 6.369945049285889, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.599609375, + "step": 363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963085651397705 + }, + { + "episode": 8760, + "epoch": 0.017495296641142705, + "loss/policy_avg": 0.06742075085639954, + "lr": 2.8953220858895705e-06, + "objective/entropy": 139.02386474609375, + "objective/kl": 0.719193160533905, + "objective/non_score_reward": -0.035959649831056595, + "objective/rlhf_reward": -0.21575790271162987, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4158334732055664, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.60546875, + "step": 364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0056374073028564 + }, + { + "episode": 8784, + "epoch": 0.01754322896070748, + "loss/policy_avg": 0.014960049651563168, + "lr": 2.895034509202454e-06, + "objective/entropy": 82.6552734375, + "objective/kl": 6.09274435043335, + "objective/non_score_reward": -0.30463719367980957, + "objective/rlhf_reward": 0.1721767634153365, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.2704811096191406, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.416015625, + "step": 365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007715225219727 + }, + { + "episode": 8808, + "epoch": 0.017591161280272254, + "loss/policy_avg": 1.2413173913955688, + "lr": 2.8947469325153374e-06, + "objective/entropy": 109.41818237304688, + "objective/kl": 5.094698429107666, + "objective/non_score_reward": -0.25473493337631226, + "objective/rlhf_reward": -1.5284094978123903, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5404112339019775, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.509765625, + "step": 366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.005608081817627 + }, + { + "episode": 8832, + "epoch": 0.01763909359983703, + "loss/policy_avg": -0.006356945261359215, + "lr": 2.894459355828221e-06, + "objective/entropy": 128.63169860839844, + "objective/kl": 2.0484085083007812, + "objective/non_score_reward": -0.10242043435573578, + "objective/rlhf_reward": -0.614522535353899, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9707320928573608, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.552734375, + "step": 367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000490188598633 + }, + { + "episode": 8856, + "epoch": 0.017687025919401803, + "loss/policy_avg": 0.00142889772541821, + "lr": 2.8941717791411042e-06, + "objective/entropy": 139.2299346923828, + "objective/kl": 1.9575297832489014, + "objective/non_score_reward": -0.09787648916244507, + "objective/rlhf_reward": 1.996800465619752, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.15226411819458, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.580078125, + "step": 368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000915288925171 + }, + { + "episode": 8880, + "epoch": 0.017734958238966578, + "loss/policy_avg": 0.003688833676278591, + "lr": 2.8938842024539877e-06, + "objective/entropy": 94.59426879882812, + "objective/kl": 3.397627830505371, + "objective/non_score_reward": -0.1698814034461975, + "objective/rlhf_reward": 1.1179547094215287, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.687622547149658, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4306640625, + "step": 369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002715826034546 + }, + { + "episode": 8904, + "epoch": 0.017782890558531352, + "loss/policy_avg": 0.02541806735098362, + "lr": 2.893596625766871e-06, + "objective/entropy": 113.98545837402344, + "objective/kl": 1.8097774982452393, + "objective/non_score_reward": -0.09048887342214584, + "objective/rlhf_reward": 1.4570667752996087, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 0.8875245451927185, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.517578125, + "step": 370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0035271644592285 + }, + { + "episode": 8928, + "epoch": 0.017830822878096127, + "loss/policy_avg": 0.01154803391546011, + "lr": 2.8933090490797545e-06, + "objective/entropy": 89.18072509765625, + "objective/kl": 3.7055163383483887, + "objective/non_score_reward": -0.1852758228778839, + "objective/rlhf_reward": -1.1116548776626587, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3170204162597656, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4150390625, + "step": 371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0000815391540527 + }, + { + "episode": 8952, + "epoch": 0.0178787551976609, + "loss/policy_avg": -0.03063778579235077, + "lr": 2.893021472392638e-06, + "objective/entropy": 103.24898529052734, + "objective/kl": 3.7684803009033203, + "objective/non_score_reward": -0.1884240210056305, + "objective/rlhf_reward": -1.1305440701544285, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.436342716217041, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.458984375, + "step": 372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010619163513184 + }, + { + "episode": 8976, + "epoch": 0.017926687517225676, + "loss/policy_avg": 0.004761957563459873, + "lr": 2.8927338957055218e-06, + "objective/entropy": 89.67506408691406, + "objective/kl": 2.4344851970672607, + "objective/non_score_reward": -0.12172424793243408, + "objective/rlhf_reward": 1.2696545161306858, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.323668479919434, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.384765625, + "step": 373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001441478729248 + }, + { + "episode": 9000, + "epoch": 0.017974619836790454, + "loss/policy_avg": -0.008073483593761921, + "lr": 2.892446319018405e-06, + "objective/entropy": 101.95695495605469, + "objective/kl": 3.774043560028076, + "objective/non_score_reward": -0.18870219588279724, + "objective/rlhf_reward": 4.867786929011345, + "objective/scores": 1.0, + "policy/approxkl_avg": 9.206986427307129, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.41015625, + "step": 374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001361608505249 + }, + { + "episode": 9024, + "epoch": 0.018022552156355228, + "loss/policy_avg": -0.024730606004595757, + "lr": 2.8921587423312886e-06, + "objective/entropy": 110.0859375, + "objective/kl": 3.4377870559692383, + "objective/non_score_reward": -0.17188934981822968, + "objective/rlhf_reward": 1.9686639197170734, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.777695655822754, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.560546875, + "step": 375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.002884864807129 + }, + { + "episode": 9048, + "epoch": 0.018070484475920003, + "loss/policy_avg": 0.011643728241324425, + "lr": 2.891871165644172e-06, + "objective/entropy": 83.80197143554688, + "objective/kl": 3.0617141723632812, + "objective/non_score_reward": -0.15308569371700287, + "objective/rlhf_reward": 5.081485876813531, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.6374082565307617, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.400390625, + "step": 376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999065637588501 + }, + { + "episode": 9072, + "epoch": 0.018118416795484777, + "loss/policy_avg": 0.020846914499998093, + "lr": 2.8915835889570555e-06, + "objective/entropy": 75.78707885742188, + "objective/kl": 3.694187879562378, + "objective/non_score_reward": -0.18470939993858337, + "objective/rlhf_reward": 1.47580297488589, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.528866291046143, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3798828125, + "step": 377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010972023010254 + }, + { + "episode": 9096, + "epoch": 0.01816634911504955, + "loss/policy_avg": 0.02677360363304615, + "lr": 2.8912960122699385e-06, + "objective/entropy": 90.68341064453125, + "objective/kl": 3.350646734237671, + "objective/non_score_reward": -0.16753235459327698, + "objective/rlhf_reward": -1.00519410520792, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.4130425453186035, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4228515625, + "step": 378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993488788604736 + }, + { + "episode": 9120, + "epoch": 0.018214281434614326, + "loss/policy_avg": 0.04314554110169411, + "lr": 2.891008435582822e-06, + "objective/entropy": 85.9647445678711, + "objective/kl": 5.886736869812012, + "objective/non_score_reward": -0.29433682560920715, + "objective/rlhf_reward": -1.7660209685564041, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.277322769165039, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4091796875, + "step": 379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988329410552979 + }, + { + "episode": 9144, + "epoch": 0.0182622137541791, + "loss/policy_avg": 0.1871921271085739, + "lr": 2.8907208588957053e-06, + "objective/entropy": 124.6341552734375, + "objective/kl": 5.639516353607178, + "objective/non_score_reward": -0.28197580575942993, + "objective/rlhf_reward": 0.200934426157793, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.589963436126709, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.51171875, + "step": 380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000868558883667 + }, + { + "episode": 9168, + "epoch": 0.018310146073743875, + "loss/policy_avg": 0.10084758698940277, + "lr": 2.8904332822085888e-06, + "objective/entropy": 66.22016143798828, + "objective/kl": 2.821841239929199, + "objective/non_score_reward": -0.14109206199645996, + "objective/rlhf_reward": -0.8465523160994053, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1778335571289062, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3974609375, + "step": 381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002579689025879 + }, + { + "episode": 9192, + "epoch": 0.01835807839330865, + "loss/policy_avg": 0.012524849735200405, + "lr": 2.890145705521472e-06, + "objective/entropy": 76.07249450683594, + "objective/kl": 3.6547887325286865, + "objective/non_score_reward": -0.18273943662643433, + "objective/rlhf_reward": 0.9035634100437163, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.4966483116149902, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4541015625, + "step": 382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989538192749023 + }, + { + "episode": 9216, + "epoch": 0.018406010712873424, + "loss/policy_avg": -0.002731708809733391, + "lr": 2.889858128834356e-06, + "objective/entropy": 119.64635467529297, + "objective/kl": 3.782222270965576, + "objective/non_score_reward": -0.1891111135482788, + "objective/rlhf_reward": 0.6715133448482785, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 6.722377777099609, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.49609375, + "step": 383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974298477172852 + }, + { + "episode": 9240, + "epoch": 0.0184539430324382, + "loss/policy_avg": 0.05469908565282822, + "lr": 2.8895705521472395e-06, + "objective/entropy": 105.557373046875, + "objective/kl": 3.9204490184783936, + "objective/non_score_reward": -0.19602248072624207, + "objective/rlhf_reward": 0.5582541517809712, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.1819751262664795, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4716796875, + "step": 384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993542432785034 + }, + { + "episode": 9264, + "epoch": 0.018501875352002973, + "loss/policy_avg": 0.04275136813521385, + "lr": 2.889282975460123e-06, + "objective/entropy": 74.2679672241211, + "objective/kl": 5.253615379333496, + "objective/non_score_reward": -0.2626807689666748, + "objective/rlhf_reward": 0.5611585982550514, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.271362781524658, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3798828125, + "step": 385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988439083099365 + }, + { + "episode": 9288, + "epoch": 0.018549807671567747, + "loss/policy_avg": 0.006533767096698284, + "lr": 2.8889953987730063e-06, + "objective/entropy": 92.64540100097656, + "objective/kl": 5.8863348960876465, + "objective/non_score_reward": -0.29431673884391785, + "objective/rlhf_reward": 1.2340996116399765, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.807617425918579, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41015625, + "step": 386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009660720825195 + }, + { + "episode": 9312, + "epoch": 0.018597739991132522, + "loss/policy_avg": 0.08829957246780396, + "lr": 2.8887078220858897e-06, + "objective/entropy": 76.50399780273438, + "objective/kl": 5.682933807373047, + "objective/non_score_reward": -0.2841467261314392, + "objective/rlhf_reward": 0.10129978855860566, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.963411331176758, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3193359375, + "step": 387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997246265411377 + }, + { + "episode": 9336, + "epoch": 0.018645672310697296, + "loss/policy_avg": 0.003311685286462307, + "lr": 2.888420245398773e-06, + "objective/entropy": 108.45634460449219, + "objective/kl": 1.9549678564071655, + "objective/non_score_reward": -0.09774839133024216, + "objective/rlhf_reward": 1.1478986229643904, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.937483072280884, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.65625, + "step": 388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015907287597656 + }, + { + "episode": 9360, + "epoch": 0.01869360463026207, + "loss/policy_avg": -0.06831187009811401, + "lr": 2.8881326687116566e-06, + "objective/entropy": 124.7669677734375, + "objective/kl": 1.498788833618164, + "objective/non_score_reward": -0.07493945211172104, + "objective/rlhf_reward": 3.335941846011322, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.5193653106689453, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.533203125, + "step": 389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016531944274902 + }, + { + "episode": 9384, + "epoch": 0.018741536949826845, + "loss/policy_avg": -0.011621003039181232, + "lr": 2.88784509202454e-06, + "objective/entropy": 111.00491333007812, + "objective/kl": 4.309990882873535, + "objective/non_score_reward": -0.21549952030181885, + "objective/rlhf_reward": 1.7070028558373451, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.6545774936676025, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.57421875, + "step": 390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99821138381958 + }, + { + "episode": 9408, + "epoch": 0.01878946926939162, + "loss/policy_avg": 0.053977206349372864, + "lr": 2.8875575153374235e-06, + "objective/entropy": 96.47300720214844, + "objective/kl": 4.991702556610107, + "objective/non_score_reward": -0.2495851367712021, + "objective/rlhf_reward": 0.3086692502142224, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.322088241577148, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.419921875, + "step": 391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982714653015137 + }, + { + "episode": 9432, + "epoch": 0.018837401588956394, + "loss/policy_avg": 0.06055210158228874, + "lr": 2.887269938650307e-06, + "objective/entropy": 87.98290252685547, + "objective/kl": 4.534460067749023, + "objective/non_score_reward": -0.22672304511070251, + "objective/rlhf_reward": 1.639661829918623, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.8823049068450928, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3857421875, + "step": 392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002232551574707 + }, + { + "episode": 9456, + "epoch": 0.01888533390852117, + "loss/policy_avg": 0.05612951144576073, + "lr": 2.8869823619631903e-06, + "objective/entropy": 124.76217651367188, + "objective/kl": 3.7268314361572266, + "objective/non_score_reward": -0.18634158372879028, + "objective/rlhf_reward": 0.6163394834742628, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.750222206115723, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.544921875, + "step": 393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9954172372817993 + }, + { + "episode": 9480, + "epoch": 0.018933266228085943, + "loss/policy_avg": 0.15118083357810974, + "lr": 2.8866947852760737e-06, + "objective/entropy": 101.59642028808594, + "objective/kl": 2.209162712097168, + "objective/non_score_reward": -0.1104581356048584, + "objective/rlhf_reward": 1.47449427549137, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.6962342262268066, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.451171875, + "step": 394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997332096099854 + }, + { + "episode": 9504, + "epoch": 0.018981198547650718, + "loss/policy_avg": 0.09461413323879242, + "lr": 2.886407208588957e-06, + "objective/entropy": 99.26164245605469, + "objective/kl": 3.983348846435547, + "objective/non_score_reward": -0.19916746020317078, + "objective/rlhf_reward": 1.804995357990265, + "objective/scores": 0.5, + "policy/approxkl_avg": 8.888657569885254, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.474609375, + "step": 395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9965695142745972 + }, + { + "episode": 9528, + "epoch": 0.019029130867215492, + "loss/policy_avg": 0.13626402616500854, + "lr": 2.8861196319018406e-06, + "objective/entropy": 83.9027099609375, + "objective/kl": 7.2228803634643555, + "objective/non_score_reward": -0.3611440062522888, + "objective/rlhf_reward": -2.1668640077114105, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1680116653442383, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3779296875, + "step": 396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992318153381348 + }, + { + "episode": 9552, + "epoch": 0.019077063186780267, + "loss/policy_avg": -0.019790761172771454, + "lr": 2.885832055214724e-06, + "objective/entropy": 94.0418930053711, + "objective/kl": 2.780942916870117, + "objective/non_score_reward": -0.13904714584350586, + "objective/rlhf_reward": -0.834282886236906, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.086554527282715, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.412109375, + "step": 397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992725849151611 + }, + { + "episode": 9576, + "epoch": 0.01912499550634504, + "loss/policy_avg": 0.015513966791331768, + "lr": 2.8855444785276074e-06, + "objective/entropy": 76.63304901123047, + "objective/kl": 1.8834619522094727, + "objective/non_score_reward": -0.09417310357093811, + "objective/rlhf_reward": 3.220539905591052, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.5431885719299316, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.37109375, + "step": 398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005524158477783 + }, + { + "episode": 9600, + "epoch": 0.019172927825909816, + "loss/policy_avg": 0.049253419041633606, + "lr": 2.885256901840491e-06, + "objective/entropy": 72.15864562988281, + "objective/kl": 2.653183698654175, + "objective/non_score_reward": -0.1326591968536377, + "objective/rlhf_reward": 5.204044871032238, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.219181537628174, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4013671875, + "step": 399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999395728111267 + }, + { + "episode": 9624, + "epoch": 0.01922086014547459, + "loss/policy_avg": 0.030024005100131035, + "lr": 2.8849693251533743e-06, + "objective/entropy": 90.1722412109375, + "objective/kl": 1.3223938941955566, + "objective/non_score_reward": -0.06611970067024231, + "objective/rlhf_reward": -0.3967181481420994, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9157493114471436, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.37890625, + "step": 400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988298416137695 + }, + { + "episode": 9648, + "epoch": 0.019268792465039365, + "loss/policy_avg": 0.08208562433719635, + "lr": 2.8846817484662577e-06, + "objective/entropy": 132.90902709960938, + "objective/kl": 1.5526055097579956, + "objective/non_score_reward": -0.07763028144836426, + "objective/rlhf_reward": 1.4270075869253482, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.045528411865234, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.57421875, + "step": 401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.996992588043213 + }, + { + "episode": 9672, + "epoch": 0.01931672478460414, + "loss/policy_avg": -0.014223465695977211, + "lr": 2.884394171779141e-06, + "objective/entropy": 88.312744140625, + "objective/kl": 4.284160614013672, + "objective/non_score_reward": -0.21420805156230927, + "objective/rlhf_reward": 0.44914072303927766, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.4522147178649902, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3720703125, + "step": 402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021374225616455 + }, + { + "episode": 9696, + "epoch": 0.019364657104168913, + "loss/policy_avg": 0.05713770166039467, + "lr": 2.8841065950920246e-06, + "objective/entropy": 99.77611541748047, + "objective/kl": 4.29073429107666, + "objective/non_score_reward": -0.21453672647476196, + "objective/rlhf_reward": -1.287220373749733, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1946828365325928, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.474609375, + "step": 403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001053810119629 + }, + { + "episode": 9720, + "epoch": 0.019412589423733688, + "loss/policy_avg": 0.02697388269007206, + "lr": 2.883819018404908e-06, + "objective/entropy": 83.83444213867188, + "objective/kl": 3.57572603225708, + "objective/non_score_reward": -0.17878632247447968, + "objective/rlhf_reward": -1.0727178454399109, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.679842472076416, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.365234375, + "step": 404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994993209838867 + }, + { + "episode": 9744, + "epoch": 0.019460521743298462, + "loss/policy_avg": 0.0058796037919819355, + "lr": 2.8835314417177914e-06, + "objective/entropy": 106.15974426269531, + "objective/kl": 5.573033809661865, + "objective/non_score_reward": -0.27865171432495117, + "objective/rlhf_reward": -1.671910285949707, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.062155723571777, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4267578125, + "step": 405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008769035339355 + }, + { + "episode": 9768, + "epoch": 0.019508454062863237, + "loss/policy_avg": -0.04137278348207474, + "lr": 2.883243865030675e-06, + "objective/entropy": 117.77615356445312, + "objective/kl": 1.9655439853668213, + "objective/non_score_reward": -0.09827720373868942, + "objective/rlhf_reward": 1.3031260643592681, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.4535489082336426, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.53125, + "step": 406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00260066986084 + }, + { + "episode": 9792, + "epoch": 0.01955638638242801, + "loss/policy_avg": -0.030810844153165817, + "lr": 2.8829562883435587e-06, + "objective/entropy": 117.67794036865234, + "objective/kl": 4.555779933929443, + "objective/non_score_reward": -0.22778896987438202, + "objective/rlhf_reward": 0.9543830241609577, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.382352352142334, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.525390625, + "step": 407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023536682128906 + }, + { + "episode": 9816, + "epoch": 0.019604318701992786, + "loss/policy_avg": 0.06310312449932098, + "lr": 2.8826687116564417e-06, + "objective/entropy": 112.91573333740234, + "objective/kl": 4.32677698135376, + "objective/non_score_reward": -0.2163388431072235, + "objective/rlhf_reward": -1.298033058643341, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8937187194824219, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4580078125, + "step": 408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000281810760498 + }, + { + "episode": 9840, + "epoch": 0.01965225102155756, + "loss/policy_avg": -0.0343463234603405, + "lr": 2.882381134969325e-06, + "objective/entropy": 102.01966857910156, + "objective/kl": 3.4611546993255615, + "objective/non_score_reward": -0.17305772006511688, + "objective/rlhf_reward": -1.0383462719619274, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3998024463653564, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4462890625, + "step": 409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.006146192550659 + }, + { + "episode": 9864, + "epoch": 0.019700183341122335, + "loss/policy_avg": 0.010436730459332466, + "lr": 2.8820935582822085e-06, + "objective/entropy": 86.52416229248047, + "objective/kl": 3.7595489025115967, + "objective/non_score_reward": -0.18797743320465088, + "objective/rlhf_reward": 1.0093784787167444, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.8151872158050537, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3916015625, + "step": 410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000576972961426 + }, + { + "episode": 9888, + "epoch": 0.01974811566068711, + "loss/policy_avg": 0.056768544018268585, + "lr": 2.881805981595092e-06, + "objective/entropy": 144.109130859375, + "objective/kl": 2.586732864379883, + "objective/non_score_reward": -0.1293366551399231, + "objective/rlhf_reward": -0.7760199420154095, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.48569917678833, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.611328125, + "step": 411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993829727172852 + }, + { + "episode": 9912, + "epoch": 0.019796047980251884, + "loss/policy_avg": 0.0015783989802002907, + "lr": 2.8815184049079754e-06, + "objective/entropy": 100.04719543457031, + "objective/kl": 3.5453574657440186, + "objective/non_score_reward": -0.1772678941488266, + "objective/rlhf_reward": 4.936392650008202, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.222690582275391, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4794921875, + "step": 412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003974437713623 + }, + { + "episode": 9936, + "epoch": 0.019843980299816658, + "loss/policy_avg": 0.016777025535702705, + "lr": 2.881230828220859e-06, + "objective/entropy": 103.0643310546875, + "objective/kl": 5.981478691101074, + "objective/non_score_reward": -0.2990739047527313, + "objective/rlhf_reward": -1.7944433987140656, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.040409803390503, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4521484375, + "step": 413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000030040740967 + }, + { + "episode": 9960, + "epoch": 0.019891912619381433, + "loss/policy_avg": 0.003130437806248665, + "lr": 2.8809432515337422e-06, + "objective/entropy": 91.68243408203125, + "objective/kl": 4.991149425506592, + "objective/non_score_reward": -0.2495574653148651, + "objective/rlhf_reward": -1.49734478443861, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.25068998336792, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4306640625, + "step": 414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006141662597656 + }, + { + "episode": 9984, + "epoch": 0.019939844938946207, + "loss/policy_avg": 0.0678485706448555, + "lr": 2.8806556748466257e-06, + "objective/entropy": 79.75787353515625, + "objective/kl": 3.7649059295654297, + "objective/non_score_reward": -0.18824529647827148, + "objective/rlhf_reward": -1.1294718235731125, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.725039005279541, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3701171875, + "step": 415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979026317596436 + }, + { + "episode": 10008, + "epoch": 0.01998777725851098, + "loss/policy_avg": 0.057038404047489166, + "lr": 2.880368098159509e-06, + "objective/entropy": 115.59861755371094, + "objective/kl": 4.4749884605407715, + "objective/non_score_reward": -0.22374945878982544, + "objective/rlhf_reward": 0.46368326594841813, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.6912617683410645, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.490234375, + "step": 416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969213008880615 + }, + { + "episode": 10032, + "epoch": 0.020035709578075756, + "loss/policy_avg": -0.014291181229054928, + "lr": 2.880080521472393e-06, + "objective/entropy": 114.03660583496094, + "objective/kl": 2.630175828933716, + "objective/non_score_reward": -0.13150879740715027, + "objective/rlhf_reward": -0.7890527006238699, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4360692501068115, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5, + "step": 417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011281967163086 + }, + { + "episode": 10056, + "epoch": 0.02008364189764053, + "loss/policy_avg": 0.06279174983501434, + "lr": 2.8797929447852764e-06, + "objective/entropy": 88.18580627441406, + "objective/kl": 4.1057844161987305, + "objective/non_score_reward": -0.20528921484947205, + "objective/rlhf_reward": 0.5744446997882161, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.1630258560180664, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.470703125, + "step": 418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998842477798462 + }, + { + "episode": 10080, + "epoch": 0.020131574217205305, + "loss/policy_avg": 0.022175399586558342, + "lr": 2.87950536809816e-06, + "objective/entropy": 83.96435546875, + "objective/kl": 4.872107028961182, + "objective/non_score_reward": -0.2436053454875946, + "objective/rlhf_reward": -1.4616320431232452, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.21586799621582, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.38671875, + "step": 419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981255531311035 + }, + { + "episode": 10104, + "epoch": 0.02017950653677008, + "loss/policy_avg": 0.0296197347342968, + "lr": 2.8792177914110432e-06, + "objective/entropy": 120.8309555053711, + "objective/kl": 2.0951008796691895, + "objective/non_score_reward": -0.10475503653287888, + "objective/rlhf_reward": 5.371469791978598, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.3837618827819824, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5234375, + "step": 420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001697063446045 + }, + { + "episode": 10128, + "epoch": 0.020227438856334854, + "loss/policy_avg": 0.013116557151079178, + "lr": 2.8789302147239267e-06, + "objective/entropy": 99.44071960449219, + "objective/kl": 3.9660353660583496, + "objective/non_score_reward": -0.198301762342453, + "objective/rlhf_reward": -1.189810547977686, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.284656047821045, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4521484375, + "step": 421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000176191329956 + }, + { + "episode": 10152, + "epoch": 0.02027537117589963, + "loss/policy_avg": 0.026236731559038162, + "lr": 2.87864263803681e-06, + "objective/entropy": 74.61247253417969, + "objective/kl": 5.8815016746521, + "objective/non_score_reward": -0.294075071811676, + "objective/rlhf_reward": 2.0211281352621726, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.587428331375122, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3486328125, + "step": 422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002987384796143 + }, + { + "episode": 10176, + "epoch": 0.020323303495464403, + "loss/policy_avg": -0.03532176464796066, + "lr": 2.878355061349693e-06, + "objective/entropy": 114.32998657226562, + "objective/kl": 4.0747480392456055, + "objective/non_score_reward": -0.20373736321926117, + "objective/rlhf_reward": -1.2224241942167282, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.381857395172119, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5546875, + "step": 423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028741359710693 + }, + { + "episode": 10200, + "epoch": 0.020371235815029178, + "loss/policy_avg": 0.01336676999926567, + "lr": 2.8780674846625765e-06, + "objective/entropy": 84.24551391601562, + "objective/kl": 3.849914073944092, + "objective/non_score_reward": -0.1924956887960434, + "objective/rlhf_reward": -1.1549741327762604, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5584259033203125, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.37109375, + "step": 424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998095035552979 + }, + { + "episode": 10224, + "epoch": 0.020419168134593952, + "loss/policy_avg": 0.08159291744232178, + "lr": 2.87777990797546e-06, + "objective/entropy": 114.26676940917969, + "objective/kl": 3.6341588497161865, + "objective/non_score_reward": -0.18170790374279022, + "objective/rlhf_reward": -1.090247467160225, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9159915447235107, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4794921875, + "step": 425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0053720474243164 + }, + { + "episode": 10248, + "epoch": 0.020467100454158726, + "loss/policy_avg": 0.14743652939796448, + "lr": 2.8774923312883434e-06, + "objective/entropy": 149.51109313964844, + "objective/kl": 4.924877166748047, + "objective/non_score_reward": -0.24624383449554443, + "objective/rlhf_reward": 0.6597800635208023, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.940849781036377, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.677734375, + "step": 426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997089147567749 + }, + { + "episode": 10272, + "epoch": 0.0205150327737235, + "loss/policy_avg": -0.0033318717032670975, + "lr": 2.877204754601227e-06, + "objective/entropy": 128.71759033203125, + "objective/kl": 1.6616190671920776, + "objective/non_score_reward": -0.0830809623003006, + "objective/rlhf_reward": 2.501514259725809, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.5584057569503784, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016214847564697 + }, + { + "episode": 10296, + "epoch": 0.020562965093288275, + "loss/policy_avg": 0.03729747608304024, + "lr": 2.8769171779141106e-06, + "objective/entropy": 107.97052001953125, + "objective/kl": 2.826921224594116, + "objective/non_score_reward": -0.1413460671901703, + "objective/rlhf_reward": 1.2891667567600145, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.1061930656433105, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4765625, + "step": 428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000434637069702 + }, + { + "episode": 10320, + "epoch": 0.02061089741285305, + "loss/policy_avg": 0.011672573164105415, + "lr": 2.876629601226994e-06, + "objective/entropy": 110.74153900146484, + "objective/kl": 5.20823860168457, + "objective/non_score_reward": -0.26041194796562195, + "objective/rlhf_reward": -1.56247166544199, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.07891845703125, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.521484375, + "step": 429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997605562210083 + }, + { + "episode": 10344, + "epoch": 0.020658829732417824, + "loss/policy_avg": 0.11528673768043518, + "lr": 2.8763420245398775e-06, + "objective/entropy": 89.44493865966797, + "objective/kl": 1.644212007522583, + "objective/non_score_reward": -0.08221060037612915, + "objective/rlhf_reward": -0.4932635799050331, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6586668491363525, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5703125, + "step": 430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000441074371338 + }, + { + "episode": 10368, + "epoch": 0.0207067620519826, + "loss/policy_avg": 0.015914084389805794, + "lr": 2.876054447852761e-06, + "objective/entropy": 90.18218994140625, + "objective/kl": 4.899535655975342, + "objective/non_score_reward": -0.24497678875923157, + "objective/rlhf_reward": 1.5301393419504166, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.18227481842041, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40625, + "step": 431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980523586273193 + }, + { + "episode": 10392, + "epoch": 0.020754694371547377, + "loss/policy_avg": 0.21538235247135162, + "lr": 2.8757668711656443e-06, + "objective/entropy": 90.29976654052734, + "objective/kl": 3.7010626792907715, + "objective/non_score_reward": -0.18505315482616425, + "objective/rlhf_reward": 2.6752597042304687, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.549294948577881, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.38671875, + "step": 432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974658489227295 + }, + { + "episode": 10416, + "epoch": 0.02080262669111215, + "loss/policy_avg": 0.044505368918180466, + "lr": 2.8754792944785278e-06, + "objective/entropy": 88.24333953857422, + "objective/kl": 7.0305914878845215, + "objective/non_score_reward": -0.3515295684337616, + "objective/rlhf_reward": -2.1091774106025696, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.14705753326416, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3876953125, + "step": 433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978352785110474 + }, + { + "episode": 10440, + "epoch": 0.020850559010676926, + "loss/policy_avg": 0.01823197305202484, + "lr": 2.875191717791411e-06, + "objective/entropy": 100.33858489990234, + "objective/kl": 3.7182159423828125, + "objective/non_score_reward": -0.18591079115867615, + "objective/rlhf_reward": 0.7773246329716053, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.882500648498535, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4345703125, + "step": 434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997636079788208 + }, + { + "episode": 10464, + "epoch": 0.0208984913302417, + "loss/policy_avg": 0.060860998928546906, + "lr": 2.8749041411042946e-06, + "objective/entropy": 98.23674011230469, + "objective/kl": 3.7311859130859375, + "objective/non_score_reward": -0.1865593045949936, + "objective/rlhf_reward": -1.1193558052182198, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.871588706970215, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4375, + "step": 435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963643550872803 + }, + { + "episode": 10488, + "epoch": 0.020946423649806475, + "loss/policy_avg": 0.04685552045702934, + "lr": 2.874616564417178e-06, + "objective/entropy": 88.729736328125, + "objective/kl": 5.516032695770264, + "objective/non_score_reward": -0.2758016288280487, + "objective/rlhf_reward": 2.1307688527685813, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.5448455810546875, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.376953125, + "step": 436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970228672027588 + }, + { + "episode": 10512, + "epoch": 0.02099435596937125, + "loss/policy_avg": 0.023098420351743698, + "lr": 2.8743289877300615e-06, + "objective/entropy": 65.81800079345703, + "objective/kl": 5.449092864990234, + "objective/non_score_reward": -0.2724546194076538, + "objective/rlhf_reward": -1.634727619588375, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1376771926879883, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.2998046875, + "step": 437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000962257385254 + }, + { + "episode": 10536, + "epoch": 0.021042288288936024, + "loss/policy_avg": 0.0263800211250782, + "lr": 2.874041411042945e-06, + "objective/entropy": 115.90081024169922, + "objective/kl": 2.9199862480163574, + "objective/non_score_reward": -0.14599931240081787, + "objective/rlhf_reward": -0.8759958893060684, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3046083450317383, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46484375, + "step": 438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000316619873047 + }, + { + "episode": 10560, + "epoch": 0.021090220608500798, + "loss/policy_avg": -0.01658463105559349, + "lr": 2.8737538343558283e-06, + "objective/entropy": 83.69380187988281, + "objective/kl": 4.96422815322876, + "objective/non_score_reward": -0.24821142852306366, + "objective/rlhf_reward": 4.510731533169746, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.7365407943725586, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.37109375, + "step": 439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003032684326172 + }, + { + "episode": 10584, + "epoch": 0.021138152928065573, + "loss/policy_avg": 0.03338801488280296, + "lr": 2.8734662576687117e-06, + "objective/entropy": 101.86214447021484, + "objective/kl": 2.891199827194214, + "objective/non_score_reward": -0.14455999433994293, + "objective/rlhf_reward": 1.2698831566084756, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.490731239318848, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.41796875, + "step": 440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985918998718262 + }, + { + "episode": 10608, + "epoch": 0.021186085247630347, + "loss/policy_avg": 0.19703838229179382, + "lr": 2.873178680981595e-06, + "objective/entropy": 101.780029296875, + "objective/kl": 2.287071704864502, + "objective/non_score_reward": -0.11435358226299286, + "objective/rlhf_reward": 1.6349953945327762, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.335992336273193, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4755859375, + "step": 441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004281997680664 + }, + { + "episode": 10632, + "epoch": 0.02123401756719512, + "loss/policy_avg": 0.021132517606019974, + "lr": 2.8728911042944786e-06, + "objective/entropy": 105.60923767089844, + "objective/kl": 6.0953779220581055, + "objective/non_score_reward": -0.3047689199447632, + "objective/rlhf_reward": 0.06417586025508304, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.324655532836914, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4375, + "step": 442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997052788734436 + }, + { + "episode": 10656, + "epoch": 0.021281949886759896, + "loss/policy_avg": 0.04389215633273125, + "lr": 2.872603527607362e-06, + "objective/entropy": 97.69444274902344, + "objective/kl": 3.930748462677002, + "objective/non_score_reward": -0.19653742015361786, + "objective/rlhf_reward": -1.1792244892567396, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3551437854766846, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4541015625, + "step": 443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975665807724 + }, + { + "episode": 10680, + "epoch": 0.02132988220632467, + "loss/policy_avg": 0.10260239243507385, + "lr": 2.8723159509202455e-06, + "objective/entropy": 82.78902435302734, + "objective/kl": 3.6267898082733154, + "objective/non_score_reward": -0.18133948743343353, + "objective/rlhf_reward": -1.0880369022488594, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5362504720687866, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3583984375, + "step": 444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024871826171875 + }, + { + "episode": 10704, + "epoch": 0.021377814525889445, + "loss/policy_avg": -0.018965616822242737, + "lr": 2.872028374233129e-06, + "objective/entropy": 135.18368530273438, + "objective/kl": 2.1742563247680664, + "objective/non_score_reward": -0.1087128147482872, + "objective/rlhf_reward": 1.347723126411438, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.6728168725967407, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.537109375, + "step": 445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015370845794678 + }, + { + "episode": 10728, + "epoch": 0.02142574684545422, + "loss/policy_avg": 0.09902440011501312, + "lr": 2.8717407975460123e-06, + "objective/entropy": 63.02091598510742, + "objective/kl": 2.1511642932891846, + "objective/non_score_reward": -0.10755821317434311, + "objective/rlhf_reward": 1.2474400114706363, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.074841022491455, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3623046875, + "step": 446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0002238750457764 + }, + { + "episode": 10752, + "epoch": 0.021473679165018994, + "loss/policy_avg": 1.3366904258728027, + "lr": 2.8714532208588957e-06, + "objective/entropy": 75.74290466308594, + "objective/kl": 2.3365769386291504, + "objective/non_score_reward": -0.11682885885238647, + "objective/rlhf_reward": -0.7009731382131577, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3851401805877686, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3408203125, + "step": 447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0035550594329834 + }, + { + "episode": 10776, + "epoch": 0.02152161148458377, + "loss/policy_avg": 0.08619701117277145, + "lr": 2.871165644171779e-06, + "objective/entropy": 63.3900260925293, + "objective/kl": 5.416179180145264, + "objective/non_score_reward": -0.2708089351654053, + "objective/rlhf_reward": -1.624853640794754, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.495426893234253, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.375, + "step": 448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0034520626068115 + }, + { + "episode": 10800, + "epoch": 0.021569543804148543, + "loss/policy_avg": 0.027282044291496277, + "lr": 2.8708780674846626e-06, + "objective/entropy": 141.77029418945312, + "objective/kl": 5.110448837280273, + "objective/non_score_reward": -0.2555224597454071, + "objective/rlhf_reward": 0.27304521551144456, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.241987705230713, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999061107635498 + }, + { + "episode": 10824, + "epoch": 0.021617476123713317, + "loss/policy_avg": -0.010026881471276283, + "lr": 2.870590490797546e-06, + "objective/entropy": 117.4852523803711, + "objective/kl": 3.8383846282958984, + "objective/non_score_reward": -0.1919192373752594, + "objective/rlhf_reward": 1.4325439614417048, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.674365758895874, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5078125, + "step": 450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000946521759033 + }, + { + "episode": 10848, + "epoch": 0.021665408443278092, + "loss/policy_avg": 0.04624781757593155, + "lr": 2.87030291411043e-06, + "objective/entropy": 93.13816833496094, + "objective/kl": 6.054122447967529, + "objective/non_score_reward": -0.30270612239837646, + "objective/rlhf_reward": -1.8162367641925812, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.494405746459961, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4267578125, + "step": 451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983863830566406 + }, + { + "episode": 10872, + "epoch": 0.021713340762842866, + "loss/policy_avg": 0.03440706431865692, + "lr": 2.870015337423313e-06, + "objective/entropy": 123.34963989257812, + "objective/kl": 3.545436382293701, + "objective/non_score_reward": -0.17727182805538177, + "objective/rlhf_reward": 4.936369038186967, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.758052349090576, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.572265625, + "step": 452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996041059494019 + }, + { + "episode": 10896, + "epoch": 0.02176127308240764, + "loss/policy_avg": 0.04619492590427399, + "lr": 2.8697277607361963e-06, + "objective/entropy": 130.44210815429688, + "objective/kl": 2.658170700073242, + "objective/non_score_reward": -0.13290853798389435, + "objective/rlhf_reward": -0.7974511682987213, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.749529838562012, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.568359375, + "step": 453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976425170898438 + }, + { + "episode": 10920, + "epoch": 0.021809205401972415, + "loss/policy_avg": 0.05673223361372948, + "lr": 2.8694401840490797e-06, + "objective/entropy": 112.97125244140625, + "objective/kl": 5.225466728210449, + "objective/non_score_reward": -0.2612733244895935, + "objective/rlhf_reward": -1.5676399245858192, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.714008331298828, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9959455728530884 + }, + { + "episode": 10944, + "epoch": 0.02185713772153719, + "loss/policy_avg": 0.022186409682035446, + "lr": 2.869152607361963e-06, + "objective/entropy": 71.6819839477539, + "objective/kl": 3.055636405944824, + "objective/non_score_reward": -0.15278181433677673, + "objective/rlhf_reward": -0.9166908711194992, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.588707208633423, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.328125, + "step": 455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998382568359375 + }, + { + "episode": 10968, + "epoch": 0.021905070041101964, + "loss/policy_avg": 0.0224332045763731, + "lr": 2.8688650306748466e-06, + "objective/entropy": 115.0306625366211, + "objective/kl": 3.9927878379821777, + "objective/non_score_reward": -0.19963940978050232, + "objective/rlhf_reward": -1.1978363618254662, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6733665466308594, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.498046875, + "step": 456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9969961643218994 + }, + { + "episode": 10992, + "epoch": 0.02195300236066674, + "loss/policy_avg": 0.08163905143737793, + "lr": 2.86857745398773e-06, + "objective/entropy": 78.45004272460938, + "objective/kl": 3.6262941360473633, + "objective/non_score_reward": -0.18131470680236816, + "objective/rlhf_reward": -1.087888240814209, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.853886127471924, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3564453125, + "step": 457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006041526794434 + }, + { + "episode": 11016, + "epoch": 0.022000934680231513, + "loss/policy_avg": 0.007836979813873768, + "lr": 2.8682898773006134e-06, + "objective/entropy": 79.47767639160156, + "objective/kl": 3.672515869140625, + "objective/non_score_reward": -0.18362578749656677, + "objective/rlhf_reward": 1.219362140779591, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.3275498151779175, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.34765625, + "step": 458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003304958343506 + }, + { + "episode": 11040, + "epoch": 0.022048866999796288, + "loss/policy_avg": 0.018270887434482574, + "lr": 2.868002300613497e-06, + "objective/entropy": 87.16045379638672, + "objective/kl": 3.8720622062683105, + "objective/non_score_reward": -0.19360308349132538, + "objective/rlhf_reward": -1.1616184934973717, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3626997470855713, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.412109375, + "step": 459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996235370635986 + }, + { + "episode": 11064, + "epoch": 0.022096799319361062, + "loss/policy_avg": 0.13990378379821777, + "lr": 2.8677147239263803e-06, + "objective/entropy": 99.79820251464844, + "objective/kl": 3.052666187286377, + "objective/non_score_reward": -0.15263330936431885, + "objective/rlhf_reward": 0.8185891315237366, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.125094413757324, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.42578125, + "step": 460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039730072021484 + }, + { + "episode": 11088, + "epoch": 0.022144731638925837, + "loss/policy_avg": -0.0029341094195842743, + "lr": 2.867427147239264e-06, + "objective/entropy": 86.119873046875, + "objective/kl": 3.6622061729431152, + "objective/non_score_reward": -0.1831103265285492, + "objective/rlhf_reward": -1.0986618362367153, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.817934989929199, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.435546875, + "step": 461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993889331817627 + }, + { + "episode": 11112, + "epoch": 0.02219266395849061, + "loss/policy_avg": 0.005317240487784147, + "lr": 2.8671395705521475e-06, + "objective/entropy": 88.50054931640625, + "objective/kl": 4.342528820037842, + "objective/non_score_reward": -0.21712645888328552, + "objective/rlhf_reward": 0.4316303238169037, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.0392327308654785, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.392578125, + "step": 462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989891052246094 + }, + { + "episode": 11136, + "epoch": 0.022240596278055386, + "loss/policy_avg": 0.050183285027742386, + "lr": 2.866851993865031e-06, + "objective/entropy": 91.42330169677734, + "objective/kl": 3.9373717308044434, + "objective/non_score_reward": -0.19686858355998993, + "objective/rlhf_reward": -1.1812114948406816, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.970966339111328, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.41796875, + "step": 463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985582828521729 + }, + { + "episode": 11160, + "epoch": 0.02228852859762016, + "loss/policy_avg": 0.02141093835234642, + "lr": 2.8665644171779144e-06, + "objective/entropy": 95.68562316894531, + "objective/kl": 3.931685209274292, + "objective/non_score_reward": -0.19658426940441132, + "objective/rlhf_reward": -1.1795055642724037, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.816255569458008, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41796875, + "step": 464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980010986328125 + }, + { + "episode": 11184, + "epoch": 0.022336460917184935, + "loss/policy_avg": 0.024790557101368904, + "lr": 2.866276840490798e-06, + "objective/entropy": 131.07937622070312, + "objective/kl": 3.3283536434173584, + "objective/non_score_reward": -0.1664176881313324, + "objective/rlhf_reward": -0.9985060691833496, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5698788166046143, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983954429626465 + }, + { + "episode": 11208, + "epoch": 0.02238439323674971, + "loss/policy_avg": 0.04583810269832611, + "lr": 2.8659892638036812e-06, + "objective/entropy": 100.99491882324219, + "objective/kl": 3.5772929191589355, + "objective/non_score_reward": -0.17886465787887573, + "objective/rlhf_reward": -1.073187917470932, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.947303771972656, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.455078125, + "step": 466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962186813354492 + }, + { + "episode": 11232, + "epoch": 0.022432325556314484, + "loss/policy_avg": 0.04181257635354996, + "lr": 2.8657016871165643e-06, + "objective/entropy": 85.98899841308594, + "objective/kl": 4.284356594085693, + "objective/non_score_reward": -0.21421782672405243, + "objective/rlhf_reward": -1.2853069864213467, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.254854202270508, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3984375, + "step": 467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962608814239502 + }, + { + "episode": 11256, + "epoch": 0.022480257875879258, + "loss/policy_avg": 0.006210983730852604, + "lr": 2.8654141104294477e-06, + "objective/entropy": 101.96221923828125, + "objective/kl": 6.427763938903809, + "objective/non_score_reward": -0.3213881850242615, + "objective/rlhf_reward": 0.3927878599215511, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 6.841257095336914, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4384765625, + "step": 468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015573501586914 + }, + { + "episode": 11280, + "epoch": 0.022528190195444033, + "loss/policy_avg": 0.038610316812992096, + "lr": 2.865126533742331e-06, + "objective/entropy": 89.04576110839844, + "objective/kl": 2.730928421020508, + "objective/non_score_reward": -0.13654643297195435, + "objective/rlhf_reward": 5.180721454322338, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.8916349411010742, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.412109375, + "step": 469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997050762176514 + }, + { + "episode": 11304, + "epoch": 0.022576122515008807, + "loss/policy_avg": 0.1737823486328125, + "lr": 2.8648389570552145e-06, + "objective/entropy": 99.1282958984375, + "objective/kl": 4.144288063049316, + "objective/non_score_reward": -0.20721435546875, + "objective/rlhf_reward": -1.2432862594723701, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.354764938354492, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.44140625, + "step": 470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0033676624298096 + }, + { + "episode": 11328, + "epoch": 0.02262405483457358, + "loss/policy_avg": 0.016087224707007408, + "lr": 2.864551380368098e-06, + "objective/entropy": 90.0741195678711, + "objective/kl": 3.3213400840759277, + "objective/non_score_reward": -0.1660670042037964, + "objective/rlhf_reward": 2.0035979822278023, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.043557643890381, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.427734375, + "step": 471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002535343170166 + }, + { + "episode": 11352, + "epoch": 0.022671987154138356, + "loss/policy_avg": -0.010089368559420109, + "lr": 2.864263803680982e-06, + "objective/entropy": 85.12727355957031, + "objective/kl": 4.258035659790039, + "objective/non_score_reward": -0.21290180087089539, + "objective/rlhf_reward": 0.4569782867924057, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.9623403549194336, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.380859375, + "step": 472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001938581466675 + }, + { + "episode": 11376, + "epoch": 0.02271991947370313, + "loss/policy_avg": 0.026408130303025246, + "lr": 2.8639762269938652e-06, + "objective/entropy": 102.06597900390625, + "objective/kl": 3.31469988822937, + "objective/non_score_reward": -0.16573497653007507, + "objective/rlhf_reward": 2.7911686324459724, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.54306960105896, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.529296875, + "step": 473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00393009185791 + }, + { + "episode": 11400, + "epoch": 0.022767851793267905, + "loss/policy_avg": 0.06259097903966904, + "lr": 2.8636886503067487e-06, + "objective/entropy": 89.93965148925781, + "objective/kl": 4.1927385330200195, + "objective/non_score_reward": -0.20963692665100098, + "objective/rlhf_reward": -1.257821574807167, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.816557884216309, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3935546875, + "step": 474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998142957687378 + }, + { + "episode": 11424, + "epoch": 0.02281578411283268, + "loss/policy_avg": 0.08352553099393845, + "lr": 2.863401073619632e-06, + "objective/entropy": 71.9052505493164, + "objective/kl": 5.034181118011475, + "objective/non_score_reward": -0.2517090439796448, + "objective/rlhf_reward": -1.5102543234825134, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.4304656982421875, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.453125, + "step": 475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9967167377471924 + }, + { + "episode": 11448, + "epoch": 0.022863716432397454, + "loss/policy_avg": 0.23256786167621613, + "lr": 2.8631134969325155e-06, + "objective/entropy": 91.92772674560547, + "objective/kl": 5.284446716308594, + "objective/non_score_reward": -0.2642223536968231, + "objective/rlhf_reward": 0.22084588905585145, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.9694204330444336, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4775390625, + "step": 476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994295835494995 + }, + { + "episode": 11472, + "epoch": 0.02291164875196223, + "loss/policy_avg": -0.012431513518095016, + "lr": 2.862825920245399e-06, + "objective/entropy": 106.69951629638672, + "objective/kl": 4.014987945556641, + "objective/non_score_reward": -0.20074938237667084, + "objective/rlhf_reward": 0.6016837207020554, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 0.9266321659088135, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.529296875, + "step": 477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007989406585693 + }, + { + "episode": 11496, + "epoch": 0.022959581071527003, + "loss/policy_avg": 0.016697635874152184, + "lr": 2.8625383435582824e-06, + "objective/entropy": 83.09112548828125, + "objective/kl": 4.378654479980469, + "objective/non_score_reward": -0.21893273293972015, + "objective/rlhf_reward": 0.42079258262074815, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.174764633178711, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3828125, + "step": 478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978358745574951 + }, + { + "episode": 11520, + "epoch": 0.023007513391091777, + "loss/policy_avg": 0.016456590965390205, + "lr": 2.8622507668711658e-06, + "objective/entropy": 82.7313232421875, + "objective/kl": 3.221165657043457, + "objective/non_score_reward": -0.16105830669403076, + "objective/rlhf_reward": 1.6177095678808184, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.2963664531707764, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4033203125, + "step": 479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000460624694824 + }, + { + "episode": 11544, + "epoch": 0.023055445710656552, + "loss/policy_avg": 0.05686425417661667, + "lr": 2.8619631901840492e-06, + "objective/entropy": 70.43087005615234, + "objective/kl": 5.8952765464782715, + "objective/non_score_reward": -0.29476383328437805, + "objective/rlhf_reward": -1.7685829848051071, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.606513023376465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.32421875, + "step": 480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9950857162475586 + }, + { + "episode": 11568, + "epoch": 0.023103378030221326, + "loss/policy_avg": 0.17201220989227295, + "lr": 2.8616756134969326e-06, + "objective/entropy": 108.549560546875, + "objective/kl": 4.427221298217773, + "objective/non_score_reward": -0.22136108577251434, + "objective/rlhf_reward": 0.6718336045742034, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.529676914215088, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.501953125, + "step": 481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000444173812866 + }, + { + "episode": 11592, + "epoch": 0.0231513103497861, + "loss/policy_avg": 0.09017947316169739, + "lr": 2.861388036809816e-06, + "objective/entropy": 73.33565521240234, + "objective/kl": 4.623129844665527, + "objective/non_score_reward": -0.23115651309490204, + "objective/rlhf_reward": -1.3869389854371548, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2485336065292358, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.439453125, + "step": 482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0016582012176514 + }, + { + "episode": 11616, + "epoch": 0.023199242669350875, + "loss/policy_avg": -0.005082109943032265, + "lr": 2.8611004601226995e-06, + "objective/entropy": 103.50146484375, + "objective/kl": 3.3150641918182373, + "objective/non_score_reward": -0.16575321555137634, + "objective/rlhf_reward": 5.005480691790581, + "objective/scores": 1.0, + "policy/approxkl_avg": 9.281396865844727, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4384765625, + "step": 483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993665218353271 + }, + { + "episode": 11640, + "epoch": 0.02324717498891565, + "loss/policy_avg": 0.002429109998047352, + "lr": 2.860812883435583e-06, + "objective/entropy": 75.4254150390625, + "objective/kl": 4.040124416351318, + "objective/non_score_reward": -0.20200622081756592, + "objective/rlhf_reward": 1.1090795483041767, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.293373107910156, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3583984375, + "step": 484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005571842193604 + }, + { + "episode": 11664, + "epoch": 0.023295107308480424, + "loss/policy_avg": 0.03243885189294815, + "lr": 2.8605253067484663e-06, + "objective/entropy": 96.67467498779297, + "objective/kl": 4.761052131652832, + "objective/non_score_reward": -0.238052636384964, + "objective/rlhf_reward": -1.4283157140016556, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.2516770362854, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.423828125, + "step": 485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962496757507324 + }, + { + "episode": 11688, + "epoch": 0.0233430396280452, + "loss/policy_avg": -0.01368781365454197, + "lr": 2.8602377300613498e-06, + "objective/entropy": 120.44252014160156, + "objective/kl": 1.636137843132019, + "objective/non_score_reward": -0.08180689811706543, + "objective/rlhf_reward": 1.3153385703803335, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.2647151947021484, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.544921875, + "step": 486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994701147079468 + }, + { + "episode": 11712, + "epoch": 0.023390971947609973, + "loss/policy_avg": 0.014074032194912434, + "lr": 2.859950153374233e-06, + "objective/entropy": 102.04637145996094, + "objective/kl": 4.952628135681152, + "objective/non_score_reward": -0.24763141572475433, + "objective/rlhf_reward": -1.4857884421944618, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4268388748168945, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4267578125, + "step": 487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984090328216553 + }, + { + "episode": 11736, + "epoch": 0.023438904267174748, + "loss/policy_avg": 0.32468974590301514, + "lr": 2.8596625766871166e-06, + "objective/entropy": 93.15440368652344, + "objective/kl": 4.474202632904053, + "objective/non_score_reward": -0.22371014952659607, + "objective/rlhf_reward": 0.6577392071485518, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.347200393676758, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.400390625, + "step": 488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003246307373047 + }, + { + "episode": 11760, + "epoch": 0.023486836586739522, + "loss/policy_avg": 0.02336808666586876, + "lr": 2.859375e-06, + "objective/entropy": 109.06414794921875, + "objective/kl": 4.039943695068359, + "objective/non_score_reward": -0.20199716091156006, + "objective/rlhf_reward": 0.5224059626356445, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 9.504860877990723, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.482421875, + "step": 489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99422025680542 + }, + { + "episode": 11784, + "epoch": 0.0235347689063043, + "loss/policy_avg": -0.019448518753051758, + "lr": 2.8590874233128835e-06, + "objective/entropy": 73.8209457397461, + "objective/kl": 3.6317622661590576, + "objective/non_score_reward": -0.18158811330795288, + "objective/rlhf_reward": -1.089528650045395, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.045463800430298, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3564453125, + "step": 490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006728172302246 + }, + { + "episode": 11808, + "epoch": 0.023582701225869074, + "loss/policy_avg": 0.04375119507312775, + "lr": 2.858799846625767e-06, + "objective/entropy": 102.70416259765625, + "objective/kl": 5.825234889984131, + "objective/non_score_reward": -0.29126179218292236, + "objective/rlhf_reward": -1.7475705742835999, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.346668243408203, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4560546875, + "step": 491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975346326828003 + }, + { + "episode": 11832, + "epoch": 0.02363063354543385, + "loss/policy_avg": 0.014540346339344978, + "lr": 2.8585122699386503e-06, + "objective/entropy": 109.66856384277344, + "objective/kl": 3.0029234886169434, + "objective/non_score_reward": -0.15014617145061493, + "objective/rlhf_reward": 1.4202398892093662, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.6914665699005127, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.509765625, + "step": 492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993436336517334 + }, + { + "episode": 11856, + "epoch": 0.023678565864998623, + "loss/policy_avg": 0.0049804686568677425, + "lr": 2.8582246932515337e-06, + "objective/entropy": 94.58683776855469, + "objective/kl": 4.56975793838501, + "objective/non_score_reward": -0.22848787903785706, + "objective/rlhf_reward": -1.3709272518754005, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.781710624694824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4375, + "step": 493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964604377746582 + }, + { + "episode": 11880, + "epoch": 0.023726498184563398, + "loss/policy_avg": 0.029869168996810913, + "lr": 2.857937116564417e-06, + "objective/entropy": 87.79437255859375, + "objective/kl": 3.0206687450408936, + "objective/non_score_reward": -0.1510334312915802, + "objective/rlhf_reward": 0.9865886971792783, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.4764304161071777, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4033203125, + "step": 494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005240440368652 + }, + { + "episode": 11904, + "epoch": 0.023774430504128172, + "loss/policy_avg": -0.015042989514768124, + "lr": 2.857649539877301e-06, + "objective/entropy": 72.32623291015625, + "objective/kl": 4.725089073181152, + "objective/non_score_reward": -0.23625448346138, + "objective/rlhf_reward": -1.4175268672406673, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.915755271911621, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4091796875, + "step": 495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0011420249938965 + }, + { + "episode": 11928, + "epoch": 0.023822362823692947, + "loss/policy_avg": 0.0674760639667511, + "lr": 2.8573619631901845e-06, + "objective/entropy": 74.95481872558594, + "objective/kl": 5.2560834884643555, + "objective/non_score_reward": -0.26280418038368225, + "objective/rlhf_reward": 0.5604181111265553, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.682612419128418, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3310546875, + "step": 496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004593849182129 + }, + { + "episode": 11952, + "epoch": 0.02387029514325772, + "loss/policy_avg": 0.07120761275291443, + "lr": 2.8570743865030675e-06, + "objective/entropy": 101.38941955566406, + "objective/kl": 4.461763381958008, + "objective/non_score_reward": -0.22308818995952606, + "objective/rlhf_reward": -1.3385290503501892, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.995584011077881, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4599609375, + "step": 497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002059936523438 + }, + { + "episode": 11976, + "epoch": 0.023918227462822496, + "loss/policy_avg": -0.024035777896642685, + "lr": 2.856786809815951e-06, + "objective/entropy": 81.1277084350586, + "objective/kl": 5.193532466888428, + "objective/non_score_reward": -0.25967663526535034, + "objective/rlhf_reward": 0.2481202368975911, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.894901752471924, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4677734375, + "step": 498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973117113113403 + }, + { + "episode": 12000, + "epoch": 0.02396615978238727, + "loss/policy_avg": -0.009149005636572838, + "lr": 2.8564992331288343e-06, + "objective/entropy": 115.399169921875, + "objective/kl": 8.054191589355469, + "objective/non_score_reward": -0.40270957350730896, + "objective/rlhf_reward": 0.5837425515055656, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.598761558532715, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4912109375, + "step": 499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998589754104614 + }, + { + "episode": 12024, + "epoch": 0.024014092101952045, + "loss/policy_avg": 0.04159349948167801, + "lr": 2.8562116564417177e-06, + "objective/entropy": 82.60993194580078, + "objective/kl": 5.335894584655762, + "objective/non_score_reward": -0.26679471135139465, + "objective/rlhf_reward": -1.600768193602562, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9063663482666016, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4443359375, + "step": 500, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000386953353882 + }, + { + "episode": 12048, + "epoch": 0.02406202442151682, + "loss/policy_avg": -0.019558634608983994, + "lr": 2.855924079754601e-06, + "objective/entropy": 117.45049285888672, + "objective/kl": 1.1582202911376953, + "objective/non_score_reward": -0.05791100859642029, + "objective/rlhf_reward": 2.2365932856859656, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.6017698049545288, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.521484375, + "step": 501, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003221273422241 + }, + { + "episode": 12072, + "epoch": 0.024109956741081594, + "loss/policy_avg": 0.17124204337596893, + "lr": 2.8556365030674846e-06, + "objective/entropy": 97.6928939819336, + "objective/kl": 4.5768914222717285, + "objective/non_score_reward": -0.2288445681333542, + "objective/rlhf_reward": 0.5197218519142475, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.156371116638184, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41796875, + "step": 502, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998929500579834 + }, + { + "episode": 12096, + "epoch": 0.024157889060646368, + "loss/policy_avg": -0.009648483246564865, + "lr": 2.855348926380368e-06, + "objective/entropy": 101.44920349121094, + "objective/kl": 2.7913272380828857, + "objective/non_score_reward": -0.1395663321018219, + "objective/rlhf_reward": -0.8373980335891247, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8518749475479126, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.498046875, + "step": 503, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0002248287200928 + }, + { + "episode": 12120, + "epoch": 0.024205821380211143, + "loss/policy_avg": 0.12513300776481628, + "lr": 2.8550613496932514e-06, + "objective/entropy": 110.22604370117188, + "objective/kl": 7.349318981170654, + "objective/non_score_reward": -0.36746594309806824, + "objective/rlhf_reward": -0.20479563623666774, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 7.277237892150879, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4970703125, + "step": 504, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9953255653381348 + }, + { + "episode": 12144, + "epoch": 0.024253753699775917, + "loss/policy_avg": -0.0008101798593997955, + "lr": 2.854773773006135e-06, + "objective/entropy": 114.38346862792969, + "objective/kl": 2.5578360557556152, + "objective/non_score_reward": -0.12789177894592285, + "objective/rlhf_reward": 1.816708622610757, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.551894187927246, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.537109375, + "step": 505, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004889965057373 + }, + { + "episode": 12168, + "epoch": 0.02430168601934069, + "loss/policy_avg": 0.01656084507703781, + "lr": 2.8544861963190187e-06, + "objective/entropy": 115.84288024902344, + "objective/kl": 1.669954538345337, + "objective/non_score_reward": -0.08349772542715073, + "objective/rlhf_reward": -0.5009863451123238, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.744967460632324, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.53125, + "step": 506, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998966932296753 + }, + { + "episode": 12192, + "epoch": 0.024349618338905466, + "loss/policy_avg": 0.035612378269433975, + "lr": 2.854198619631902e-06, + "objective/entropy": 114.54009246826172, + "objective/kl": 3.963474988937378, + "objective/non_score_reward": -0.19817376136779785, + "objective/rlhf_reward": -1.1890425086021423, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.172623872756958, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 507, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003833770751953 + }, + { + "episode": 12216, + "epoch": 0.02439755065847024, + "loss/policy_avg": 0.015122998505830765, + "lr": 2.8539110429447856e-06, + "objective/entropy": 146.95428466796875, + "objective/kl": 4.422600746154785, + "objective/non_score_reward": -0.22113001346588135, + "objective/rlhf_reward": -1.326780118048191, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7787455320358276, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.615234375, + "step": 508, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001955270767212 + }, + { + "episode": 12240, + "epoch": 0.024445482978035015, + "loss/policy_avg": 0.3376290500164032, + "lr": 2.853623466257669e-06, + "objective/entropy": 96.29425048828125, + "objective/kl": 5.758080959320068, + "objective/non_score_reward": -0.2879040837287903, + "objective/rlhf_reward": -1.7274244576692581, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.06043004989624, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.541015625, + "step": 509, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9997632503509521 + }, + { + "episode": 12264, + "epoch": 0.02449341529759979, + "loss/policy_avg": 0.012225430458784103, + "lr": 2.8533358895705524e-06, + "objective/entropy": 97.1302719116211, + "objective/kl": 4.125348091125488, + "objective/non_score_reward": -0.20626741647720337, + "objective/rlhf_reward": 0.7623955793678759, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.837220668792725, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.431640625, + "step": 510, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00093412399292 + }, + { + "episode": 12288, + "epoch": 0.024541347617164564, + "loss/policy_avg": 0.1214967891573906, + "lr": 2.853048312883436e-06, + "objective/entropy": 86.03912353515625, + "objective/kl": 2.2983150482177734, + "objective/non_score_reward": -0.11491575092077255, + "objective/rlhf_reward": 2.3105054683983326, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.424278974533081, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.37890625, + "step": 511, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024726390838623 + }, + { + "episode": 12312, + "epoch": 0.02458927993672934, + "loss/policy_avg": 0.21219781041145325, + "lr": 2.852760736196319e-06, + "objective/entropy": 92.06816101074219, + "objective/kl": 5.616193771362305, + "objective/non_score_reward": -0.2808097004890442, + "objective/rlhf_reward": 0.4523850612748993, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.6290481090545654, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4755859375, + "step": 512, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994451999664307 + }, + { + "episode": 12336, + "epoch": 0.024637212256294113, + "loss/policy_avg": 0.007288619875907898, + "lr": 2.8524731595092023e-06, + "objective/entropy": 100.42236328125, + "objective/kl": 6.954169750213623, + "objective/non_score_reward": -0.34770846366882324, + "objective/rlhf_reward": 0.9137490317225456, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.7331228256225586, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4619140625, + "step": 513, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002845287322998 + }, + { + "episode": 12360, + "epoch": 0.024685144575858887, + "loss/policy_avg": 0.04623780772089958, + "lr": 2.8521855828220857e-06, + "objective/entropy": 83.37246704101562, + "objective/kl": 3.2128772735595703, + "objective/non_score_reward": -0.16064387559890747, + "objective/rlhf_reward": 0.7705257527426563, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.1856279373168945, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3974609375, + "step": 514, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991801977157593 + }, + { + "episode": 12384, + "epoch": 0.024733076895423662, + "loss/policy_avg": 0.04811213165521622, + "lr": 2.851898006134969e-06, + "objective/entropy": 98.9010238647461, + "objective/kl": 5.479508876800537, + "objective/non_score_reward": -0.2739754319190979, + "objective/rlhf_reward": 0.35614745318889607, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.817846298217773, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4228515625, + "step": 515, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988760948181152 + }, + { + "episode": 12408, + "epoch": 0.024781009214988436, + "loss/policy_avg": 0.08421969413757324, + "lr": 2.851610429447853e-06, + "objective/entropy": 77.08775329589844, + "objective/kl": 4.280744552612305, + "objective/non_score_reward": -0.214037224650383, + "objective/rlhf_reward": 4.715776681900024, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.5768558979034424, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3759765625, + "step": 516, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994008541107178 + }, + { + "episode": 12432, + "epoch": 0.02482894153455321, + "loss/policy_avg": 0.07634267210960388, + "lr": 2.8513228527607364e-06, + "objective/entropy": 112.64117431640625, + "objective/kl": 7.24207067489624, + "objective/non_score_reward": -0.36210355162620544, + "objective/rlhf_reward": 3.8273788392543793, + "objective/scores": 1.0, + "policy/approxkl_avg": 6.28758430480957, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.490234375, + "step": 517, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.994844675064087 + }, + { + "episode": 12456, + "epoch": 0.024876873854117985, + "loss/policy_avg": 0.09299606829881668, + "lr": 2.85103527607362e-06, + "objective/entropy": 125.58233642578125, + "objective/kl": 5.107172012329102, + "objective/non_score_reward": -0.25535857677459717, + "objective/rlhf_reward": 2.253426971374195, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.571087837219238, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5546875, + "step": 518, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986679553985596 + }, + { + "episode": 12480, + "epoch": 0.02492480617368276, + "loss/policy_avg": 0.061636269092559814, + "lr": 2.8507476993865032e-06, + "objective/entropy": 114.10973358154297, + "objective/kl": 3.129873037338257, + "objective/non_score_reward": -0.1564936637878418, + "objective/rlhf_reward": -0.9389619547873735, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8650288581848145, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4833984375, + "step": 519, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985730648040771 + }, + { + "episode": 12504, + "epoch": 0.024972738493247534, + "loss/policy_avg": 0.03763078898191452, + "lr": 2.8504601226993867e-06, + "objective/entropy": 95.9405517578125, + "objective/kl": 3.532863140106201, + "objective/non_score_reward": -0.17664313316345215, + "objective/rlhf_reward": 1.0773842715133561, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.361450672149658, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44921875, + "step": 520, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963849782943726 + }, + { + "episode": 12528, + "epoch": 0.02502067081281231, + "loss/policy_avg": 0.04901731014251709, + "lr": 2.85017254601227e-06, + "objective/entropy": 97.36045837402344, + "objective/kl": 4.990077495574951, + "objective/non_score_reward": -0.24950385093688965, + "objective/rlhf_reward": -1.497023232281208, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.514789581298828, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4208984375, + "step": 521, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999291896820068 + }, + { + "episode": 12552, + "epoch": 0.025068603132377083, + "loss/policy_avg": 0.057858798652887344, + "lr": 2.8498849693251535e-06, + "objective/entropy": 79.57304382324219, + "objective/kl": 4.864692211151123, + "objective/non_score_reward": -0.24323460459709167, + "objective/rlhf_reward": 0.6778354559500349, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 7.7043657302856445, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.44140625, + "step": 522, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9954159259796143 + }, + { + "episode": 12576, + "epoch": 0.025116535451941858, + "loss/policy_avg": 0.023719266057014465, + "lr": 2.849597392638037e-06, + "objective/entropy": 103.38787841796875, + "objective/kl": 4.669527053833008, + "objective/non_score_reward": -0.23347634077072144, + "objective/rlhf_reward": 0.4053218623043332, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.846221685409546, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51171875, + "step": 523, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025229454040527 + }, + { + "episode": 12600, + "epoch": 0.025164467771506632, + "loss/policy_avg": 0.24500982463359833, + "lr": 2.8493098159509204e-06, + "objective/entropy": 82.87353515625, + "objective/kl": 5.500484466552734, + "objective/non_score_reward": -0.2750242352485657, + "objective/rlhf_reward": -1.6501452922821045, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.239179611206055, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3720703125, + "step": 524, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013680458068848 + }, + { + "episode": 12624, + "epoch": 0.025212400091071407, + "loss/policy_avg": 0.014119019731879234, + "lr": 2.849022239263804e-06, + "objective/entropy": 134.18128967285156, + "objective/kl": 2.8478400707244873, + "objective/non_score_reward": -0.14239200949668884, + "objective/rlhf_reward": -0.8543520420789719, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2307324409484863, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.64453125, + "step": 525, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998753309249878 + }, + { + "episode": 12648, + "epoch": 0.02526033241063618, + "loss/policy_avg": 0.07435805350542068, + "lr": 2.8487346625766872e-06, + "objective/entropy": 89.86431884765625, + "objective/kl": 6.178718566894531, + "objective/non_score_reward": -0.3089359402656555, + "objective/rlhf_reward": 0.28362760026348954, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.093132734298706, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.37890625, + "step": 526, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003016948699951 + }, + { + "episode": 12672, + "epoch": 0.025308264730200956, + "loss/policy_avg": 0.06784027069807053, + "lr": 2.8484470858895707e-06, + "objective/entropy": 99.95799255371094, + "objective/kl": 1.34434175491333, + "objective/non_score_reward": -0.06721709668636322, + "objective/rlhf_reward": -0.40330255031585693, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.876334309577942, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5078125, + "step": 527, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000293731689453 + }, + { + "episode": 12696, + "epoch": 0.02535619704976573, + "loss/policy_avg": -0.01629638858139515, + "lr": 2.848159509202454e-06, + "objective/entropy": 85.66056823730469, + "objective/kl": 3.7671995162963867, + "objective/non_score_reward": -0.18835997581481934, + "objective/rlhf_reward": 1.0070831932534112, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.549825668334961, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.36328125, + "step": 528, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989556074142456 + }, + { + "episode": 12720, + "epoch": 0.025404129369330505, + "loss/policy_avg": 0.053887709975242615, + "lr": 2.8478719325153375e-06, + "objective/entropy": 101.66270446777344, + "objective/kl": 3.3986587524414062, + "objective/non_score_reward": -0.16993294656276703, + "objective/rlhf_reward": -1.0195976682007313, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.210090160369873, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49609375, + "step": 529, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000427007675171 + }, + { + "episode": 12744, + "epoch": 0.02545206168889528, + "loss/policy_avg": 0.016393592581152916, + "lr": 2.847584355828221e-06, + "objective/entropy": 87.07431030273438, + "objective/kl": 5.258113384246826, + "objective/non_score_reward": -0.26290568709373474, + "objective/rlhf_reward": -1.5774340480566025, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5561914443969727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3935546875, + "step": 530, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00054931640625 + }, + { + "episode": 12768, + "epoch": 0.025499994008460054, + "loss/policy_avg": -0.0034183645620942116, + "lr": 2.8472967791411044e-06, + "objective/entropy": 89.7046127319336, + "objective/kl": 3.71132755279541, + "objective/non_score_reward": -0.18556639552116394, + "objective/rlhf_reward": -1.1133983135223389, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.517796516418457, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3779296875, + "step": 531, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996910810470581 + }, + { + "episode": 12792, + "epoch": 0.025547926328024828, + "loss/policy_avg": -0.005856037139892578, + "lr": 2.8470092024539878e-06, + "objective/entropy": 99.97640228271484, + "objective/kl": 3.9198479652404785, + "objective/non_score_reward": -0.19599241018295288, + "objective/rlhf_reward": 0.7168348517707195, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.263566970825195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46875, + "step": 532, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999324083328247 + }, + { + "episode": 12816, + "epoch": 0.025595858647589603, + "loss/policy_avg": 0.04740133509039879, + "lr": 2.8467216257668712e-06, + "objective/entropy": 134.374755859375, + "objective/kl": 3.6495845317840576, + "objective/non_score_reward": -0.18247920274734497, + "objective/rlhf_reward": 0.79791407216998, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.2694649696350098, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.576171875, + "step": 533, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000791072845459 + }, + { + "episode": 12840, + "epoch": 0.025643790967154377, + "loss/policy_avg": 0.013124587945640087, + "lr": 2.8464340490797546e-06, + "objective/entropy": 81.40655517578125, + "objective/kl": 3.8682937622070312, + "objective/non_score_reward": -0.19341471791267395, + "objective/rlhf_reward": -1.1604881808161736, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4671647548675537, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.380859375, + "step": 534, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989275932312012 + }, + { + "episode": 12864, + "epoch": 0.02569172328671915, + "loss/policy_avg": 0.009399760514497757, + "lr": 2.846146472392638e-06, + "objective/entropy": 103.51649475097656, + "objective/kl": 1.8976939916610718, + "objective/non_score_reward": -0.09488469362258911, + "objective/rlhf_reward": 1.1650807589188896, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 6.039618492126465, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4921875, + "step": 535, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975980520248413 + }, + { + "episode": 12888, + "epoch": 0.025739655606283926, + "loss/policy_avg": 0.04003096744418144, + "lr": 2.8458588957055215e-06, + "objective/entropy": 99.14649963378906, + "objective/kl": 3.5289571285247803, + "objective/non_score_reward": -0.17644786834716797, + "objective/rlhf_reward": 4.941312812268734, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.65523624420166, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4638671875, + "step": 536, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000193119049072 + }, + { + "episode": 12912, + "epoch": 0.0257875879258487, + "loss/policy_avg": -0.02170879952609539, + "lr": 2.845571319018405e-06, + "objective/entropy": 127.4202880859375, + "objective/kl": 2.390782594680786, + "objective/non_score_reward": -0.11953914165496826, + "objective/rlhf_reward": 1.2827651500701904, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.2339155673980713, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.54296875, + "step": 537, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9995496273040771 + }, + { + "episode": 12936, + "epoch": 0.025835520245413475, + "loss/policy_avg": -0.0023226316552609205, + "lr": 2.8452837423312883e-06, + "objective/entropy": 110.65887451171875, + "objective/kl": 4.224815845489502, + "objective/non_score_reward": -0.2112407684326172, + "objective/rlhf_reward": 0.5387352739812169, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.0213584899902344, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.498046875, + "step": 538, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000672340393066 + }, + { + "episode": 12960, + "epoch": 0.02588345256497825, + "loss/policy_avg": 0.0656738355755806, + "lr": 2.8449961656441718e-06, + "objective/entropy": 84.85826873779297, + "objective/kl": 4.309027194976807, + "objective/non_score_reward": -0.21545135974884033, + "objective/rlhf_reward": -1.2927081175148487, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2352041006088257, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3935546875, + "step": 539, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00160551071167 + }, + { + "episode": 12984, + "epoch": 0.025931384884543024, + "loss/policy_avg": 0.02835928089916706, + "lr": 2.8447085889570556e-06, + "objective/entropy": 98.23834228515625, + "objective/kl": 3.944690465927124, + "objective/non_score_reward": -0.19723451137542725, + "objective/rlhf_reward": 1.400652257836053, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.521748661994934, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4560546875, + "step": 540, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013771057128906 + }, + { + "episode": 13008, + "epoch": 0.0259793172041078, + "loss/policy_avg": 0.007454338949173689, + "lr": 2.8444210122699386e-06, + "objective/entropy": 84.56394958496094, + "objective/kl": 2.3241615295410156, + "objective/non_score_reward": -0.11620807647705078, + "objective/rlhf_reward": 1.0371405400232159, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.547837972640991, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.408203125, + "step": 541, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000685214996338 + }, + { + "episode": 13032, + "epoch": 0.026027249523672573, + "loss/policy_avg": 0.027467647567391396, + "lr": 2.844133435582822e-06, + "objective/entropy": 72.6270751953125, + "objective/kl": 2.762265205383301, + "objective/non_score_reward": -0.13811327517032623, + "objective/rlhf_reward": -0.8286796286702156, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.047220230102539, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3916015625, + "step": 542, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998673915863037 + }, + { + "episode": 13056, + "epoch": 0.026075181843237347, + "loss/policy_avg": 0.19625544548034668, + "lr": 2.8438458588957055e-06, + "objective/entropy": 86.93348693847656, + "objective/kl": 5.877695083618164, + "objective/non_score_reward": -0.2938847839832306, + "objective/rlhf_reward": -1.7633086182177067, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.264875411987305, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3681640625, + "step": 543, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998779296875 + }, + { + "episode": 13080, + "epoch": 0.026123114162802122, + "loss/policy_avg": -0.05115317553281784, + "lr": 2.843558282208589e-06, + "objective/entropy": 81.25460815429688, + "objective/kl": 2.1282925605773926, + "objective/non_score_reward": -0.10641463845968246, + "objective/rlhf_reward": -0.6384877860546112, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5974056720733643, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.365234375, + "step": 544, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993257522583008 + }, + { + "episode": 13104, + "epoch": 0.026171046482366896, + "loss/policy_avg": 0.06661288440227509, + "lr": 2.8432707055214723e-06, + "objective/entropy": 78.95567321777344, + "objective/kl": 1.973778486251831, + "objective/non_score_reward": -0.09868893027305603, + "objective/rlhf_reward": 1.1422553799942814, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.442002296447754, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.373046875, + "step": 545, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968225955963135 + }, + { + "episode": 13128, + "epoch": 0.02621897880193167, + "loss/policy_avg": 0.046537891030311584, + "lr": 2.8429831288343558e-06, + "objective/entropy": 116.42289733886719, + "objective/kl": 2.9185924530029297, + "objective/non_score_reward": -0.14592963457107544, + "objective/rlhf_reward": -0.87557777389884, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.830223083496094, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.537109375, + "step": 546, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999525547027588 + }, + { + "episode": 13152, + "epoch": 0.02626691112149645, + "loss/policy_avg": 0.058385539799928665, + "lr": 2.842695552147239e-06, + "objective/entropy": 95.09030151367188, + "objective/kl": 3.2010011672973633, + "objective/non_score_reward": -0.16005006432533264, + "objective/rlhf_reward": 0.9324888971141185, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.761023998260498, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.46484375, + "step": 547, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021708011627197 + }, + { + "episode": 13176, + "epoch": 0.026314843441061223, + "loss/policy_avg": 0.01292688399553299, + "lr": 2.8424079754601226e-06, + "objective/entropy": 91.11614227294922, + "objective/kl": 4.497677803039551, + "objective/non_score_reward": -0.22488388419151306, + "objective/rlhf_reward": -1.3493032157421112, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.2223236560821533, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.400390625, + "step": 548, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002939701080322 + }, + { + "episode": 13200, + "epoch": 0.026362775760625998, + "loss/policy_avg": 0.08357983082532883, + "lr": 2.842120398773006e-06, + "objective/entropy": 97.10147094726562, + "objective/kl": 5.985170364379883, + "objective/non_score_reward": -0.2992585301399231, + "objective/rlhf_reward": 0.3416919492591751, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 6.657796382904053, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.45703125, + "step": 549, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967410564422607 + }, + { + "episode": 13224, + "epoch": 0.026410708080190772, + "loss/policy_avg": 0.02868782728910446, + "lr": 2.84183282208589e-06, + "objective/entropy": 105.6905517578125, + "objective/kl": 1.896695852279663, + "objective/non_score_reward": -0.09483479708433151, + "objective/rlhf_reward": -0.5690087676048279, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8552749156951904, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4658203125, + "step": 550, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001020908355713 + }, + { + "episode": 13248, + "epoch": 0.026458640399755547, + "loss/policy_avg": -0.0176153052598238, + "lr": 2.8415452453987733e-06, + "objective/entropy": 113.1185531616211, + "objective/kl": 3.937284231185913, + "objective/non_score_reward": -0.19686418771743774, + "objective/rlhf_reward": -1.1811851970851421, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7943179607391357, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4716796875, + "step": 551, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001751184463501 + }, + { + "episode": 13272, + "epoch": 0.02650657271932032, + "loss/policy_avg": 0.08580126613378525, + "lr": 2.8412576687116567e-06, + "objective/entropy": 81.77702331542969, + "objective/kl": 5.145464897155762, + "objective/non_score_reward": -0.25727325677871704, + "objective/rlhf_reward": -1.543639436364174, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.034655809402466, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3486328125, + "step": 552, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998135566711426 + }, + { + "episode": 13296, + "epoch": 0.026554505038885096, + "loss/policy_avg": 0.01750761643052101, + "lr": 2.84097009202454e-06, + "objective/entropy": 68.23872375488281, + "objective/kl": 2.663771152496338, + "objective/non_score_reward": -0.13318856060504913, + "objective/rlhf_reward": 0.935257588689097, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.020416259765625, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.353515625, + "step": 553, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981495141983032 + }, + { + "episode": 13320, + "epoch": 0.02660243735844987, + "loss/policy_avg": 0.0051691727712750435, + "lr": 2.8406825153374236e-06, + "objective/entropy": 76.99561309814453, + "objective/kl": 3.5840210914611816, + "objective/non_score_reward": -0.17920105159282684, + "objective/rlhf_reward": -1.0752062983810902, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5945792198181152, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3427734375, + "step": 554, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000096082687378 + }, + { + "episode": 13344, + "epoch": 0.026650369678014645, + "loss/policy_avg": -0.012288580648601055, + "lr": 2.840394938650307e-06, + "objective/entropy": 89.03411865234375, + "objective/kl": 2.6943607330322266, + "objective/non_score_reward": -0.13471806049346924, + "objective/rlhf_reward": -0.8083082735538483, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6570206880569458, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.416015625, + "step": 555, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998886585235596 + }, + { + "episode": 13368, + "epoch": 0.02669830199757942, + "loss/policy_avg": 0.03779691457748413, + "lr": 2.84010736196319e-06, + "objective/entropy": 83.56863403320312, + "objective/kl": 4.6169514656066895, + "objective/non_score_reward": -0.23084756731987, + "objective/rlhf_reward": 0.4210945700646672, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 6.541321754455566, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3818359375, + "step": 556, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9947904348373413 + }, + { + "episode": 13392, + "epoch": 0.026746234317144194, + "loss/policy_avg": 0.018883923068642616, + "lr": 2.8398197852760734e-06, + "objective/entropy": 88.85995483398438, + "objective/kl": 2.4565415382385254, + "objective/non_score_reward": -0.12282707542181015, + "objective/rlhf_reward": -0.7369624339044094, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.808292865753174, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4130859375, + "step": 557, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983946084976196 + }, + { + "episode": 13416, + "epoch": 0.026794166636708968, + "loss/policy_avg": 0.032795943319797516, + "lr": 2.839532208588957e-06, + "objective/entropy": 88.42547607421875, + "objective/kl": 2.779268264770508, + "objective/non_score_reward": -0.13896340131759644, + "objective/rlhf_reward": 1.0590087857535686, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.199113845825195, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.384765625, + "step": 558, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990257024765015 + }, + { + "episode": 13440, + "epoch": 0.026842098956273742, + "loss/policy_avg": -0.013005442917346954, + "lr": 2.8392446319018403e-06, + "objective/entropy": 89.93621063232422, + "objective/kl": 5.951442241668701, + "objective/non_score_reward": -0.29757213592529297, + "objective/rlhf_reward": 4.214567296206951, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.572275161743164, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4384765625, + "step": 559, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0028109550476074 + }, + { + "episode": 13464, + "epoch": 0.026890031275838517, + "loss/policy_avg": -0.013624023646116257, + "lr": 2.838957055214724e-06, + "objective/entropy": 100.53422546386719, + "objective/kl": 1.2797200679779053, + "objective/non_score_reward": -0.06398600339889526, + "objective/rlhf_reward": -0.3839160241186619, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5100646018981934, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4736328125, + "step": 560, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003835678100586 + }, + { + "episode": 13488, + "epoch": 0.02693796359540329, + "loss/policy_avg": -0.042243629693984985, + "lr": 2.8386694785276076e-06, + "objective/entropy": 109.6282958984375, + "objective/kl": 4.806629180908203, + "objective/non_score_reward": -0.24033145606517792, + "objective/rlhf_reward": 4.558011218905449, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.110781669616699, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4736328125, + "step": 561, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015041828155518 + }, + { + "episode": 13512, + "epoch": 0.026985895914968066, + "loss/policy_avg": 0.07531896233558655, + "lr": 2.838381901840491e-06, + "objective/entropy": 68.88694763183594, + "objective/kl": 6.311133861541748, + "objective/non_score_reward": -0.31555670499801636, + "objective/rlhf_reward": 0.24390295226467973, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.051870822906494, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.333984375, + "step": 562, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002501010894775 + }, + { + "episode": 13536, + "epoch": 0.02703382823453284, + "loss/policy_avg": 0.03316590189933777, + "lr": 2.8380943251533744e-06, + "objective/entropy": 91.5116195678711, + "objective/kl": 6.38645076751709, + "objective/non_score_reward": -0.3193225860595703, + "objective/rlhf_reward": 0.4051815356660846, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.397380828857422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4248046875, + "step": 563, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991964101791382 + }, + { + "episode": 13560, + "epoch": 0.027081760554097615, + "loss/policy_avg": 0.24550089240074158, + "lr": 2.837806748466258e-06, + "objective/entropy": 73.84913635253906, + "objective/kl": 3.312932014465332, + "objective/non_score_reward": -0.16564659774303436, + "objective/rlhf_reward": -0.993879534304142, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0842647552490234, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3330078125, + "step": 564, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024266242980957 + }, + { + "episode": 13584, + "epoch": 0.02712969287366239, + "loss/policy_avg": 0.08863756060600281, + "lr": 2.8375191717791413e-06, + "objective/entropy": 127.52447509765625, + "objective/kl": 2.664094924926758, + "objective/non_score_reward": -0.13320474326610565, + "objective/rlhf_reward": 1.3380146779526605, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.203895568847656, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.568359375, + "step": 565, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976519346237183 + }, + { + "episode": 13608, + "epoch": 0.027177625193227164, + "loss/policy_avg": 0.06295748054981232, + "lr": 2.8372315950920247e-06, + "objective/entropy": 133.82366943359375, + "objective/kl": 3.511705160140991, + "objective/non_score_reward": -0.17558525502681732, + "objective/rlhf_reward": 1.0837316297401323, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.149251461029053, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5703125, + "step": 566, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9954204559326172 + }, + { + "episode": 13632, + "epoch": 0.02722555751279194, + "loss/policy_avg": 0.03414370119571686, + "lr": 2.836944018404908e-06, + "objective/entropy": 81.90007019042969, + "objective/kl": 5.14072322845459, + "objective/non_score_reward": -0.25703614950180054, + "objective/rlhf_reward": -1.542216807603836, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.55391263961792, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3779296875, + "step": 567, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012130737304688 + }, + { + "episode": 13656, + "epoch": 0.027273489832356713, + "loss/policy_avg": 0.00469338009133935, + "lr": 2.8366564417177915e-06, + "objective/entropy": 75.09312438964844, + "objective/kl": 3.058222532272339, + "objective/non_score_reward": -0.15291112661361694, + "objective/rlhf_reward": 0.8887132515550885, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.2944071292877197, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3525390625, + "step": 568, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004420280456543 + }, + { + "episode": 13680, + "epoch": 0.027321422151921487, + "loss/policy_avg": 0.15842081606388092, + "lr": 2.836368865030675e-06, + "objective/entropy": 94.289794921875, + "objective/kl": 4.6122636795043945, + "objective/non_score_reward": -0.23061320185661316, + "objective/rlhf_reward": -1.383679062128067, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.2201924324035645, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40234375, + "step": 569, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0032811164855957 + }, + { + "episode": 13704, + "epoch": 0.027369354471486262, + "loss/policy_avg": 0.004062782973051071, + "lr": 2.8360812883435584e-06, + "objective/entropy": 72.5268783569336, + "objective/kl": 3.8907430171966553, + "objective/non_score_reward": -0.19453716278076172, + "objective/rlhf_reward": -1.1672229245305061, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8251872062683105, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4541015625, + "step": 570, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000824451446533 + }, + { + "episode": 13728, + "epoch": 0.027417286791051036, + "loss/policy_avg": 0.0250387080013752, + "lr": 2.835793711656442e-06, + "objective/entropy": 110.71012878417969, + "objective/kl": 3.706209182739258, + "objective/non_score_reward": -0.1853104531764984, + "objective/rlhf_reward": 1.8881372436881065, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.6404072046279907, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.51171875, + "step": 571, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000798225402832 + }, + { + "episode": 13752, + "epoch": 0.02746521911061581, + "loss/policy_avg": -0.020535431802272797, + "lr": 2.8355061349693253e-06, + "objective/entropy": 100.09159851074219, + "objective/kl": 3.0550670623779297, + "objective/non_score_reward": -0.15275335311889648, + "objective/rlhf_reward": -0.9165201783180237, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9103407859802246, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4345703125, + "step": 572, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021026134490967 + }, + { + "episode": 13776, + "epoch": 0.027513151430180585, + "loss/policy_avg": 0.05539553239941597, + "lr": 2.8352185582822087e-06, + "objective/entropy": 100.67100524902344, + "objective/kl": 5.127203941345215, + "objective/non_score_reward": -0.2563602328300476, + "objective/rlhf_reward": 2.247417318163555, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.0551228523254395, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46484375, + "step": 573, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.995403528213501 + }, + { + "episode": 13800, + "epoch": 0.02756108374974536, + "loss/policy_avg": 0.09414783865213394, + "lr": 2.834930981595092e-06, + "objective/entropy": 86.02059936523438, + "objective/kl": 1.9871184825897217, + "objective/non_score_reward": -0.09935590624809265, + "objective/rlhf_reward": -0.5961354523897171, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.316880464553833, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.419921875, + "step": 574, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999647617340088 + }, + { + "episode": 13824, + "epoch": 0.027609016069310134, + "loss/policy_avg": 0.022686563432216644, + "lr": 2.8346434049079755e-06, + "objective/entropy": 159.66802978515625, + "objective/kl": 5.052129745483398, + "objective/non_score_reward": -0.25260651111602783, + "objective/rlhf_reward": 0.21874995081580506, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.076416015625, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 575, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963369369506836 + }, + { + "episode": 13848, + "epoch": 0.02765694838887491, + "loss/policy_avg": 0.03251364454627037, + "lr": 2.834355828220859e-06, + "objective/entropy": 69.61742401123047, + "objective/kl": 6.255773544311523, + "objective/non_score_reward": -0.3127886950969696, + "objective/rlhf_reward": 1.1232679337263107, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.701547861099243, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3994140625, + "step": 576, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971745014190674 + }, + { + "episode": 13872, + "epoch": 0.027704880708439683, + "loss/policy_avg": 0.050228867679834366, + "lr": 2.8340682515337424e-06, + "objective/entropy": 74.14045715332031, + "objective/kl": 4.0898966789245605, + "objective/non_score_reward": -0.20449483394622803, + "objective/rlhf_reward": -1.2269689813256264, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9416115283966064, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4228515625, + "step": 577, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992867708206177 + }, + { + "episode": 13896, + "epoch": 0.027752813028004458, + "loss/policy_avg": 0.017696838825941086, + "lr": 2.833780674846626e-06, + "objective/entropy": 91.73491668701172, + "objective/kl": 5.153836250305176, + "objective/non_score_reward": -0.2576918303966522, + "objective/rlhf_reward": 4.453849159181118, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.5571956634521484, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.39453125, + "step": 578, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002391815185547 + }, + { + "episode": 13920, + "epoch": 0.027800745347569232, + "loss/policy_avg": 0.31128519773483276, + "lr": 2.8334930981595092e-06, + "objective/entropy": 106.99852752685547, + "objective/kl": 5.85416316986084, + "objective/non_score_reward": -0.292708158493042, + "objective/rlhf_reward": -1.7562490031123161, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.491933345794678, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4609375, + "step": 579, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990768432617188 + }, + { + "episode": 13944, + "epoch": 0.027848677667134007, + "loss/policy_avg": -0.025291606783866882, + "lr": 2.8332055214723927e-06, + "objective/entropy": 121.28169250488281, + "objective/kl": 2.5373544692993164, + "objective/non_score_reward": -0.12686771154403687, + "objective/rlhf_reward": 3.0243722819668464, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.9598803520202637, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.513671875, + "step": 580, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002041816711426 + }, + { + "episode": 13968, + "epoch": 0.02789660998669878, + "loss/policy_avg": 0.02201208658516407, + "lr": 2.832917944785276e-06, + "objective/entropy": 136.12261962890625, + "objective/kl": 3.993893623352051, + "objective/non_score_reward": -0.1996946930885315, + "objective/rlhf_reward": 0.6080118601561818, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.8032073974609375, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.626953125, + "step": 581, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984114170074463 + }, + { + "episode": 13992, + "epoch": 0.027944542306263555, + "loss/policy_avg": 0.029302649199962616, + "lr": 2.8326303680981595e-06, + "objective/entropy": 136.7864227294922, + "objective/kl": 1.9017550945281982, + "objective/non_score_reward": -0.09508776664733887, + "objective/rlhf_reward": -0.5705266073346138, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9502267837524414, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58984375, + "step": 582, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0057082176208496 + }, + { + "episode": 14016, + "epoch": 0.02799247462582833, + "loss/policy_avg": 0.06482797861099243, + "lr": 2.832342791411043e-06, + "objective/entropy": 82.9349136352539, + "objective/kl": 5.254426002502441, + "objective/non_score_reward": -0.26272130012512207, + "objective/rlhf_reward": -1.5763277262449265, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6604042053222656, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4375, + "step": 583, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001680850982666 + }, + { + "episode": 14040, + "epoch": 0.028040406945393104, + "loss/policy_avg": 0.027776550501585007, + "lr": 2.8320552147239268e-06, + "objective/entropy": 87.71633911132812, + "objective/kl": 3.5362606048583984, + "objective/non_score_reward": -0.1768130511045456, + "objective/rlhf_reward": -1.0608782954514027, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5400289297103882, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3798828125, + "step": 584, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002898693084717 + }, + { + "episode": 14064, + "epoch": 0.02808833926495788, + "loss/policy_avg": 0.026721132919192314, + "lr": 2.8317676380368102e-06, + "objective/entropy": 125.197998046875, + "objective/kl": 4.688167572021484, + "objective/non_score_reward": -0.23440837860107422, + "objective/rlhf_reward": -1.4064502269029617, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.767127513885498, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.572265625, + "step": 585, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998934268951416 + }, + { + "episode": 14088, + "epoch": 0.028136271584522653, + "loss/policy_avg": 0.07143731415271759, + "lr": 2.8314800613496932e-06, + "objective/entropy": 111.80906677246094, + "objective/kl": 4.058431625366211, + "objective/non_score_reward": -0.20292159914970398, + "objective/rlhf_reward": 0.6752597701242771, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.8018012046813965, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4990234375, + "step": 586, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000527858734131 + }, + { + "episode": 14112, + "epoch": 0.028184203904087428, + "loss/policy_avg": 0.05494013428688049, + "lr": 2.8311924846625766e-06, + "objective/entropy": 107.014892578125, + "objective/kl": 5.716266632080078, + "objective/non_score_reward": -0.2858133316040039, + "objective/rlhf_reward": 0.42236316282643205, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.857789516448975, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5078125, + "step": 587, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984688758850098 + }, + { + "episode": 14136, + "epoch": 0.028232136223652202, + "loss/policy_avg": 0.0461447648704052, + "lr": 2.83090490797546e-06, + "objective/entropy": 100.64680480957031, + "objective/kl": 6.104743957519531, + "objective/non_score_reward": -0.3052372336387634, + "objective/rlhf_reward": -1.8314232304692268, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.02564811706543, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.421875, + "step": 588, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998743534088135 + }, + { + "episode": 14160, + "epoch": 0.028280068543216977, + "loss/policy_avg": -0.021601075306534767, + "lr": 2.8306173312883435e-06, + "objective/entropy": 85.8267822265625, + "objective/kl": 4.214189052581787, + "objective/non_score_reward": -0.21070945262908936, + "objective/rlhf_reward": -1.2642566785216331, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.764417052268982, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4091796875, + "step": 589, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011112689971924 + }, + { + "episode": 14184, + "epoch": 0.02832800086278175, + "loss/policy_avg": 0.18596884608268738, + "lr": 2.830329754601227e-06, + "objective/entropy": 84.82264709472656, + "objective/kl": 4.514269828796387, + "objective/non_score_reward": -0.22571349143981934, + "objective/rlhf_reward": -1.3542809039354324, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.641467094421387, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3984375, + "step": 590, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000016927719116 + }, + { + "episode": 14208, + "epoch": 0.028375933182346526, + "loss/policy_avg": 0.041232749819755554, + "lr": 2.8300421779141103e-06, + "objective/entropy": 59.06492233276367, + "objective/kl": 3.168313503265381, + "objective/non_score_reward": -0.15841567516326904, + "objective/rlhf_reward": 5.049506030976772, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.1272411346435547, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3818359375, + "step": 591, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992547035217285 + }, + { + "episode": 14232, + "epoch": 0.0284238655019113, + "loss/policy_avg": 0.024005573242902756, + "lr": 2.8297546012269938e-06, + "objective/entropy": 84.81796264648438, + "objective/kl": 6.932097434997559, + "objective/non_score_reward": -0.3466048836708069, + "objective/rlhf_reward": 0.057613999437226115, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.829646110534668, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.39453125, + "step": 592, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998450756072998 + }, + { + "episode": 14256, + "epoch": 0.028471797821476075, + "loss/policy_avg": 0.02056157775223255, + "lr": 2.829467024539877e-06, + "objective/entropy": 109.64855194091797, + "objective/kl": 4.457742214202881, + "objective/non_score_reward": -0.22288712859153748, + "objective/rlhf_reward": 0.3970662906662308, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.42343807220459, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 593, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000990867614746 + }, + { + "episode": 14280, + "epoch": 0.02851973014104085, + "loss/policy_avg": -0.05336518958210945, + "lr": 2.829179447852761e-06, + "objective/entropy": 54.87667465209961, + "objective/kl": 1.2108416557312012, + "objective/non_score_reward": -0.060542088001966476, + "objective/rlhf_reward": 1.6367474943399427, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.6075925827026367, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.5703125, + "step": 594, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 3, + "val/ratio": 1.9997950792312622 + }, + { + "episode": 14304, + "epoch": 0.028567662460605624, + "loss/policy_avg": 0.009388393722474575, + "lr": 2.8288918711656445e-06, + "objective/entropy": 82.22164916992188, + "objective/kl": 5.615533351898193, + "objective/non_score_reward": -0.2807766795158386, + "objective/rlhf_reward": -1.68466005474329, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.283105373382568, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3759765625, + "step": 595, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002901077270508 + }, + { + "episode": 14328, + "epoch": 0.028615594780170398, + "loss/policy_avg": 0.019810955971479416, + "lr": 2.828604294478528e-06, + "objective/entropy": 124.91587829589844, + "objective/kl": 2.2044320106506348, + "objective/non_score_reward": -0.11022161692380905, + "objective/rlhf_reward": 1.3386703170835972, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.7607558965682983, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.578125, + "step": 596, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000711441040039 + }, + { + "episode": 14352, + "epoch": 0.028663527099735173, + "loss/policy_avg": 0.022766483947634697, + "lr": 2.8283167177914113e-06, + "objective/entropy": 90.30193328857422, + "objective/kl": 4.796558856964111, + "objective/non_score_reward": -0.239827960729599, + "objective/rlhf_reward": -1.4389676824212074, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.724247694015503, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 597, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0006210803985596 + }, + { + "episode": 14376, + "epoch": 0.028711459419299947, + "loss/policy_avg": -0.0028263437561690807, + "lr": 2.8280291411042947e-06, + "objective/entropy": 111.1976547241211, + "objective/kl": 1.2801239490509033, + "objective/non_score_reward": -0.06400620192289352, + "objective/rlhf_reward": -0.38403720781207085, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9229480028152466, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.52734375, + "step": 598, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992340803146362 + }, + { + "episode": 14400, + "epoch": 0.02875939173886472, + "loss/policy_avg": 0.07866540551185608, + "lr": 2.827741564417178e-06, + "objective/entropy": 80.45591735839844, + "objective/kl": 3.822201728820801, + "objective/non_score_reward": -0.19111010432243347, + "objective/rlhf_reward": 0.6595193853021893, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 8.930303573608398, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3837890625, + "step": 599, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9955170154571533 + }, + { + "episode": 14424, + "epoch": 0.028807324058429496, + "loss/policy_avg": 0.024506252259016037, + "lr": 2.8274539877300616e-06, + "objective/entropy": 103.03802490234375, + "objective/kl": 1.8462704420089722, + "objective/non_score_reward": -0.09231351315975189, + "objective/rlhf_reward": -0.5538810640573502, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1537535190582275, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4287109375, + "step": 600, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024642944335938 + }, + { + "episode": 14448, + "epoch": 0.02885525637799427, + "loss/policy_avg": -0.03205743432044983, + "lr": 2.8271664110429446e-06, + "objective/entropy": 136.50550842285156, + "objective/kl": 4.480498790740967, + "objective/non_score_reward": -0.2240249365568161, + "objective/rlhf_reward": 0.39023939444578515, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.5356905460357666, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.611328125, + "step": 601, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999215602874756 + }, + { + "episode": 14472, + "epoch": 0.028903188697559045, + "loss/policy_avg": 0.047943320125341415, + "lr": 2.826878834355828e-06, + "objective/entropy": 86.44506072998047, + "objective/kl": 5.256937026977539, + "objective/non_score_reward": -0.2628468871116638, + "objective/rlhf_reward": -1.577081173658371, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.0566987991333, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.380859375, + "step": 602, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974699020385742 + }, + { + "episode": 14496, + "epoch": 0.02895112101712382, + "loss/policy_avg": 0.028582492843270302, + "lr": 2.8265912576687115e-06, + "objective/entropy": 104.3714828491211, + "objective/kl": 5.13123893737793, + "objective/non_score_reward": -0.2565619647502899, + "objective/rlhf_reward": 4.46062833070755, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.240304470062256, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.501953125, + "step": 603, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978840351104736 + }, + { + "episode": 14520, + "epoch": 0.028999053336688594, + "loss/policy_avg": 0.09258130937814713, + "lr": 2.826303680981595e-06, + "objective/entropy": 82.89654541015625, + "objective/kl": 5.597306728363037, + "objective/non_score_reward": -0.2798653542995453, + "objective/rlhf_reward": 0.21359723922522922, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.7065343856811523, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.37890625, + "step": 604, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001868724822998 + }, + { + "episode": 14544, + "epoch": 0.029046985656253372, + "loss/policy_avg": 0.07752461731433868, + "lr": 2.8260161042944787e-06, + "objective/entropy": 101.85990905761719, + "objective/kl": 7.422156810760498, + "objective/non_score_reward": -0.371107816696167, + "objective/rlhf_reward": -2.226646974682808, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.574728965759277, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4560546875, + "step": 605, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.995313048362732 + }, + { + "episode": 14568, + "epoch": 0.029094917975818146, + "loss/policy_avg": -0.022021649405360222, + "lr": 2.825728527607362e-06, + "objective/entropy": 90.04088592529297, + "objective/kl": 6.081704616546631, + "objective/non_score_reward": -0.30408525466918945, + "objective/rlhf_reward": -1.824511393904686, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.305087566375732, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4296875, + "step": 606, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979760646820068 + }, + { + "episode": 14592, + "epoch": 0.02914285029538292, + "loss/policy_avg": 0.0284237340092659, + "lr": 2.8254409509202456e-06, + "objective/entropy": 104.76629638671875, + "objective/kl": 5.404122352600098, + "objective/non_score_reward": -0.270206093788147, + "objective/rlhf_reward": -1.6212365701794624, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.116570472717285, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 607, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9949662685394287 + }, + { + "episode": 14616, + "epoch": 0.029190782614947695, + "loss/policy_avg": 0.002815498039126396, + "lr": 2.825153374233129e-06, + "objective/entropy": 106.08427429199219, + "objective/kl": 4.556731700897217, + "objective/non_score_reward": -0.22783660888671875, + "objective/rlhf_reward": 4.632980436086655, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.030404567718506, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.50390625, + "step": 608, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974284172058105 + }, + { + "episode": 14640, + "epoch": 0.02923871493451247, + "loss/policy_avg": 0.04995134100317955, + "lr": 2.8248657975460124e-06, + "objective/entropy": 91.5798568725586, + "objective/kl": 2.5468392372131348, + "objective/non_score_reward": -0.12734195590019226, + "objective/rlhf_reward": -0.76405169069767, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.722676753997803, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4072265625, + "step": 609, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982054233551025 + }, + { + "episode": 14664, + "epoch": 0.029286647254077244, + "loss/policy_avg": 0.04855087026953697, + "lr": 2.824578220858896e-06, + "objective/entropy": 93.92295837402344, + "objective/kl": 5.60801887512207, + "objective/non_score_reward": -0.28040096163749695, + "objective/rlhf_reward": 2.103172818658989, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.4234893321990967, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.470703125, + "step": 610, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99894118309021 + }, + { + "episode": 14688, + "epoch": 0.02933457957364202, + "loss/policy_avg": 0.05327991396188736, + "lr": 2.8242906441717793e-06, + "objective/entropy": 76.72386169433594, + "objective/kl": 2.441195487976074, + "objective/non_score_reward": -0.12205978482961655, + "objective/rlhf_reward": 1.0020302787319504, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.387835741043091, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3701171875, + "step": 611, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994940757751465 + }, + { + "episode": 14712, + "epoch": 0.029382511893206793, + "loss/policy_avg": 0.029780372977256775, + "lr": 2.8240030674846627e-06, + "objective/entropy": 101.66665649414062, + "objective/kl": 4.013691425323486, + "objective/non_score_reward": -0.2006845772266388, + "objective/rlhf_reward": -1.2041074112057686, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4479317665100098, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4541015625, + "step": 612, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023903846740723 + }, + { + "episode": 14736, + "epoch": 0.029430444212771568, + "loss/policy_avg": 0.057757895439863205, + "lr": 2.823715490797546e-06, + "objective/entropy": 100.02766418457031, + "objective/kl": 4.654374599456787, + "objective/non_score_reward": -0.23271870613098145, + "objective/rlhf_reward": 0.740930818807019, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.905346870422363, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40625, + "step": 613, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977343082427979 + }, + { + "episode": 14760, + "epoch": 0.029478376532336342, + "loss/policy_avg": 0.04366529732942581, + "lr": 2.8234279141104296e-06, + "objective/entropy": 81.93441009521484, + "objective/kl": 6.952441692352295, + "objective/non_score_reward": -0.3476220965385437, + "objective/rlhf_reward": -0.19294322910992245, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.8562068939208984, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 614, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984970092773438 + }, + { + "episode": 14784, + "epoch": 0.029526308851901117, + "loss/policy_avg": -0.05840581655502319, + "lr": 2.823140337423313e-06, + "objective/entropy": 85.55134582519531, + "objective/kl": 3.630944013595581, + "objective/non_score_reward": -0.18154720962047577, + "objective/rlhf_reward": 0.910716876387596, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.291320323944092, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.3603515625, + "step": 615, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0055112838745117 + }, + { + "episode": 14808, + "epoch": 0.02957424117146589, + "loss/policy_avg": -0.04911494255065918, + "lr": 2.8228527607361964e-06, + "objective/entropy": 107.88655853271484, + "objective/kl": 2.811856746673584, + "objective/non_score_reward": -0.14059284329414368, + "objective/rlhf_reward": 0.9626229216696057, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.9645776748657227, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.4501953125, + "step": 616, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00254487991333 + }, + { + "episode": 14832, + "epoch": 0.029622173491030666, + "loss/policy_avg": 0.026907984167337418, + "lr": 2.82256518404908e-06, + "objective/entropy": 88.2837142944336, + "objective/kl": 2.7154958248138428, + "objective/non_score_reward": -0.13577479124069214, + "objective/rlhf_reward": 0.9915312786937985, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.014118194580078, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.376953125, + "step": 617, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008981227874756 + }, + { + "episode": 14856, + "epoch": 0.02967010581059544, + "loss/policy_avg": -0.0313553661108017, + "lr": 2.8222776073619633e-06, + "objective/entropy": 113.7740707397461, + "objective/kl": 5.766857147216797, + "objective/non_score_reward": -0.2883428931236267, + "objective/rlhf_reward": -1.7300572358071804, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.221314430236816, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 618, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987285137176514 + }, + { + "episode": 14880, + "epoch": 0.029718038130160215, + "loss/policy_avg": 0.0710495114326477, + "lr": 2.8219900306748467e-06, + "objective/entropy": 98.9884262084961, + "objective/kl": 3.7192630767822266, + "objective/non_score_reward": -0.1859631985425949, + "objective/rlhf_reward": -1.115779086947441, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4224777221679688, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.431640625, + "step": 619, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008773803710938 + }, + { + "episode": 14904, + "epoch": 0.02976597044972499, + "loss/policy_avg": -0.020035047084093094, + "lr": 2.82170245398773e-06, + "objective/entropy": 143.8428955078125, + "objective/kl": 3.968383312225342, + "objective/non_score_reward": -0.19841915369033813, + "objective/rlhf_reward": -1.190514974296093, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.634972095489502, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.615234375, + "step": 620, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987893104553223 + }, + { + "episode": 14928, + "epoch": 0.029813902769289764, + "loss/policy_avg": 0.008157234638929367, + "lr": 2.8214148773006135e-06, + "objective/entropy": 130.00253295898438, + "objective/kl": 3.7748942375183105, + "objective/non_score_reward": -0.1887447088956833, + "objective/rlhf_reward": 4.8675317987799644, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.718982219696045, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.578125, + "step": 621, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002279043197632 + }, + { + "episode": 14952, + "epoch": 0.029861835088854538, + "loss/policy_avg": 0.04042531177401543, + "lr": 2.821127300613497e-06, + "objective/entropy": 84.47622680664062, + "objective/kl": 3.6307151317596436, + "objective/non_score_reward": -0.1815357506275177, + "objective/rlhf_reward": 1.048028618883027, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.1620690822601318, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4111328125, + "step": 622, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000732660293579 + }, + { + "episode": 14976, + "epoch": 0.029909767408419313, + "loss/policy_avg": 0.04123755544424057, + "lr": 2.8208397239263804e-06, + "objective/entropy": 105.11029052734375, + "objective/kl": 4.004117965698242, + "objective/non_score_reward": -0.20020589232444763, + "objective/rlhf_reward": -1.2012353539466858, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4549992084503174, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4931640625, + "step": 623, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0029306411743164 + }, + { + "episode": 15000, + "epoch": 0.029957699727984087, + "loss/policy_avg": 0.07185370475053787, + "lr": 2.820552147239264e-06, + "objective/entropy": 91.059814453125, + "objective/kl": 3.950619697570801, + "objective/non_score_reward": -0.19753101468086243, + "objective/rlhf_reward": -1.1851859539747238, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.96390962600708, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.431640625, + "step": 624, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9961769580841064 + }, + { + "episode": 15024, + "epoch": 0.03000563204754886, + "loss/policy_avg": 0.0030305488035082817, + "lr": 2.8202645705521473e-06, + "objective/entropy": 70.12528991699219, + "objective/kl": 5.0060882568359375, + "objective/non_score_reward": -0.2503044009208679, + "objective/rlhf_reward": -1.501826286315918, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.197725296020508, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3251953125, + "step": 625, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005693435668945 + }, + { + "episode": 15048, + "epoch": 0.030053564367113636, + "loss/policy_avg": 0.04509567841887474, + "lr": 2.8199769938650307e-06, + "objective/entropy": 127.25487518310547, + "objective/kl": 4.925838470458984, + "objective/non_score_reward": -0.24629193544387817, + "objective/rlhf_reward": -1.477751612663269, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.15359115600586, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5390625, + "step": 626, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9947179555892944 + }, + { + "episode": 15072, + "epoch": 0.03010149668667841, + "loss/policy_avg": 0.05762709677219391, + "lr": 2.819689417177914e-06, + "objective/entropy": 87.84245300292969, + "objective/kl": 3.9465138912200928, + "objective/non_score_reward": -0.1973257064819336, + "objective/rlhf_reward": 0.8160457909107207, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.774023056030273, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41015625, + "step": 627, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964251518249512 + }, + { + "episode": 15096, + "epoch": 0.030149429006243185, + "loss/policy_avg": 0.056047312915325165, + "lr": 2.819401840490798e-06, + "objective/entropy": 121.11092376708984, + "objective/kl": 5.9568891525268555, + "objective/non_score_reward": -0.29784446954727173, + "objective/rlhf_reward": -1.7870666310191154, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.188004970550537, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.501953125, + "step": 628, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991493225097656 + }, + { + "episode": 15120, + "epoch": 0.03019736132580796, + "loss/policy_avg": 0.02126982994377613, + "lr": 2.8191142638036814e-06, + "objective/entropy": 120.42460632324219, + "objective/kl": 4.484891414642334, + "objective/non_score_reward": -0.22424456477165222, + "objective/rlhf_reward": 0.4607126598597798, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.7046737670898438, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 629, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999742031097412 + }, + { + "episode": 15144, + "epoch": 0.030245293645372734, + "loss/policy_avg": 0.011982021853327751, + "lr": 2.8188266871165644e-06, + "objective/entropy": 87.83477783203125, + "objective/kl": 7.543981552124023, + "objective/non_score_reward": -0.37719905376434326, + "objective/rlhf_reward": -0.4570144827126231, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 6.211130619049072, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 630, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9971911907196045 + }, + { + "episode": 15168, + "epoch": 0.03029322596493751, + "loss/policy_avg": 0.09035187214612961, + "lr": 2.818539110429448e-06, + "objective/entropy": 114.4012451171875, + "objective/kl": 3.0759520530700684, + "objective/non_score_reward": -0.15379759669303894, + "objective/rlhf_reward": 1.6612738055350276, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.033353328704834, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.513671875, + "step": 631, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000544309616089 + }, + { + "episode": 15192, + "epoch": 0.030341158284502283, + "loss/policy_avg": 0.009275372140109539, + "lr": 2.8182515337423312e-06, + "objective/entropy": 106.42205047607422, + "objective/kl": 5.511739253997803, + "objective/non_score_reward": -0.27558696269989014, + "objective/rlhf_reward": -1.6535218507051468, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.211625814437866, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4970703125, + "step": 632, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003653049468994 + }, + { + "episode": 15216, + "epoch": 0.030389090604067057, + "loss/policy_avg": 0.09042274951934814, + "lr": 2.8179639570552147e-06, + "objective/entropy": 76.96186065673828, + "objective/kl": 5.167054176330566, + "objective/non_score_reward": -0.25835272669792175, + "objective/rlhf_reward": -1.5501163229346275, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4186813831329346, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4404296875, + "step": 633, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9969165325164795 + }, + { + "episode": 15240, + "epoch": 0.030437022923631832, + "loss/policy_avg": -0.0008582691662013531, + "lr": 2.817676380368098e-06, + "objective/entropy": 86.93215942382812, + "objective/kl": 4.232517719268799, + "objective/non_score_reward": -0.21162591874599457, + "objective/rlhf_reward": 1.314303925371358, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.4194729328155518, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.392578125, + "step": 634, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011119842529297 + }, + { + "episode": 15264, + "epoch": 0.030484955243196606, + "loss/policy_avg": 0.362686425447464, + "lr": 2.8173888036809815e-06, + "objective/entropy": 97.19206237792969, + "objective/kl": 5.449301719665527, + "objective/non_score_reward": -0.2724651098251343, + "objective/rlhf_reward": -1.6347906067967415, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.796441912651062, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4404296875, + "step": 635, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004988193511963 + }, + { + "episode": 15288, + "epoch": 0.03053288756276138, + "loss/policy_avg": 0.026926912367343903, + "lr": 2.817101226993865e-06, + "objective/entropy": 91.80333709716797, + "objective/kl": 5.061484336853027, + "objective/non_score_reward": -0.2530742287635803, + "objective/rlhf_reward": -1.51844522356987, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6634368896484375, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4296875, + "step": 636, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999874472618103 + }, + { + "episode": 15312, + "epoch": 0.030580819882326155, + "loss/policy_avg": 0.05895761400461197, + "lr": 2.8168136503067484e-06, + "objective/entropy": 101.90184783935547, + "objective/kl": 4.790085792541504, + "objective/non_score_reward": -0.23950430750846863, + "objective/rlhf_reward": -1.4370257034897804, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.827524662017822, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4521484375, + "step": 637, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975301027297974 + }, + { + "episode": 15336, + "epoch": 0.03062875220189093, + "loss/policy_avg": 0.03930250555276871, + "lr": 2.816526073619632e-06, + "objective/entropy": 112.2939682006836, + "objective/kl": 6.458032131195068, + "objective/non_score_reward": -0.3229016065597534, + "objective/rlhf_reward": -1.9374096989631653, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.308248996734619, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4716796875, + "step": 638, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982242584228516 + }, + { + "episode": 15360, + "epoch": 0.030676684521455704, + "loss/policy_avg": -0.011750642210245132, + "lr": 2.8162384969325156e-06, + "objective/entropy": 107.02984619140625, + "objective/kl": 5.230892181396484, + "objective/non_score_reward": -0.2615446448326111, + "objective/rlhf_reward": -1.569267675280571, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.378607749938965, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4697265625, + "step": 639, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025782585144043 + }, + { + "episode": 15384, + "epoch": 0.03072461684102048, + "loss/policy_avg": -0.013221648521721363, + "lr": 2.815950920245399e-06, + "objective/entropy": 72.27055358886719, + "objective/kl": 5.9925642013549805, + "objective/non_score_reward": -0.29962822794914246, + "objective/rlhf_reward": -0.06338040233694686, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.854820251464844, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4208984375, + "step": 640, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.005218982696533 + }, + { + "episode": 15408, + "epoch": 0.030772549160585253, + "loss/policy_avg": 0.07462769746780396, + "lr": 2.8156633435582825e-06, + "objective/entropy": 89.11520385742188, + "objective/kl": 4.122031211853027, + "objective/non_score_reward": -0.20610153675079346, + "objective/rlhf_reward": -1.2366092130541801, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.894392967224121, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3876953125, + "step": 641, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9959723949432373 + }, + { + "episode": 15432, + "epoch": 0.030820481480150028, + "loss/policy_avg": 0.017127806320786476, + "lr": 2.815375766871166e-06, + "objective/entropy": 103.74423217773438, + "objective/kl": 2.7664237022399902, + "objective/non_score_reward": -0.138321191072464, + "objective/rlhf_reward": -0.829927071928978, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7248330116271973, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5390625, + "step": 642, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00238299369812 + }, + { + "episode": 15456, + "epoch": 0.030868413799714802, + "loss/policy_avg": 0.006928525865077972, + "lr": 2.8150881901840493e-06, + "objective/entropy": 86.64430236816406, + "objective/kl": 4.296612739562988, + "objective/non_score_reward": -0.21483062207698822, + "objective/rlhf_reward": 0.4454052999512039, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.372735619544983, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 643, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014731884002686 + }, + { + "episode": 15480, + "epoch": 0.030916346119279577, + "loss/policy_avg": 0.03297048807144165, + "lr": 2.8148006134969328e-06, + "objective/entropy": 70.8450927734375, + "objective/kl": 3.784954071044922, + "objective/non_score_reward": -0.1892477124929428, + "objective/rlhf_reward": -1.1354862377047539, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7779982089996338, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.326171875, + "step": 644, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00264835357666 + }, + { + "episode": 15504, + "epoch": 0.03096427843884435, + "loss/policy_avg": -0.014860376715660095, + "lr": 2.8145130368098158e-06, + "objective/entropy": 89.54942321777344, + "objective/kl": 3.6200451850891113, + "objective/non_score_reward": -0.18100225925445557, + "objective/rlhf_reward": -1.086013525724411, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.957568645477295, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 645, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002686023712158 + }, + { + "episode": 15528, + "epoch": 0.031012210758409126, + "loss/policy_avg": 0.4200719892978668, + "lr": 2.814225460122699e-06, + "objective/entropy": 107.55262756347656, + "objective/kl": 4.160887718200684, + "objective/non_score_reward": -0.20804435014724731, + "objective/rlhf_reward": 1.0728507592875722, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.560863018035889, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.47265625, + "step": 646, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996811151504517 + }, + { + "episode": 15552, + "epoch": 0.0310601430779739, + "loss/policy_avg": 0.010246986523270607, + "lr": 2.8139378834355826e-06, + "objective/entropy": 84.59648132324219, + "objective/kl": 7.082032680511475, + "objective/non_score_reward": -0.35410165786743164, + "objective/rlhf_reward": 0.45944952044505805, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.418114423751831, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 647, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001021385192871 + }, + { + "episode": 15576, + "epoch": 0.031108075397538675, + "loss/policy_avg": 0.030364131554961205, + "lr": 2.813650306748466e-06, + "objective/entropy": 88.86760711669922, + "objective/kl": 5.641407012939453, + "objective/non_score_reward": -0.2820703685283661, + "objective/rlhf_reward": -1.6924220770597458, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8105480670928955, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41796875, + "step": 648, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000910758972168 + }, + { + "episode": 15600, + "epoch": 0.03115600771710345, + "loss/policy_avg": 0.23141127824783325, + "lr": 2.81336273006135e-06, + "objective/entropy": 123.42881774902344, + "objective/kl": 1.9861406087875366, + "objective/non_score_reward": -0.09930703043937683, + "objective/rlhf_reward": -0.5958421919494867, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.552154541015625, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.55859375, + "step": 649, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989237785339355 + }, + { + "episode": 15624, + "epoch": 0.031203940036668223, + "loss/policy_avg": 0.031925808638334274, + "lr": 2.8130751533742333e-06, + "objective/entropy": 112.88096618652344, + "objective/kl": 3.3129231929779053, + "objective/non_score_reward": -0.16564616560935974, + "objective/rlhf_reward": -0.9938769973814487, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.0946602821350098, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 650, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003081798553467 + }, + { + "episode": 15648, + "epoch": 0.031251872356233, + "loss/policy_avg": 0.08590701967477798, + "lr": 2.8127875766871168e-06, + "objective/entropy": 109.22920227050781, + "objective/kl": 3.619563102722168, + "objective/non_score_reward": -0.18097816407680511, + "objective/rlhf_reward": 0.6485199510947548, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.2633399963378906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4609375, + "step": 651, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001964569091797 + }, + { + "episode": 15672, + "epoch": 0.03129980467579777, + "loss/policy_avg": 0.03848039358854294, + "lr": 2.8125e-06, + "objective/entropy": 78.9945068359375, + "objective/kl": 6.0070343017578125, + "objective/non_score_reward": -0.30035170912742615, + "objective/rlhf_reward": -1.8021102845668793, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.93839693069458, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.37109375, + "step": 652, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000978708267212 + }, + { + "episode": 15696, + "epoch": 0.03134773699536255, + "loss/policy_avg": 0.23985466361045837, + "lr": 2.8122124233128836e-06, + "objective/entropy": 100.83177947998047, + "objective/kl": 6.275968074798584, + "objective/non_score_reward": -0.3137983977794647, + "objective/rlhf_reward": -1.8827903419733047, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.919081926345825, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.466796875, + "step": 653, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0020804405212402 + }, + { + "episode": 15720, + "epoch": 0.03139566931492732, + "loss/policy_avg": 0.06486252695322037, + "lr": 2.811924846625767e-06, + "objective/entropy": 90.84605407714844, + "objective/kl": 3.4288814067840576, + "objective/non_score_reward": -0.1714440882205963, + "objective/rlhf_reward": -1.0286644380539656, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4356420040130615, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.412109375, + "step": 654, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00351619720459 + }, + { + "episode": 15744, + "epoch": 0.0314436016344921, + "loss/policy_avg": 0.35598158836364746, + "lr": 2.8116372699386505e-06, + "objective/entropy": 98.63151550292969, + "objective/kl": 6.8577799797058105, + "objective/non_score_reward": -0.3428890109062195, + "objective/rlhf_reward": -2.0573338717222214, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.568537712097168, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4208984375, + "step": 655, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999991655349731 + }, + { + "episode": 15768, + "epoch": 0.03149153395405687, + "loss/policy_avg": 0.04067952558398247, + "lr": 2.811349693251534e-06, + "objective/entropy": 141.08877563476562, + "objective/kl": 5.10235071182251, + "objective/non_score_reward": -0.2551175355911255, + "objective/rlhf_reward": -1.5307050943374634, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.426131010055542, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 656, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977879524230957 + }, + { + "episode": 15792, + "epoch": 0.03153946627362165, + "loss/policy_avg": 0.02599414996802807, + "lr": 2.8110621165644173e-06, + "objective/entropy": 86.70712280273438, + "objective/kl": 2.9474596977233887, + "objective/non_score_reward": -0.14737296104431152, + "objective/rlhf_reward": 1.6998215374710055, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 5.616138458251953, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3623046875, + "step": 657, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995654821395874 + }, + { + "episode": 15816, + "epoch": 0.03158739859318642, + "loss/policy_avg": -0.01104295626282692, + "lr": 2.8107745398773007e-06, + "objective/entropy": 81.79622650146484, + "objective/kl": 5.051379203796387, + "objective/non_score_reward": -0.25256896018981934, + "objective/rlhf_reward": -1.5154138207435608, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.89011812210083, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3701171875, + "step": 658, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964945316314697 + }, + { + "episode": 15840, + "epoch": 0.0316353309127512, + "loss/policy_avg": 0.12571890652179718, + "lr": 2.810486963190184e-06, + "objective/entropy": 96.8927230834961, + "objective/kl": 5.987539291381836, + "objective/non_score_reward": -0.2993769645690918, + "objective/rlhf_reward": 0.00991805990946626, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.382266998291016, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4228515625, + "step": 659, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979853630065918 + }, + { + "episode": 15864, + "epoch": 0.03168326323231597, + "loss/policy_avg": 0.06719927489757538, + "lr": 2.8101993865030676e-06, + "objective/entropy": 132.24362182617188, + "objective/kl": 3.5840365886688232, + "objective/non_score_reward": -0.17920184135437012, + "objective/rlhf_reward": 1.9247889965772629, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.140369176864624, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.56640625, + "step": 660, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985761642456055 + }, + { + "episode": 15888, + "epoch": 0.031731195551880746, + "loss/policy_avg": 0.02621820941567421, + "lr": 2.809911809815951e-06, + "objective/entropy": 77.62518310546875, + "objective/kl": 5.918420791625977, + "objective/non_score_reward": -0.29592105746269226, + "objective/rlhf_reward": 4.224473670125008, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.996936798095703, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3125, + "step": 661, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967403411865234 + }, + { + "episode": 15912, + "epoch": 0.03177912787144552, + "loss/policy_avg": 0.032360684126615524, + "lr": 2.8096242331288344e-06, + "objective/entropy": 89.46601867675781, + "objective/kl": 4.118409156799316, + "objective/non_score_reward": -0.20592045783996582, + "objective/rlhf_reward": -1.2355227768421173, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.613059997558594, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4326171875, + "step": 662, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997235655784607 + }, + { + "episode": 15936, + "epoch": 0.031827060191010295, + "loss/policy_avg": 0.12337756156921387, + "lr": 2.809336656441718e-06, + "objective/entropy": 103.72271728515625, + "objective/kl": 2.879462718963623, + "objective/non_score_reward": -0.14397311210632324, + "objective/rlhf_reward": -0.8638387024402618, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.149782657623291, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.482421875, + "step": 663, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001060962677002 + }, + { + "episode": 15960, + "epoch": 0.031874992510575066, + "loss/policy_avg": 0.02040736749768257, + "lr": 2.8090490797546013e-06, + "objective/entropy": 91.24725341796875, + "objective/kl": 6.406259059906006, + "objective/non_score_reward": -0.3203129470348358, + "objective/rlhf_reward": -1.9218776673078537, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.362061500549316, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.388671875, + "step": 664, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972227811813354 + }, + { + "episode": 15984, + "epoch": 0.031922924830139844, + "loss/policy_avg": 0.09209120273590088, + "lr": 2.8087615030674847e-06, + "objective/entropy": 115.5174789428711, + "objective/kl": 3.1037070751190186, + "objective/non_score_reward": -0.15518534183502197, + "objective/rlhf_reward": -0.9311120733618736, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.996057033538818, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.515625, + "step": 665, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.995287537574768 + }, + { + "episode": 16008, + "epoch": 0.031970857149704615, + "loss/policy_avg": 0.04856950789690018, + "lr": 2.808473926380368e-06, + "objective/entropy": 93.19438171386719, + "objective/kl": 5.033243656158447, + "objective/non_score_reward": -0.2516621947288513, + "objective/rlhf_reward": -1.5099730379879475, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.581728219985962, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3955078125, + "step": 666, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018258094787598 + }, + { + "episode": 16032, + "epoch": 0.03201878946926939, + "loss/policy_avg": -0.0462065115571022, + "lr": 2.8081863496932516e-06, + "objective/entropy": 104.47245788574219, + "objective/kl": 1.7179036140441895, + "objective/non_score_reward": -0.08589519560337067, + "objective/rlhf_reward": -0.5153711289167404, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.955135703086853, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.443359375, + "step": 667, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0049097537994385 + }, + { + "episode": 16056, + "epoch": 0.032066721788834164, + "loss/policy_avg": 0.002609906019642949, + "lr": 2.807898773006135e-06, + "objective/entropy": 79.1048355102539, + "objective/kl": 4.777551174163818, + "objective/non_score_reward": -0.2388775795698166, + "objective/rlhf_reward": -1.433265395462513, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9781274795532227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3818359375, + "step": 668, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999597072601318 + }, + { + "episode": 16080, + "epoch": 0.03211465410839894, + "loss/policy_avg": 0.07883474230766296, + "lr": 2.8076111963190184e-06, + "objective/entropy": 93.07427978515625, + "objective/kl": 5.404719829559326, + "objective/non_score_reward": -0.2702360153198242, + "objective/rlhf_reward": 0.37858401983976353, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.6494712829589844, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4248046875, + "step": 669, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992001056671143 + }, + { + "episode": 16104, + "epoch": 0.03216258642796371, + "loss/policy_avg": -0.02150985598564148, + "lr": 2.807323619631902e-06, + "objective/entropy": 90.32296752929688, + "objective/kl": 5.878199577331543, + "objective/non_score_reward": -0.2939099669456482, + "objective/rlhf_reward": -0.029070813964239495, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.268265724182129, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4443359375, + "step": 670, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998884201049805 + }, + { + "episode": 16128, + "epoch": 0.03221051874752849, + "loss/policy_avg": 0.0845918357372284, + "lr": 2.8070360429447853e-06, + "objective/entropy": 88.01094818115234, + "objective/kl": 5.74808406829834, + "objective/non_score_reward": -0.2874041795730591, + "objective/rlhf_reward": 0.1683641832760181, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.50971794128418, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.40625, + "step": 671, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974206686019897 + }, + { + "episode": 16152, + "epoch": 0.03225845106709326, + "loss/policy_avg": 0.017645521089434624, + "lr": 2.8067484662576687e-06, + "objective/entropy": 86.8798828125, + "objective/kl": 6.554607391357422, + "objective/non_score_reward": -0.3277303874492645, + "objective/rlhf_reward": -1.9663822203874588, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.87298059463501, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3564453125, + "step": 672, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999701976776123 + }, + { + "episode": 16176, + "epoch": 0.03230638338665804, + "loss/policy_avg": 0.028996344655752182, + "lr": 2.8064608895705525e-06, + "objective/entropy": 91.28887939453125, + "objective/kl": 2.530057907104492, + "objective/non_score_reward": -0.1265029013156891, + "objective/rlhf_reward": 0.9753716133431278, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.622627258300781, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4091796875, + "step": 673, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999524712562561 + }, + { + "episode": 16200, + "epoch": 0.03235431570622281, + "loss/policy_avg": 0.0011674091219902039, + "lr": 2.806173312883436e-06, + "objective/entropy": 70.5921859741211, + "objective/kl": 4.101029396057129, + "objective/non_score_reward": -0.2050514668226242, + "objective/rlhf_reward": 2.5552696757895164, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.9347782135009766, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3349609375, + "step": 674, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010862350463867 + }, + { + "episode": 16224, + "epoch": 0.03240224802578759, + "loss/policy_avg": 0.051978811621665955, + "lr": 2.805885736196319e-06, + "objective/entropy": 114.9510498046875, + "objective/kl": 4.047782897949219, + "objective/non_score_reward": -0.20238915085792542, + "objective/rlhf_reward": 0.5200541868702255, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.070061206817627, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51171875, + "step": 675, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99875807762146 + }, + { + "episode": 16248, + "epoch": 0.03245018034535236, + "loss/policy_avg": 0.008648848161101341, + "lr": 2.8055981595092024e-06, + "objective/entropy": 94.15206909179688, + "objective/kl": 3.8694655895233154, + "objective/non_score_reward": -0.19347326457500458, + "objective/rlhf_reward": -1.160839594900608, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1512869596481323, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.513671875, + "step": 676, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0014500617980957 + }, + { + "episode": 16272, + "epoch": 0.03249811266491714, + "loss/policy_avg": 0.037415727972984314, + "lr": 2.805310582822086e-06, + "objective/entropy": 98.83926391601562, + "objective/kl": 4.354862213134766, + "objective/non_score_reward": -0.21774308383464813, + "objective/rlhf_reward": -1.3064585775136948, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8714044094085693, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.4169921875, + "step": 677, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001909732818604 + }, + { + "episode": 16296, + "epoch": 0.03254604498448191, + "loss/policy_avg": -0.0019232900813221931, + "lr": 2.8050230061349693e-06, + "objective/entropy": 94.42630004882812, + "objective/kl": 3.8474669456481934, + "objective/non_score_reward": -0.1923733651638031, + "objective/rlhf_reward": -1.154240071773529, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.427772045135498, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 678, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990053176879883 + }, + { + "episode": 16320, + "epoch": 0.03259397730404669, + "loss/policy_avg": 0.012594800442457199, + "lr": 2.8047354294478527e-06, + "objective/entropy": 101.46583557128906, + "objective/kl": 6.148866176605225, + "objective/non_score_reward": -0.3074433207511902, + "objective/rlhf_reward": 0.2925832279433144, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.7853195667266846, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.44921875, + "step": 679, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999977946281433 + }, + { + "episode": 16344, + "epoch": 0.03264190962361146, + "loss/policy_avg": -0.015242991037666798, + "lr": 2.804447852760736e-06, + "objective/entropy": 91.35795593261719, + "objective/kl": 4.608936309814453, + "objective/non_score_reward": -0.23044681549072266, + "objective/rlhf_reward": -1.3826809898018837, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.967317581176758, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.419921875, + "step": 680, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999803900718689 + }, + { + "episode": 16368, + "epoch": 0.032689841943176236, + "loss/policy_avg": 0.3839697241783142, + "lr": 2.8041602760736195e-06, + "objective/entropy": 83.71601867675781, + "objective/kl": 2.9531688690185547, + "objective/non_score_reward": -0.14765845239162445, + "objective/rlhf_reward": 5.114049289375544, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.9509658813476562, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3681640625, + "step": 681, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0037946701049805 + }, + { + "episode": 16392, + "epoch": 0.03273777426274101, + "loss/policy_avg": 0.044735897332429886, + "lr": 2.803872699386503e-06, + "objective/entropy": 96.74462890625, + "objective/kl": 5.961175918579102, + "objective/non_score_reward": -0.29805877804756165, + "objective/rlhf_reward": -1.7883526384830475, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6738030910491943, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4013671875, + "step": 682, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996347427368164 + }, + { + "episode": 16416, + "epoch": 0.032785706582305785, + "loss/policy_avg": -0.018163762986660004, + "lr": 2.803585122699387e-06, + "objective/entropy": 85.88985443115234, + "objective/kl": 3.9504146575927734, + "objective/non_score_reward": -0.19752076268196106, + "objective/rlhf_reward": -1.185124471783638, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.371577501296997, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4169921875, + "step": 683, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0029659271240234 + }, + { + "episode": 16440, + "epoch": 0.032833638901870556, + "loss/policy_avg": 0.04655037075281143, + "lr": 2.8032975460122702e-06, + "objective/entropy": 88.13755798339844, + "objective/kl": 4.32558536529541, + "objective/non_score_reward": -0.21627925336360931, + "objective/rlhf_reward": 0.7023244574666022, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.373892068862915, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4111328125, + "step": 684, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002929925918579 + }, + { + "episode": 16464, + "epoch": 0.032881571221435334, + "loss/policy_avg": 0.07055753469467163, + "lr": 2.8030099693251537e-06, + "objective/entropy": 102.58724212646484, + "objective/kl": 5.311849594116211, + "objective/non_score_reward": -0.265592485666275, + "objective/rlhf_reward": -1.5935548096895218, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.176906585693359, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.458984375, + "step": 685, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992914199829102 + }, + { + "episode": 16488, + "epoch": 0.032929503541000105, + "loss/policy_avg": -0.024220827966928482, + "lr": 2.802722392638037e-06, + "objective/entropy": 98.72303771972656, + "objective/kl": 3.3447787761688232, + "objective/non_score_reward": -0.16723893582820892, + "objective/rlhf_reward": 1.317683288042641, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.284590244293213, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.453125, + "step": 686, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001983404159546 + }, + { + "episode": 16512, + "epoch": 0.03297743586056488, + "loss/policy_avg": -0.012027123011648655, + "lr": 2.8024348159509205e-06, + "objective/entropy": 78.47486114501953, + "objective/kl": 3.8769264221191406, + "objective/non_score_reward": -0.19384633004665375, + "objective/rlhf_reward": -1.1630779914557934, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4407821893692017, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4169921875, + "step": 687, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001128911972046 + }, + { + "episode": 16536, + "epoch": 0.033025368180129654, + "loss/policy_avg": 0.11866030097007751, + "lr": 2.802147239263804e-06, + "objective/entropy": 72.361572265625, + "objective/kl": 3.97251033782959, + "objective/non_score_reward": -0.19862553477287292, + "objective/rlhf_reward": -1.1917532086372375, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4714713096618652, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.515625, + "step": 688, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003359317779541 + }, + { + "episode": 16560, + "epoch": 0.03307330049969443, + "loss/policy_avg": 0.058641016483306885, + "lr": 2.8018596625766874e-06, + "objective/entropy": 75.98714447021484, + "objective/kl": 3.868462085723877, + "objective/non_score_reward": -0.1934230923652649, + "objective/rlhf_reward": 0.645641397440556, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.633174419403076, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4912109375, + "step": 689, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9978687763214111 + }, + { + "episode": 16584, + "epoch": 0.0331212328192592, + "loss/policy_avg": 0.022836944088339806, + "lr": 2.8015720858895704e-06, + "objective/entropy": 82.18055725097656, + "objective/kl": 5.156536102294922, + "objective/non_score_reward": -0.2578268051147461, + "objective/rlhf_reward": -1.546960823237896, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8261890411376953, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3642578125, + "step": 690, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992616176605225 + }, + { + "episode": 16608, + "epoch": 0.03316916513882398, + "loss/policy_avg": 0.04143326357007027, + "lr": 2.801284509202454e-06, + "objective/entropy": 90.64967346191406, + "objective/kl": 3.9163427352905273, + "objective/non_score_reward": -0.19581714272499084, + "objective/rlhf_reward": 2.6106756650788, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 0.898766815662384, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4404296875, + "step": 691, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0032382011413574 + }, + { + "episode": 16632, + "epoch": 0.03321709745838876, + "loss/policy_avg": -0.011122960597276688, + "lr": 2.8009969325153372e-06, + "objective/entropy": 118.8199691772461, + "objective/kl": 3.268311023712158, + "objective/non_score_reward": -0.1634155511856079, + "objective/rlhf_reward": 0.753895665694841, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.97409725189209, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5, + "step": 692, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002232551574707 + }, + { + "episode": 16656, + "epoch": 0.03326502977795353, + "loss/policy_avg": 0.020786846056580544, + "lr": 2.8007093558282206e-06, + "objective/entropy": 112.21043395996094, + "objective/kl": 3.140720844268799, + "objective/non_score_reward": -0.15703603625297546, + "objective/rlhf_reward": 1.641843101120183, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.0571141242980957, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46875, + "step": 693, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984453916549683 + }, + { + "episode": 16680, + "epoch": 0.03331296209751831, + "loss/policy_avg": 0.031112581491470337, + "lr": 2.8004217791411045e-06, + "objective/entropy": 89.49346923828125, + "objective/kl": 4.526451110839844, + "objective/non_score_reward": -0.22632253170013428, + "objective/rlhf_reward": 0.3764537453547798, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.4798502922058105, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3916015625, + "step": 694, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983123540878296 + }, + { + "episode": 16704, + "epoch": 0.03336089441708308, + "loss/policy_avg": 0.016581252217292786, + "lr": 2.800134202453988e-06, + "objective/entropy": 119.6799545288086, + "objective/kl": 6.108203887939453, + "objective/non_score_reward": -0.305410236120224, + "objective/rlhf_reward": 1.9531172239166907, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.863758563995361, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.513671875, + "step": 695, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987270832061768 + }, + { + "episode": 16728, + "epoch": 0.033408826736647856, + "loss/policy_avg": 0.04032521694898605, + "lr": 2.7998466257668713e-06, + "objective/entropy": 106.359619140625, + "objective/kl": 5.746870994567871, + "objective/non_score_reward": -0.2873435318470001, + "objective/rlhf_reward": -1.7240611091256142, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.811809539794922, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4423828125, + "step": 696, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998631477355957 + }, + { + "episode": 16752, + "epoch": 0.03345675905621263, + "loss/policy_avg": 0.01577501744031906, + "lr": 2.7995590490797548e-06, + "objective/entropy": 109.61204528808594, + "objective/kl": 5.819454193115234, + "objective/non_score_reward": -0.2909727096557617, + "objective/rlhf_reward": -1.7458362877368927, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.18795108795166, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4501953125, + "step": 697, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99989914894104 + }, + { + "episode": 16776, + "epoch": 0.033504691375777405, + "loss/policy_avg": 0.06474008411169052, + "lr": 2.799271472392638e-06, + "objective/entropy": 126.65843963623047, + "objective/kl": 4.095909118652344, + "objective/non_score_reward": -0.20479540526866913, + "objective/rlhf_reward": -1.2287725247442722, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7258036136627197, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5390625, + "step": 698, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006160736083984 + }, + { + "episode": 16800, + "epoch": 0.033552623695342176, + "loss/policy_avg": -0.009414106607437134, + "lr": 2.7989838957055216e-06, + "objective/entropy": 100.93156433105469, + "objective/kl": 3.970113754272461, + "objective/non_score_reward": -0.1985056847333908, + "objective/rlhf_reward": 0.9462089397419823, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.5446009635925293, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4521484375, + "step": 699, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0052738189697266 + }, + { + "episode": 16824, + "epoch": 0.033600556014906954, + "loss/policy_avg": 0.0793452113866806, + "lr": 2.798696319018405e-06, + "objective/entropy": 107.23753356933594, + "objective/kl": 6.2864508628845215, + "objective/non_score_reward": -0.3143225610256195, + "objective/rlhf_reward": -1.8859353512525558, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.378715991973877, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4794921875, + "step": 700, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967279434204102 + }, + { + "episode": 16848, + "epoch": 0.033648488334471725, + "loss/policy_avg": 0.010690303519368172, + "lr": 2.7984087423312885e-06, + "objective/entropy": 91.05307006835938, + "objective/kl": 1.500659704208374, + "objective/non_score_reward": -0.07503297924995422, + "objective/rlhf_reward": 1.870918975358105, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.232447624206543, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.55859375, + "step": 701, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000957489013672 + }, + { + "episode": 16872, + "epoch": 0.0336964206540365, + "loss/policy_avg": 0.028314707800745964, + "lr": 2.798121165644172e-06, + "objective/entropy": 102.44908142089844, + "objective/kl": 5.620230674743652, + "objective/non_score_reward": -0.28101152181625366, + "objective/rlhf_reward": 0.31393101066350926, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.1480467319488525, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.51953125, + "step": 702, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987521171569824 + }, + { + "episode": 16896, + "epoch": 0.033744352973601274, + "loss/policy_avg": 0.019661687314510345, + "lr": 2.7978335889570553e-06, + "objective/entropy": 106.68318939208984, + "objective/kl": 4.888775825500488, + "objective/non_score_reward": -0.2444387972354889, + "objective/rlhf_reward": 0.4261565257302131, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.1324853897094727, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4814453125, + "step": 703, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000755786895752 + }, + { + "episode": 16920, + "epoch": 0.03379228529316605, + "loss/policy_avg": 0.06972890347242355, + "lr": 2.7975460122699388e-06, + "objective/entropy": 101.07684326171875, + "objective/kl": 3.757908821105957, + "objective/non_score_reward": -0.18789546191692352, + "objective/rlhf_reward": 1.009870388399495, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.724516868591309, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4619140625, + "step": 704, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993127584457397 + }, + { + "episode": 16944, + "epoch": 0.03384021761273082, + "loss/policy_avg": 0.020795978605747223, + "lr": 2.797258435582822e-06, + "objective/entropy": 101.24345397949219, + "objective/kl": 3.265904188156128, + "objective/non_score_reward": -0.1632952094078064, + "objective/rlhf_reward": -0.9797712676227093, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.2967634201049805, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4267578125, + "step": 705, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007450580596924 + }, + { + "episode": 16968, + "epoch": 0.0338881499322956, + "loss/policy_avg": -0.029753942042589188, + "lr": 2.7969708588957056e-06, + "objective/entropy": 127.89506530761719, + "objective/kl": 2.524000644683838, + "objective/non_score_reward": -0.12620002031326294, + "objective/rlhf_reward": 3.028378406999748, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.6228749752044678, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5859375, + "step": 706, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016770362854004 + }, + { + "episode": 16992, + "epoch": 0.03393608225186037, + "loss/policy_avg": 0.005111599341034889, + "lr": 2.796683282208589e-06, + "objective/entropy": 111.81777954101562, + "objective/kl": 4.9906907081604, + "objective/non_score_reward": -0.2495345175266266, + "objective/rlhf_reward": 0.5027929246425628, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.6682634353637695, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.509765625, + "step": 707, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002641677856445 + }, + { + "episode": 17016, + "epoch": 0.03398401457142515, + "loss/policy_avg": 0.023075047880411148, + "lr": 2.7963957055214725e-06, + "objective/entropy": 112.82221984863281, + "objective/kl": 7.65072774887085, + "objective/non_score_reward": -0.3825363516807556, + "objective/rlhf_reward": -2.29521806538105, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3631930351257324, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4697265625, + "step": 708, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994550943374634 + }, + { + "episode": 17040, + "epoch": 0.03403194689098992, + "loss/policy_avg": 0.1201067641377449, + "lr": 2.796108128834356e-06, + "objective/entropy": 103.09459686279297, + "objective/kl": 5.161118507385254, + "objective/non_score_reward": -0.2580559253692627, + "objective/rlhf_reward": -1.548335425555706, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.637876510620117, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46484375, + "step": 709, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0044305324554443 + }, + { + "episode": 17064, + "epoch": 0.0340798792105547, + "loss/policy_avg": 0.11167553067207336, + "lr": 2.7958205521472393e-06, + "objective/entropy": 98.06791687011719, + "objective/kl": 6.272442817687988, + "objective/non_score_reward": -0.3136221468448639, + "objective/rlhf_reward": 0.4393840368438724, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 9.645532608032227, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.451171875, + "step": 710, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9941247701644897 + }, + { + "episode": 17088, + "epoch": 0.03412781153011947, + "loss/policy_avg": 0.04584480822086334, + "lr": 2.7955329754601227e-06, + "objective/entropy": 81.93452453613281, + "objective/kl": 3.1383018493652344, + "objective/non_score_reward": -0.1569150984287262, + "objective/rlhf_reward": 0.7928984418407761, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.077558994293213, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3935546875, + "step": 711, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999780535697937 + }, + { + "episode": 17112, + "epoch": 0.03417574384968425, + "loss/policy_avg": 0.06404899060726166, + "lr": 2.795245398773006e-06, + "objective/entropy": 105.03392028808594, + "objective/kl": 3.6139867305755615, + "objective/non_score_reward": -0.18069933354854584, + "objective/rlhf_reward": 1.0530471660603418, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.1363778114318848, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.453125, + "step": 712, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003039598464966 + }, + { + "episode": 17136, + "epoch": 0.03422367616924902, + "loss/policy_avg": 0.13679960370063782, + "lr": 2.7949578220858896e-06, + "objective/entropy": 93.47728729248047, + "objective/kl": 6.625774383544922, + "objective/non_score_reward": -0.3312886953353882, + "objective/rlhf_reward": -0.1815522203801836, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.848726749420166, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4306640625, + "step": 713, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963405132293701 + }, + { + "episode": 17160, + "epoch": 0.0342716084888138, + "loss/policy_avg": -0.0032400726340711117, + "lr": 2.794670245398773e-06, + "objective/entropy": 83.45506286621094, + "objective/kl": 5.021091938018799, + "objective/non_score_reward": -0.251054584980011, + "objective/rlhf_reward": -1.5063274502754211, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.262059211730957, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.3798828125, + "step": 714, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996957778930664 + }, + { + "episode": 17184, + "epoch": 0.03431954080837857, + "loss/policy_avg": 0.10427850484848022, + "lr": 2.7943826687116564e-06, + "objective/entropy": 112.53851318359375, + "objective/kl": 2.3828535079956055, + "objective/non_score_reward": -0.11914268136024475, + "objective/rlhf_reward": -0.7148560956120491, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.312082290649414, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.53125, + "step": 715, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991588592529297 + }, + { + "episode": 17208, + "epoch": 0.034367473127943346, + "loss/policy_avg": -0.007660915143787861, + "lr": 2.79409509202454e-06, + "objective/entropy": 84.03372955322266, + "objective/kl": 1.6369898319244385, + "objective/non_score_reward": -0.08184950053691864, + "objective/rlhf_reward": -0.49109698459506035, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2708444595336914, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.361328125, + "step": 716, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0044307708740234 + }, + { + "episode": 17232, + "epoch": 0.03441540544750812, + "loss/policy_avg": 0.07618755102157593, + "lr": 2.7938075153374237e-06, + "objective/entropy": 83.79444885253906, + "objective/kl": 7.17683219909668, + "objective/non_score_reward": -0.3588416278362274, + "objective/rlhf_reward": -2.1530496925115585, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.007095813751221, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.359375, + "step": 717, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968452453613281 + }, + { + "episode": 17256, + "epoch": 0.034463337767072895, + "loss/policy_avg": 0.07050332427024841, + "lr": 2.793519938650307e-06, + "objective/entropy": 92.82572937011719, + "objective/kl": 2.681675672531128, + "objective/non_score_reward": -0.13408377766609192, + "objective/rlhf_reward": -0.8045026585459709, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.198982238769531, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.365234375, + "step": 718, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993823766708374 + }, + { + "episode": 17280, + "epoch": 0.034511270086637666, + "loss/policy_avg": 0.056927330791950226, + "lr": 2.79323236196319e-06, + "objective/entropy": 104.02859497070312, + "objective/kl": 6.1074957847595215, + "objective/non_score_reward": -0.30537480115890503, + "objective/rlhf_reward": -1.8322486337274313, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.8842620849609375, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 719, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964358806610107 + }, + { + "episode": 17304, + "epoch": 0.034559202406202444, + "loss/policy_avg": -0.007081905379891396, + "lr": 2.7929447852760736e-06, + "objective/entropy": 114.20758056640625, + "objective/kl": 3.463644027709961, + "objective/non_score_reward": -0.17318221926689148, + "objective/rlhf_reward": -1.039093241095543, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.90466570854187, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.53515625, + "step": 720, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997186660766602 + }, + { + "episode": 17328, + "epoch": 0.034607134725767215, + "loss/policy_avg": 0.015198176726698875, + "lr": 2.792657208588957e-06, + "objective/entropy": 158.06381225585938, + "objective/kl": 2.246577262878418, + "objective/non_score_reward": -0.11232884973287582, + "objective/rlhf_reward": 1.2188161753556337, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 0.8271192312240601, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.671875, + "step": 721, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018062591552734 + }, + { + "episode": 17352, + "epoch": 0.03465506704533199, + "loss/policy_avg": 0.08539818972349167, + "lr": 2.7923696319018404e-06, + "objective/entropy": 129.8774871826172, + "objective/kl": 7.0259599685668945, + "objective/non_score_reward": -0.3512979745864868, + "objective/rlhf_reward": -2.1077877581119537, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.0315985679626465, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.556640625, + "step": 722, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994308948516846 + }, + { + "episode": 17376, + "epoch": 0.034702999364896764, + "loss/policy_avg": 0.10531924664974213, + "lr": 2.792082055214724e-06, + "objective/entropy": 119.16227722167969, + "objective/kl": 5.4193267822265625, + "objective/non_score_reward": -0.2709663510322571, + "objective/rlhf_reward": -1.6257979720830917, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4247498512268066, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.525390625, + "step": 723, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003847360610962 + }, + { + "episode": 17400, + "epoch": 0.03475093168446154, + "loss/policy_avg": 0.014854070730507374, + "lr": 2.7917944785276073e-06, + "objective/entropy": 86.93174743652344, + "objective/kl": 4.050511837005615, + "objective/non_score_reward": -0.20252560079097748, + "objective/rlhf_reward": -1.2151535972952843, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.014798164367676, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.39453125, + "step": 724, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000887393951416 + }, + { + "episode": 17424, + "epoch": 0.03479886400402631, + "loss/policy_avg": 0.13801337778568268, + "lr": 2.7915069018404907e-06, + "objective/entropy": 94.32865905761719, + "objective/kl": 4.1095476150512695, + "objective/non_score_reward": -0.20547738671302795, + "objective/rlhf_reward": 0.9043788917769324, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.868177890777588, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4072265625, + "step": 725, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002524852752686 + }, + { + "episode": 17448, + "epoch": 0.03484679632359109, + "loss/policy_avg": 0.007892458699643612, + "lr": 2.791219325153374e-06, + "objective/entropy": 71.18832397460938, + "objective/kl": 3.7083280086517334, + "objective/non_score_reward": -0.18541643023490906, + "objective/rlhf_reward": -1.112498439848423, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.671597480773926, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3369140625, + "step": 726, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992289543151855 + }, + { + "episode": 17472, + "epoch": 0.03489472864315586, + "loss/policy_avg": 0.05538250505924225, + "lr": 2.7909317484662576e-06, + "objective/entropy": 98.4771728515625, + "objective/kl": 3.403498411178589, + "objective/non_score_reward": -0.17017492651939392, + "objective/rlhf_reward": 0.7133394583956085, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.089088439941406, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.396484375, + "step": 727, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998232126235962 + }, + { + "episode": 17496, + "epoch": 0.03494266096272064, + "loss/policy_avg": 0.11734011769294739, + "lr": 2.7906441717791414e-06, + "objective/entropy": 88.85986328125, + "objective/kl": 3.2424845695495605, + "objective/non_score_reward": -0.16212421655654907, + "objective/rlhf_reward": 2.027254655957222, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.4621543884277344, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3955078125, + "step": 728, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039806365966797 + }, + { + "episode": 17520, + "epoch": 0.03499059328228541, + "loss/policy_avg": 0.015957780182361603, + "lr": 2.790356595092025e-06, + "objective/entropy": 93.19619750976562, + "objective/kl": 4.220281600952148, + "objective/non_score_reward": -0.21101412177085876, + "objective/rlhf_reward": -1.266084685921669, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.449823379516602, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3974609375, + "step": 729, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972922801971436 + }, + { + "episode": 17544, + "epoch": 0.03503852560185019, + "loss/policy_avg": 0.08787181228399277, + "lr": 2.7900690184049083e-06, + "objective/entropy": 99.04100799560547, + "objective/kl": 6.164834022521973, + "objective/non_score_reward": -0.30824172496795654, + "objective/rlhf_reward": 0.7346091327430698, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.952451467514038, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4462890625, + "step": 730, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0017242431640625 + }, + { + "episode": 17568, + "epoch": 0.03508645792141496, + "loss/policy_avg": -0.006226440891623497, + "lr": 2.7897814417177917e-06, + "objective/entropy": 91.38070678710938, + "objective/kl": 4.6299309730529785, + "objective/non_score_reward": -0.23149655759334564, + "objective/rlhf_reward": 0.4172006656767163, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.482537269592285, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.42578125, + "step": 731, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008347034454346 + }, + { + "episode": 17592, + "epoch": 0.03513439024097974, + "loss/policy_avg": 0.06503081321716309, + "lr": 2.789493865030675e-06, + "objective/entropy": 85.8843994140625, + "objective/kl": 3.4764487743377686, + "objective/non_score_reward": -0.17382244765758514, + "objective/rlhf_reward": -1.042934670113027, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3425084352493286, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.3828125, + "step": 732, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006473064422607 + }, + { + "episode": 17616, + "epoch": 0.03518232256054451, + "loss/policy_avg": 0.025212785229086876, + "lr": 2.7892062883435585e-06, + "objective/entropy": 94.51025390625, + "objective/kl": 2.7837893962860107, + "objective/non_score_reward": -0.1391894817352295, + "objective/rlhf_reward": 1.3021063328195943, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.155670166015625, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4453125, + "step": 733, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988410472869873 + }, + { + "episode": 17640, + "epoch": 0.03523025488010929, + "loss/policy_avg": 0.007111940532922745, + "lr": 2.7889187116564415e-06, + "objective/entropy": 101.4713134765625, + "objective/kl": 2.30013370513916, + "objective/non_score_reward": -0.11500667035579681, + "objective/rlhf_reward": -0.6900399848818779, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.462810516357422, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.46484375, + "step": 734, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000722408294678 + }, + { + "episode": 17664, + "epoch": 0.03527818719967406, + "loss/policy_avg": -0.012315358966588974, + "lr": 2.788631134969325e-06, + "objective/entropy": 121.2395248413086, + "objective/kl": 2.4197301864624023, + "objective/non_score_reward": -0.12098650634288788, + "objective/rlhf_reward": -0.7259190380573273, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1454834938049316, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.533203125, + "step": 735, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000669956207275 + }, + { + "episode": 17688, + "epoch": 0.035326119519238836, + "loss/policy_avg": 0.024908840656280518, + "lr": 2.7883435582822084e-06, + "objective/entropy": 88.74043273925781, + "objective/kl": 4.972241401672363, + "objective/non_score_reward": -0.24861207604408264, + "objective/rlhf_reward": 0.24271660595095979, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.770882606506348, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4130859375, + "step": 736, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962129592895508 + }, + { + "episode": 17712, + "epoch": 0.03537405183880361, + "loss/policy_avg": -0.030763089656829834, + "lr": 2.788055981595092e-06, + "objective/entropy": 97.61178588867188, + "objective/kl": 5.8599677085876465, + "objective/non_score_reward": -0.29299840331077576, + "objective/rlhf_reward": 4.242009669542313, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.7183421850204468, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4521484375, + "step": 737, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023114681243896 + }, + { + "episode": 17736, + "epoch": 0.035421984158368384, + "loss/policy_avg": 0.03092562034726143, + "lr": 2.7877684049079757e-06, + "objective/entropy": 60.64031982421875, + "objective/kl": 6.330188274383545, + "objective/non_score_reward": -0.3165094256401062, + "objective/rlhf_reward": 1.1009435653686523, + "objective/scores": 0.5, + "policy/approxkl_avg": 6.580857753753662, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3251953125, + "step": 738, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9996299743652344 + }, + { + "episode": 17760, + "epoch": 0.035469916477933155, + "loss/policy_avg": 0.03484443575143814, + "lr": 2.787480828220859e-06, + "objective/entropy": 78.14405822753906, + "objective/kl": 4.823902606964111, + "objective/non_score_reward": -0.24119512736797333, + "objective/rlhf_reward": -1.4471707344055176, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.333271503448486, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3896484375, + "step": 739, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9961516857147217 + }, + { + "episode": 17784, + "epoch": 0.03551784879749793, + "loss/policy_avg": 0.02564769797027111, + "lr": 2.7871932515337425e-06, + "objective/entropy": 89.4190673828125, + "objective/kl": 3.2454631328582764, + "objective/non_score_reward": -0.16227316856384277, + "objective/rlhf_reward": -0.9736389592289925, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4939374923706055, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.41796875, + "step": 740, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963891506195068 + }, + { + "episode": 17808, + "epoch": 0.035565781117062704, + "loss/policy_avg": 0.10963942855596542, + "lr": 2.786905674846626e-06, + "objective/entropy": 94.960693359375, + "objective/kl": 4.479686737060547, + "objective/non_score_reward": -0.22398434579372406, + "objective/rlhf_reward": 0.5488831747761573, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.206039905548096, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.431640625, + "step": 741, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990242719650269 + }, + { + "episode": 17832, + "epoch": 0.03561371343662748, + "loss/policy_avg": 0.016339588910341263, + "lr": 2.7866180981595094e-06, + "objective/entropy": 82.87503051757812, + "objective/kl": 4.28940486907959, + "objective/non_score_reward": -0.21447020769119263, + "objective/rlhf_reward": -1.2868213132023811, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.297367095947266, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3642578125, + "step": 742, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983983039855957 + }, + { + "episode": 17856, + "epoch": 0.03566164575619225, + "loss/policy_avg": 0.027774496003985405, + "lr": 2.786330521472393e-06, + "objective/entropy": 92.83729553222656, + "objective/kl": 5.372522354125977, + "objective/non_score_reward": -0.2686261236667633, + "objective/rlhf_reward": 0.38824328780174244, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.612212181091309, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4052734375, + "step": 743, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985430240631104 + }, + { + "episode": 17880, + "epoch": 0.03570957807575703, + "loss/policy_avg": 0.056710027158260345, + "lr": 2.7860429447852762e-06, + "objective/entropy": 112.47662353515625, + "objective/kl": 4.207890510559082, + "objective/non_score_reward": -0.21039454638957977, + "objective/rlhf_reward": -1.262367233633995, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2129783630371094, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.517578125, + "step": 744, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997196197509766 + }, + { + "episode": 17904, + "epoch": 0.0357575103953218, + "loss/policy_avg": 0.24912984669208527, + "lr": 2.7857553680981596e-06, + "objective/entropy": 96.34727478027344, + "objective/kl": 3.5249571800231934, + "objective/non_score_reward": -0.17624786496162415, + "objective/rlhf_reward": -1.0574871450662613, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8671133518218994, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.451171875, + "step": 745, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000049591064453 + }, + { + "episode": 17928, + "epoch": 0.03580544271488658, + "loss/policy_avg": -0.003899288596585393, + "lr": 2.785467791411043e-06, + "objective/entropy": 56.94530487060547, + "objective/kl": 4.18492317199707, + "objective/non_score_reward": -0.20924614369869232, + "objective/rlhf_reward": 0.8817662679065598, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.1458748579025269, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.2578125, + "step": 746, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006253719329834 + }, + { + "episode": 17952, + "epoch": 0.03585337503445135, + "loss/policy_avg": 0.06819597631692886, + "lr": 2.7851802147239265e-06, + "objective/entropy": 99.42156219482422, + "objective/kl": 5.690925598144531, + "objective/non_score_reward": -0.2845463156700134, + "objective/rlhf_reward": 1.2927222698926926, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.895784378051758, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4130859375, + "step": 747, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000113010406494 + }, + { + "episode": 17976, + "epoch": 0.03590130735401613, + "loss/policy_avg": 0.03785514086484909, + "lr": 2.78489263803681e-06, + "objective/entropy": 93.50057983398438, + "objective/kl": 5.685895919799805, + "objective/non_score_reward": -0.2842947840690613, + "objective/rlhf_reward": -1.7057687789201736, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.5728983879089355, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.421875, + "step": 748, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9957398176193237 + }, + { + "episode": 18000, + "epoch": 0.03594923967358091, + "loss/policy_avg": 0.16301029920578003, + "lr": 2.7846050613496933e-06, + "objective/entropy": 86.15575408935547, + "objective/kl": 4.2543840408325195, + "objective/non_score_reward": -0.21271920204162598, + "objective/rlhf_reward": -1.276315152645111, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.351012706756592, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.423828125, + "step": 749, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999162197113037 + }, + { + "episode": 18024, + "epoch": 0.03599717199314568, + "loss/policy_avg": 0.09330323338508606, + "lr": 2.7843174846625768e-06, + "objective/entropy": 91.78379821777344, + "objective/kl": 3.987361431121826, + "objective/non_score_reward": -0.19936808943748474, + "objective/rlhf_reward": 4.803791493177414, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.2497762441635132, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.408203125, + "step": 750, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022525787353516 + }, + { + "episode": 18048, + "epoch": 0.036045104312710456, + "loss/policy_avg": 0.0451858714222908, + "lr": 2.78402990797546e-06, + "objective/entropy": 88.0435791015625, + "objective/kl": 6.236573219299316, + "objective/non_score_reward": -0.31182861328125, + "objective/rlhf_reward": -1.8709716796875, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.093326568603516, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3984375, + "step": 751, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977097511291504 + }, + { + "episode": 18072, + "epoch": 0.03609303663227523, + "loss/policy_avg": 0.4149673283100128, + "lr": 2.7837423312883436e-06, + "objective/entropy": 78.98046875, + "objective/kl": 5.816821098327637, + "objective/non_score_reward": -0.2908410429954529, + "objective/rlhf_reward": 2.04053241246764, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.305059432983398, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.37109375, + "step": 752, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001530647277832 + }, + { + "episode": 18096, + "epoch": 0.036140968951840005, + "loss/policy_avg": -0.03065009042620659, + "lr": 2.783454754601227e-06, + "objective/entropy": 107.22837829589844, + "objective/kl": 5.4091596603393555, + "objective/non_score_reward": -0.2704579532146454, + "objective/rlhf_reward": -1.622747641056776, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3968474864959717, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4765625, + "step": 753, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006115436553955 + }, + { + "episode": 18120, + "epoch": 0.036188901271404776, + "loss/policy_avg": 0.07707612961530685, + "lr": 2.7831671779141105e-06, + "objective/entropy": 120.21377563476562, + "objective/kl": 0.6728882789611816, + "objective/non_score_reward": -0.0336444191634655, + "objective/rlhf_reward": 5.798133485019207, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.7228012084960938, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 754, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99937105178833 + }, + { + "episode": 18144, + "epoch": 0.036236833590969554, + "loss/policy_avg": 0.038433440029621124, + "lr": 2.782879601226994e-06, + "objective/entropy": 97.5404281616211, + "objective/kl": 5.639504432678223, + "objective/non_score_reward": -0.2819752097129822, + "objective/rlhf_reward": 0.4453919537772072, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 8.059606552124023, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4443359375, + "step": 755, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022943019866943 + }, + { + "episode": 18168, + "epoch": 0.036284765910534325, + "loss/policy_avg": 0.026101592928171158, + "lr": 2.7825920245398773e-06, + "objective/entropy": 73.35076904296875, + "objective/kl": 4.499371528625488, + "objective/non_score_reward": -0.2249685823917389, + "objective/rlhf_reward": 1.234247906243989, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.4053056240081787, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.34375, + "step": 756, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998135566711426 + }, + { + "episode": 18192, + "epoch": 0.0363326982300991, + "loss/policy_avg": -0.03225627541542053, + "lr": 2.7823044478527608e-06, + "objective/entropy": 97.56829833984375, + "objective/kl": 3.0185704231262207, + "objective/non_score_reward": -0.1509285271167755, + "objective/rlhf_reward": -0.9055710695683956, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5770065784454346, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4326171875, + "step": 757, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030159950256348 + }, + { + "episode": 18216, + "epoch": 0.036380630549663874, + "loss/policy_avg": 0.004922188818454742, + "lr": 2.782016871165644e-06, + "objective/entropy": 106.77813720703125, + "objective/kl": 6.037500381469727, + "objective/non_score_reward": -0.3018750250339508, + "objective/rlhf_reward": -1.8112501502037048, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.967162609100342, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.478515625, + "step": 758, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003398895263672 + }, + { + "episode": 18240, + "epoch": 0.03642856286922865, + "loss/policy_avg": -0.022680379450321198, + "lr": 2.7817292944785276e-06, + "objective/entropy": 89.08116149902344, + "objective/kl": 3.2653005123138428, + "objective/non_score_reward": -0.16326504945755005, + "objective/rlhf_reward": -0.9795902743935585, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.803141713142395, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.40234375, + "step": 759, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004751682281494 + }, + { + "episode": 18264, + "epoch": 0.03647649518879342, + "loss/policy_avg": 0.06717869639396667, + "lr": 2.781441717791411e-06, + "objective/entropy": 77.81326293945312, + "objective/kl": 6.749392509460449, + "objective/non_score_reward": -0.3374696373939514, + "objective/rlhf_reward": 0.11242548268629432, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.56482458114624, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4072265625, + "step": 760, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999211311340332 + }, + { + "episode": 18288, + "epoch": 0.0365244275083582, + "loss/policy_avg": 0.06153953820466995, + "lr": 2.7811541411042945e-06, + "objective/entropy": 100.3778076171875, + "objective/kl": 5.479520797729492, + "objective/non_score_reward": -0.27397605776786804, + "objective/rlhf_reward": -1.64385624229908, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.036687850952148, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4853515625, + "step": 761, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9987432956695557 + }, + { + "episode": 18312, + "epoch": 0.03657235982792297, + "loss/policy_avg": -0.06674277782440186, + "lr": 2.7808665644171783e-06, + "objective/entropy": 72.38360595703125, + "objective/kl": 3.916440486907959, + "objective/non_score_reward": -0.19582204520702362, + "objective/rlhf_reward": 4.825067892670631, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.5121889114379883, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3525390625, + "step": 762, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004079341888428 + }, + { + "episode": 18336, + "epoch": 0.03662029214748775, + "loss/policy_avg": 0.19961640238761902, + "lr": 2.7805789877300617e-06, + "objective/entropy": 109.44195556640625, + "objective/kl": 4.7736968994140625, + "objective/non_score_reward": -0.23868483304977417, + "objective/rlhf_reward": -1.4321089945733547, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.88626766204834, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4951171875, + "step": 763, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99753999710083 + }, + { + "episode": 18360, + "epoch": 0.03666822446705252, + "loss/policy_avg": 0.03949819132685661, + "lr": 2.7802914110429447e-06, + "objective/entropy": 89.58146667480469, + "objective/kl": 5.035233497619629, + "objective/non_score_reward": -0.25176167488098145, + "objective/rlhf_reward": 2.2750085466486625, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.885690450668335, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3896484375, + "step": 764, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000642776489258 + }, + { + "episode": 18384, + "epoch": 0.0367161567866173, + "loss/policy_avg": 0.02423580177128315, + "lr": 2.780003834355828e-06, + "objective/entropy": 109.71763610839844, + "objective/kl": 6.046716690063477, + "objective/non_score_reward": -0.30233585834503174, + "objective/rlhf_reward": -1.8140151090919971, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.449071884155273, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4599609375, + "step": 765, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9952600002288818 + }, + { + "episode": 18408, + "epoch": 0.03676408910618207, + "loss/policy_avg": 0.1139398068189621, + "lr": 2.7797162576687116e-06, + "objective/entropy": 95.64404296875, + "objective/kl": 6.326751708984375, + "objective/non_score_reward": -0.31633755564689636, + "objective/rlhf_reward": 0.4230915095258716, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.283299446105957, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4306640625, + "step": 766, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99912428855896 + }, + { + "episode": 18432, + "epoch": 0.03681202142574685, + "loss/policy_avg": 0.07847250998020172, + "lr": 2.779428680981595e-06, + "objective/entropy": 82.99984741210938, + "objective/kl": 8.974623680114746, + "objective/non_score_reward": -0.4487311542034149, + "objective/rlhf_reward": -2.6923868656158447, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.083880424499512, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.37109375, + "step": 767, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967540502548218 + }, + { + "episode": 18456, + "epoch": 0.03685995374531162, + "loss/policy_avg": 0.025850310921669006, + "lr": 2.7791411042944784e-06, + "objective/entropy": 54.08171081542969, + "objective/kl": 4.000916481018066, + "objective/non_score_reward": -0.20004580914974213, + "objective/rlhf_reward": -1.2002748399972916, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.019412994384766, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.36328125, + "step": 768, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0015642642974854 + }, + { + "episode": 18480, + "epoch": 0.0369078860648764, + "loss/policy_avg": 0.011793037876486778, + "lr": 2.778853527607362e-06, + "objective/entropy": 68.54625701904297, + "objective/kl": 4.999691009521484, + "objective/non_score_reward": -0.24998459219932556, + "objective/rlhf_reward": 4.500092590227723, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.163143157958984, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.326171875, + "step": 769, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986432790756226 + }, + { + "episode": 18504, + "epoch": 0.03695581838444117, + "loss/policy_avg": 0.013629766181111336, + "lr": 2.7785659509202453e-06, + "objective/entropy": 97.47563934326172, + "objective/kl": 4.93257999420166, + "objective/non_score_reward": -0.246628999710083, + "objective/rlhf_reward": 1.1042853203775378, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.0703868865966797, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 770, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997673511505127 + }, + { + "episode": 18528, + "epoch": 0.037003750704005946, + "loss/policy_avg": -0.038465917110443115, + "lr": 2.7782783742331287e-06, + "objective/entropy": 86.32704162597656, + "objective/kl": 5.626559257507324, + "objective/non_score_reward": -0.2813279628753662, + "objective/rlhf_reward": -1.6879678294062614, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.9272661209106445, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.42578125, + "step": 771, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994585514068604 + }, + { + "episode": 18552, + "epoch": 0.03705168302357072, + "loss/policy_avg": 0.026842258870601654, + "lr": 2.7779907975460126e-06, + "objective/entropy": 76.15080261230469, + "objective/kl": 6.670185089111328, + "objective/non_score_reward": -0.33350926637649536, + "objective/rlhf_reward": -2.001055583357811, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3616268634796143, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.37109375, + "step": 772, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991178512573242 + }, + { + "episode": 18576, + "epoch": 0.037099615343135495, + "loss/policy_avg": 0.04850543662905693, + "lr": 2.777703220858896e-06, + "objective/entropy": 70.3290023803711, + "objective/kl": 4.530344009399414, + "objective/non_score_reward": -0.2265172302722931, + "objective/rlhf_reward": -1.3591032922267914, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.543682813644409, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3134765625, + "step": 773, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997136116027832 + }, + { + "episode": 18600, + "epoch": 0.037147547662700266, + "loss/policy_avg": 0.015337832272052765, + "lr": 2.7774156441717794e-06, + "objective/entropy": 123.72618103027344, + "objective/kl": 4.352928161621094, + "objective/non_score_reward": -0.21764642000198364, + "objective/rlhf_reward": -1.3058784753084183, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.7537943720817566, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.544921875, + "step": 774, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0031070709228516 + }, + { + "episode": 18624, + "epoch": 0.037195479982265044, + "loss/policy_avg": -0.0023349891416728497, + "lr": 2.777128067484663e-06, + "objective/entropy": 91.73686218261719, + "objective/kl": 2.7099223136901855, + "objective/non_score_reward": -0.135496124625206, + "objective/rlhf_reward": -0.8129766769707203, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8431189060211182, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.548828125, + "step": 775, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.001204252243042 + }, + { + "episode": 18648, + "epoch": 0.037243412301829815, + "loss/policy_avg": 0.0066936444491147995, + "lr": 2.7768404907975463e-06, + "objective/entropy": 111.25732421875, + "objective/kl": 6.928802490234375, + "objective/non_score_reward": -0.3464401364326477, + "objective/rlhf_reward": 0.2424761589218143, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 5.519787788391113, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4873046875, + "step": 776, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968340396881104 + }, + { + "episode": 18672, + "epoch": 0.03729134462139459, + "loss/policy_avg": 0.20706447958946228, + "lr": 2.7765529141104297e-06, + "objective/entropy": 84.09353637695312, + "objective/kl": 5.760265350341797, + "objective/non_score_reward": -0.2880132794380188, + "objective/rlhf_reward": 0.40916353542698747, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.48813533782959, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3759765625, + "step": 777, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001509666442871 + }, + { + "episode": 18696, + "epoch": 0.037339276940959364, + "loss/policy_avg": -0.01147092878818512, + "lr": 2.7762653374233127e-06, + "objective/entropy": 111.05123138427734, + "objective/kl": 5.103515625, + "objective/non_score_reward": -0.25517579913139343, + "objective/rlhf_reward": -1.5310546457767487, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.482524871826172, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4892578125, + "step": 778, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9968037605285645 + }, + { + "episode": 18720, + "epoch": 0.03738720926052414, + "loss/policy_avg": 0.09854044765233994, + "lr": 2.775977760736196e-06, + "objective/entropy": 114.5250473022461, + "objective/kl": 5.7733001708984375, + "objective/non_score_reward": -0.2886649966239929, + "objective/rlhf_reward": -1.7319899871945381, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.220668315887451, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.521484375, + "step": 779, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999850869178772 + }, + { + "episode": 18744, + "epoch": 0.03743514158008891, + "loss/policy_avg": 0.08718999475240707, + "lr": 2.7756901840490796e-06, + "objective/entropy": 150.9942626953125, + "objective/kl": 2.8566081523895264, + "objective/non_score_reward": -0.14283041656017303, + "objective/rlhf_reward": -0.85698252171278, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2518134117126465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.640625, + "step": 780, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999668002128601 + }, + { + "episode": 18768, + "epoch": 0.03748307389965369, + "loss/policy_avg": 0.020397033542394638, + "lr": 2.775402607361963e-06, + "objective/entropy": 98.83026123046875, + "objective/kl": 3.6620230674743652, + "objective/non_score_reward": -0.18310117721557617, + "objective/rlhf_reward": 0.7941823017290439, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.264141321182251, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.470703125, + "step": 781, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996753931045532 + }, + { + "episode": 18792, + "epoch": 0.03753100621921846, + "loss/policy_avg": 0.0056516630575060844, + "lr": 2.775115030674847e-06, + "objective/entropy": 123.05590057373047, + "objective/kl": 3.564337730407715, + "objective/non_score_reward": -0.17821690440177917, + "objective/rlhf_reward": 2.716277143446844, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.151270627975464, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.564453125, + "step": 782, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011439323425293 + }, + { + "episode": 18816, + "epoch": 0.03757893853878324, + "loss/policy_avg": 0.017934028059244156, + "lr": 2.7748274539877303e-06, + "objective/entropy": 111.56707763671875, + "objective/kl": 3.377070188522339, + "objective/non_score_reward": -0.1688535064458847, + "objective/rlhf_reward": 0.8796682779184188, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.7617251873016357, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.515625, + "step": 783, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000300407409668 + }, + { + "episode": 18840, + "epoch": 0.03762687085834801, + "loss/policy_avg": -0.0041727181524038315, + "lr": 2.7745398773006137e-06, + "objective/entropy": 84.2708740234375, + "objective/kl": 3.9926199913024902, + "objective/non_score_reward": -0.1996309757232666, + "objective/rlhf_reward": -1.1977859139442444, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7265658378601074, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3994140625, + "step": 784, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013957023620605 + }, + { + "episode": 18864, + "epoch": 0.03767480317791279, + "loss/policy_avg": 0.024382026866078377, + "lr": 2.774252300613497e-06, + "objective/entropy": 85.52305603027344, + "objective/kl": 4.195804595947266, + "objective/non_score_reward": -0.20979022979736328, + "objective/rlhf_reward": 1.0623754348207477, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.6655771732330322, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3916015625, + "step": 785, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003176689147949 + }, + { + "episode": 18888, + "epoch": 0.03772273549747756, + "loss/policy_avg": 0.04530486464500427, + "lr": 2.7739647239263805e-06, + "objective/entropy": 118.19273376464844, + "objective/kl": 4.712536811828613, + "objective/non_score_reward": -0.23562681674957275, + "objective/rlhf_reward": -1.4137610048055649, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.816611289978027, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 786, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978312253952026 + }, + { + "episode": 18912, + "epoch": 0.03777066781704234, + "loss/policy_avg": -0.0024204282090067863, + "lr": 2.773677147239264e-06, + "objective/entropy": 84.1767578125, + "objective/kl": 5.237006664276123, + "objective/non_score_reward": -0.26185035705566406, + "objective/rlhf_reward": 2.2144764983040504, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.969402313232422, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3779296875, + "step": 787, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001199960708618 + }, + { + "episode": 18936, + "epoch": 0.03781860013660711, + "loss/policy_avg": 0.0719216912984848, + "lr": 2.7733895705521474e-06, + "objective/entropy": 101.81163787841797, + "objective/kl": 6.171901702880859, + "objective/non_score_reward": -0.30859509110450745, + "objective/rlhf_reward": -1.8515705615282059, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.568731307983398, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4423828125, + "step": 788, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967327117919922 + }, + { + "episode": 18960, + "epoch": 0.037866532456171886, + "loss/policy_avg": 0.27508828043937683, + "lr": 2.773101993865031e-06, + "objective/entropy": 111.6267318725586, + "objective/kl": 7.142085075378418, + "objective/non_score_reward": -0.35710424184799194, + "objective/rlhf_reward": 0.4414339122535679, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.284224033355713, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.48046875, + "step": 789, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000262498855591 + }, + { + "episode": 18984, + "epoch": 0.03791446477573666, + "loss/policy_avg": -0.006008709315210581, + "lr": 2.7728144171779142e-06, + "objective/entropy": 109.97944641113281, + "objective/kl": 4.237686634063721, + "objective/non_score_reward": -0.21188434958457947, + "objective/rlhf_reward": 0.8659370549429787, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.3626675605773926, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4775390625, + "step": 790, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011210441589355 + }, + { + "episode": 19008, + "epoch": 0.037962397095301435, + "loss/policy_avg": 0.3601503372192383, + "lr": 2.7725268404907977e-06, + "objective/entropy": 75.80169677734375, + "objective/kl": 4.782435417175293, + "objective/non_score_reward": -0.23912176489830017, + "objective/rlhf_reward": -1.4347305223345757, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.025783061981201, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.34375, + "step": 791, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999582290649414 + }, + { + "episode": 19032, + "epoch": 0.038010329414866206, + "loss/policy_avg": 0.06423256546258926, + "lr": 2.772239263803681e-06, + "objective/entropy": 95.48409271240234, + "objective/kl": 4.233428001403809, + "objective/non_score_reward": -0.2116713970899582, + "objective/rlhf_reward": 1.3140310702087374, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.9178431034088135, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4521484375, + "step": 792, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999723434448242 + }, + { + "episode": 19056, + "epoch": 0.038058261734430984, + "loss/policy_avg": 0.007162738591432571, + "lr": 2.7719516871165645e-06, + "objective/entropy": 54.80524444580078, + "objective/kl": 4.506689548492432, + "objective/non_score_reward": -0.22533446550369263, + "objective/rlhf_reward": -1.352006733417511, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5609936714172363, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3427734375, + "step": 793, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991965293884277 + }, + { + "episode": 19080, + "epoch": 0.038106194053995755, + "loss/policy_avg": 0.003064230550080538, + "lr": 2.771664110429448e-06, + "objective/entropy": 91.19677734375, + "objective/kl": 3.7535312175750732, + "objective/non_score_reward": -0.1876765936613083, + "objective/rlhf_reward": 0.8739405572414397, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.3720126152038574, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4453125, + "step": 794, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002960205078125 + }, + { + "episode": 19104, + "epoch": 0.03815412637356053, + "loss/policy_avg": 0.0073019652627408504, + "lr": 2.7713765337423314e-06, + "objective/entropy": 99.40856170654297, + "objective/kl": 4.094510555267334, + "objective/non_score_reward": -0.2047255039215088, + "objective/rlhf_reward": -1.2283530980348587, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.393102169036865, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.412109375, + "step": 795, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009632110595703 + }, + { + "episode": 19128, + "epoch": 0.038202058693125304, + "loss/policy_avg": 0.024274475872516632, + "lr": 2.771088957055215e-06, + "objective/entropy": 74.23558044433594, + "objective/kl": 3.8404784202575684, + "objective/non_score_reward": -0.19202391803264618, + "objective/rlhf_reward": 0.985099577199353, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.1912550926208496, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.33203125, + "step": 796, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001590251922607 + }, + { + "episode": 19152, + "epoch": 0.03824999101269008, + "loss/policy_avg": 0.13562335073947906, + "lr": 2.7708013803680982e-06, + "objective/entropy": 85.50540161132812, + "objective/kl": 5.960662841796875, + "objective/non_score_reward": -0.2980331778526306, + "objective/rlhf_reward": -1.7881988547742367, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7445268630981445, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.509765625, + "step": 797, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0013182163238525 + }, + { + "episode": 19176, + "epoch": 0.03829792333225485, + "loss/policy_avg": 0.012542969547212124, + "lr": 2.7705138036809816e-06, + "objective/entropy": 82.32984924316406, + "objective/kl": 5.1433305740356445, + "objective/non_score_reward": -0.2571665346622467, + "objective/rlhf_reward": 2.2425793060046844, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.409942865371704, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3798828125, + "step": 798, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003058910369873 + }, + { + "episode": 19200, + "epoch": 0.03834585565181963, + "loss/policy_avg": 0.019332444295287132, + "lr": 2.770226226993865e-06, + "objective/entropy": 70.5998764038086, + "objective/kl": 3.46431827545166, + "objective/non_score_reward": -0.17321592569351196, + "objective/rlhf_reward": 0.6950933813945137, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.0147080421447754, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3291015625, + "step": 799, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003209352493286 + }, + { + "episode": 19224, + "epoch": 0.0383937879713844, + "loss/policy_avg": 0.1092795580625534, + "lr": 2.7699386503067485e-06, + "objective/entropy": 67.04051971435547, + "objective/kl": 4.605186462402344, + "objective/non_score_reward": -0.23025929927825928, + "objective/rlhf_reward": -1.3815558142960072, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.319470167160034, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3076171875, + "step": 800, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9971246719360352 + }, + { + "episode": 19248, + "epoch": 0.03844172029094918, + "loss/policy_avg": 0.13393041491508484, + "lr": 2.769651073619632e-06, + "objective/entropy": 105.3885726928711, + "objective/kl": 4.86433219909668, + "objective/non_score_reward": -0.2432166188955307, + "objective/rlhf_reward": -1.4592997133731842, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7161865234375, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 801, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0045337677001953 + }, + { + "episode": 19272, + "epoch": 0.03848965261051395, + "loss/policy_avg": 0.28256598114967346, + "lr": 2.7693634969325153e-06, + "objective/entropy": 78.38439178466797, + "objective/kl": 6.907025337219238, + "objective/non_score_reward": -0.3453512191772461, + "objective/rlhf_reward": -2.0721073746681213, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7634425163269043, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.373046875, + "step": 802, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999645471572876 + }, + { + "episode": 19296, + "epoch": 0.03853758493007873, + "loss/policy_avg": 0.01812692917883396, + "lr": 2.7690759202453988e-06, + "objective/entropy": 132.48806762695312, + "objective/kl": 6.092770576477051, + "objective/non_score_reward": -0.30463850498199463, + "objective/rlhf_reward": -1.8278310745954514, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.884298801422119, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.60546875, + "step": 803, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980494976043701 + }, + { + "episode": 19320, + "epoch": 0.0385855172496435, + "loss/policy_avg": 0.007059426978230476, + "lr": 2.768788343558282e-06, + "objective/entropy": 121.7144775390625, + "objective/kl": 4.22022819519043, + "objective/non_score_reward": -0.2110113948583603, + "objective/rlhf_reward": 0.7339315861463546, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.6252875328063965, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.529296875, + "step": 804, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990060329437256 + }, + { + "episode": 19344, + "epoch": 0.03863344956920828, + "loss/policy_avg": 0.07563814520835876, + "lr": 2.7685007668711656e-06, + "objective/entropy": 113.2296142578125, + "objective/kl": 6.071664333343506, + "objective/non_score_reward": -0.3035832345485687, + "objective/rlhf_reward": -1.8214992582798004, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.968968391418457, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4677734375, + "step": 805, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996099472045898 + }, + { + "episode": 19368, + "epoch": 0.03868138188877305, + "loss/policy_avg": 0.1319531351327896, + "lr": 2.7682131901840495e-06, + "objective/entropy": 94.48675537109375, + "objective/kl": 6.95914363861084, + "objective/non_score_reward": -0.34795713424682617, + "objective/rlhf_reward": -2.087742954492569, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.660217761993408, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.451171875, + "step": 806, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991017580032349 + }, + { + "episode": 19392, + "epoch": 0.03872931420833783, + "loss/policy_avg": -0.021271485835313797, + "lr": 2.767925613496933e-06, + "objective/entropy": 104.99981689453125, + "objective/kl": 5.471705436706543, + "objective/non_score_reward": -0.2735852599143982, + "objective/rlhf_reward": -1.6415113657712936, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8878135681152344, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4873046875, + "step": 807, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0018134117126465 + }, + { + "episode": 19416, + "epoch": 0.038777246527902605, + "loss/policy_avg": 0.0761772021651268, + "lr": 2.767638036809816e-06, + "objective/entropy": 113.0139389038086, + "objective/kl": 6.711310386657715, + "objective/non_score_reward": -0.3355655372142792, + "objective/rlhf_reward": 0.3077237318802837, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 8.774911880493164, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4873046875, + "step": 808, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9962475299835205 + }, + { + "episode": 19440, + "epoch": 0.038825178847467376, + "loss/policy_avg": 0.018107634037733078, + "lr": 2.7673504601226993e-06, + "objective/entropy": 83.24603271484375, + "objective/kl": 5.176031112670898, + "objective/non_score_reward": -0.25880157947540283, + "objective/rlhf_reward": -1.5528093725442886, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.035181999206543, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.39453125, + "step": 809, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9980095624923706 + }, + { + "episode": 19464, + "epoch": 0.038873111167032154, + "loss/policy_avg": 0.06706026196479797, + "lr": 2.7670628834355828e-06, + "objective/entropy": 84.4073486328125, + "objective/kl": 5.993706703186035, + "objective/non_score_reward": -0.2996853291988373, + "objective/rlhf_reward": -1.7981118634343147, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3937606811523438, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3623046875, + "step": 810, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000274658203125 + }, + { + "episode": 19488, + "epoch": 0.038921043486596925, + "loss/policy_avg": -0.003910548519343138, + "lr": 2.766775306748466e-06, + "objective/entropy": 84.22377014160156, + "objective/kl": 5.8457794189453125, + "objective/non_score_reward": -0.29228901863098145, + "objective/rlhf_reward": -1.753733903169632, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.7991180419921875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3955078125, + "step": 811, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99789559841156 + }, + { + "episode": 19512, + "epoch": 0.0389689758061617, + "loss/policy_avg": 0.062228281050920486, + "lr": 2.7664877300613496e-06, + "objective/entropy": 100.55255126953125, + "objective/kl": 4.8649516105651855, + "objective/non_score_reward": -0.24324756860733032, + "objective/rlhf_reward": -1.4594853818416595, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.599847793579102, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 812, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9985090494155884 + }, + { + "episode": 19536, + "epoch": 0.039016908125726474, + "loss/policy_avg": 0.11128182709217072, + "lr": 2.766200153374233e-06, + "objective/entropy": 84.16635131835938, + "objective/kl": 6.031026840209961, + "objective/non_score_reward": -0.30155134201049805, + "objective/rlhf_reward": 4.190691988915205, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.555721759796143, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3564453125, + "step": 813, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996315360069275 + }, + { + "episode": 19560, + "epoch": 0.03906484044529125, + "loss/policy_avg": 0.05366883426904678, + "lr": 2.7659125766871165e-06, + "objective/entropy": 73.23033142089844, + "objective/kl": 2.1473348140716553, + "objective/non_score_reward": -0.10736674070358276, + "objective/rlhf_reward": -0.6442004181444645, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1040027141571045, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4736328125, + "step": 814, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0026237964630127 + }, + { + "episode": 19584, + "epoch": 0.03911277276485602, + "loss/policy_avg": 1.3900920748710632e-05, + "lr": 2.765625e-06, + "objective/entropy": 128.50836181640625, + "objective/kl": 5.358626842498779, + "objective/non_score_reward": -0.26793134212493896, + "objective/rlhf_reward": 1.3924119919538498, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.5414252281188965, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.56640625, + "step": 815, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015034675598145 + }, + { + "episode": 19608, + "epoch": 0.0391607050844208, + "loss/policy_avg": 0.16122160851955414, + "lr": 2.7653374233128837e-06, + "objective/entropy": 105.82235717773438, + "objective/kl": 5.051989555358887, + "objective/non_score_reward": -0.25259947776794434, + "objective/rlhf_reward": 1.068462528398821, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 5.478572845458984, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.482421875, + "step": 816, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974201917648315 + }, + { + "episode": 19632, + "epoch": 0.03920863740398557, + "loss/policy_avg": 0.03654171898961067, + "lr": 2.765049846625767e-06, + "objective/entropy": 69.11418914794922, + "objective/kl": 3.575934886932373, + "objective/non_score_reward": -0.17879675328731537, + "objective/rlhf_reward": 4.92721950262785, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.1245384216308594, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.333984375, + "step": 817, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998886585235596 + }, + { + "episode": 19656, + "epoch": 0.03925656972355035, + "loss/policy_avg": 0.05904377996921539, + "lr": 2.7647622699386506e-06, + "objective/entropy": 115.7452621459961, + "objective/kl": 4.961341381072998, + "objective/non_score_reward": -0.24806706607341766, + "objective/rlhf_reward": 0.4043868717244472, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 5.301900863647461, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.521484375, + "step": 818, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003398895263672 + }, + { + "episode": 19680, + "epoch": 0.03930450204311512, + "loss/policy_avg": 0.05074892193078995, + "lr": 2.764474693251534e-06, + "objective/entropy": 101.5160140991211, + "objective/kl": 5.430910110473633, + "objective/non_score_reward": -0.27154552936553955, + "objective/rlhf_reward": 2.1563054048401527, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.3168843984603882, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.416015625, + "step": 819, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015063285827637 + }, + { + "episode": 19704, + "epoch": 0.0393524343626799, + "loss/policy_avg": 0.07415074855089188, + "lr": 2.7641871165644174e-06, + "objective/entropy": 107.78367614746094, + "objective/kl": 5.492318630218506, + "objective/non_score_reward": -0.27461594343185425, + "objective/rlhf_reward": 0.3523043692111968, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.081099987030029, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4970703125, + "step": 820, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995880126953125 + }, + { + "episode": 19728, + "epoch": 0.03940036668224467, + "loss/policy_avg": 0.05685888230800629, + "lr": 2.763899539877301e-06, + "objective/entropy": 110.51863861083984, + "objective/kl": 7.1427412033081055, + "objective/non_score_reward": -0.3571370244026184, + "objective/rlhf_reward": 1.642756479321163, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 6.41409969329834, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.494140625, + "step": 821, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970641136169434 + }, + { + "episode": 19752, + "epoch": 0.03944829900180945, + "loss/policy_avg": 0.11904612183570862, + "lr": 2.7636119631901843e-06, + "objective/entropy": 119.91838073730469, + "objective/kl": 4.42469596862793, + "objective/non_score_reward": -0.22123479843139648, + "objective/rlhf_reward": 0.9937080975223544, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.293944835662842, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 822, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999306321144104 + }, + { + "episode": 19776, + "epoch": 0.03949623132137422, + "loss/policy_avg": -0.030578110367059708, + "lr": 2.7633243865030673e-06, + "objective/entropy": 100.55270385742188, + "objective/kl": 5.412230968475342, + "objective/non_score_reward": -0.27061158418655396, + "objective/rlhf_reward": 0.1825106141508851, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.3859965801239014, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 823, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019192695617676 + }, + { + "episode": 19800, + "epoch": 0.039544163640939, + "loss/policy_avg": -0.11576860398054123, + "lr": 2.7630368098159507e-06, + "objective/entropy": 87.00970458984375, + "objective/kl": 6.5206451416015625, + "objective/non_score_reward": -0.32603228092193604, + "objective/rlhf_reward": 1.0438062511384487, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.4194695949554443, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.390625, + "step": 824, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003126621246338 + }, + { + "episode": 19824, + "epoch": 0.03959209596050377, + "loss/policy_avg": 0.20039743185043335, + "lr": 2.762749233128834e-06, + "objective/entropy": 110.70524597167969, + "objective/kl": 4.405543327331543, + "objective/non_score_reward": -0.22027719020843506, + "objective/rlhf_reward": 1.2623962667943927, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 8.91673755645752, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4775390625, + "step": 825, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976433515548706 + }, + { + "episode": 19848, + "epoch": 0.039640028280068546, + "loss/policy_avg": 0.0715736523270607, + "lr": 2.7624616564417176e-06, + "objective/entropy": 105.395751953125, + "objective/kl": 6.86796760559082, + "objective/non_score_reward": -0.34339842200279236, + "objective/rlhf_reward": -2.060390383005142, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.189173698425293, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4619140625, + "step": 826, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0026631355285645 + }, + { + "episode": 19872, + "epoch": 0.039687960599633317, + "loss/policy_avg": 0.18104898929595947, + "lr": 2.7621740797546014e-06, + "objective/entropy": 82.89332580566406, + "objective/kl": 4.62474250793457, + "objective/non_score_reward": -0.23123714327812195, + "objective/rlhf_reward": 0.7498203309659494, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 8.115823745727539, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.453125, + "step": 827, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9951534271240234 + }, + { + "episode": 19896, + "epoch": 0.039735892919198094, + "loss/policy_avg": -0.00045467447489500046, + "lr": 2.761886503067485e-06, + "objective/entropy": 92.42704772949219, + "objective/kl": 3.597553253173828, + "objective/non_score_reward": -0.17987768352031708, + "objective/rlhf_reward": 0.920733956620097, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.389037609100342, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.427734375, + "step": 828, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998868703842163 + }, + { + "episode": 19920, + "epoch": 0.039783825238762865, + "loss/policy_avg": 0.019980579614639282, + "lr": 2.7615989263803683e-06, + "objective/entropy": 103.88983154296875, + "objective/kl": 5.050657272338867, + "objective/non_score_reward": -0.25253283977508545, + "objective/rlhf_reward": -1.515197154134512, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3787875175476074, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.462890625, + "step": 829, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993971586227417 + }, + { + "episode": 19944, + "epoch": 0.03983175755832764, + "loss/policy_avg": 0.1518014371395111, + "lr": 2.7613113496932517e-06, + "objective/entropy": 105.58370971679688, + "objective/kl": 5.3396196365356445, + "objective/non_score_reward": -0.26698094606399536, + "objective/rlhf_reward": 0.29090353962691684, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.3401031494140625, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.51953125, + "step": 830, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994797706604004 + }, + { + "episode": 19968, + "epoch": 0.039879689877892414, + "loss/policy_avg": 0.04988197237253189, + "lr": 2.761023773006135e-06, + "objective/entropy": 113.88935852050781, + "objective/kl": 3.677480697631836, + "objective/non_score_reward": -0.18387405574321747, + "objective/rlhf_reward": 1.8967557474970818, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.969912052154541, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.546875, + "step": 831, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0024254322052 + }, + { + "episode": 19992, + "epoch": 0.03992762219745719, + "loss/policy_avg": 0.04003171622753143, + "lr": 2.7607361963190186e-06, + "objective/entropy": 94.63218688964844, + "objective/kl": 3.5297412872314453, + "objective/non_score_reward": -0.17648707330226898, + "objective/rlhf_reward": 0.74725749319208, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.2722699642181396, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4453125, + "step": 832, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985119104385376 + }, + { + "episode": 20016, + "epoch": 0.03997555451702196, + "loss/policy_avg": 0.040343496948480606, + "lr": 2.760448619631902e-06, + "objective/entropy": 95.25132751464844, + "objective/kl": 6.507745265960693, + "objective/non_score_reward": -0.32538729906082153, + "objective/rlhf_reward": -1.9523236155509949, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.373785972595215, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4228515625, + "step": 833, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996939897537231 + }, + { + "episode": 20040, + "epoch": 0.04002348683658674, + "loss/policy_avg": 0.10057518631219864, + "lr": 2.7601610429447854e-06, + "objective/entropy": 105.46624755859375, + "objective/kl": 5.231590270996094, + "objective/non_score_reward": -0.26157957315444946, + "objective/rlhf_reward": 1.4305227845907211, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.0121121406555176, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.470703125, + "step": 834, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003401756286621 + }, + { + "episode": 20064, + "epoch": 0.04007141915615151, + "loss/policy_avg": 0.07102719694375992, + "lr": 2.759873466257669e-06, + "objective/entropy": 66.77411651611328, + "objective/kl": 6.063455581665039, + "objective/non_score_reward": -0.3031728267669678, + "objective/rlhf_reward": -1.8190369084477425, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.284269332885742, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.359375, + "step": 835, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000934362411499 + }, + { + "episode": 20088, + "epoch": 0.04011935147571629, + "loss/policy_avg": 0.07561428844928741, + "lr": 2.7595858895705523e-06, + "objective/entropy": 95.34703063964844, + "objective/kl": 5.601785659790039, + "objective/non_score_reward": -0.28008925914764404, + "objective/rlhf_reward": -1.6805355846881866, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.804218292236328, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 836, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970399141311646 + }, + { + "episode": 20112, + "epoch": 0.04016728379528106, + "loss/policy_avg": 0.011374767869710922, + "lr": 2.7592983128834357e-06, + "objective/entropy": 108.51011657714844, + "objective/kl": 2.9602508544921875, + "objective/non_score_reward": -0.14801256358623505, + "objective/rlhf_reward": 0.8463136061922394, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.061989784240723, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4765625, + "step": 837, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999849557876587 + }, + { + "episode": 20136, + "epoch": 0.04021521611484584, + "loss/policy_avg": 0.020030668005347252, + "lr": 2.759010736196319e-06, + "objective/entropy": 118.29524993896484, + "objective/kl": 3.315833568572998, + "objective/non_score_reward": -0.16579167544841766, + "objective/rlhf_reward": -0.9947500079870224, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.722015857696533, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.521484375, + "step": 838, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008695125579834 + }, + { + "episode": 20160, + "epoch": 0.04026314843441061, + "loss/policy_avg": 0.006477737799286842, + "lr": 2.7587231595092025e-06, + "objective/entropy": 86.98435974121094, + "objective/kl": 5.780436992645264, + "objective/non_score_reward": -0.2890218496322632, + "objective/rlhf_reward": 0.8499283698560688, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.5358500480651855, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.435546875, + "step": 839, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001558303833008 + }, + { + "episode": 20184, + "epoch": 0.04031108075397539, + "loss/policy_avg": 0.08458030223846436, + "lr": 2.758435582822086e-06, + "objective/entropy": 86.22892761230469, + "objective/kl": 7.268235206604004, + "objective/non_score_reward": -0.3634117543697357, + "objective/rlhf_reward": -2.1804704889655113, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1492838859558105, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.404296875, + "step": 840, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998035430908203 + }, + { + "episode": 20208, + "epoch": 0.04035901307354016, + "loss/policy_avg": 0.051969997584819794, + "lr": 2.7581480061349694e-06, + "objective/entropy": 98.99537658691406, + "objective/kl": 4.335845947265625, + "objective/non_score_reward": -0.21679231524467468, + "objective/rlhf_reward": -1.3007537573575974, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.330174446105957, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.439453125, + "step": 841, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0056657791137695 + }, + { + "episode": 20232, + "epoch": 0.04040694539310494, + "loss/policy_avg": 0.04716600477695465, + "lr": 2.757860429447853e-06, + "objective/entropy": 114.19577026367188, + "objective/kl": 7.542263031005859, + "objective/non_score_reward": -0.3771131932735443, + "objective/rlhf_reward": 0.3213803974153492, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 6.477425575256348, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.494140625, + "step": 842, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9956767559051514 + }, + { + "episode": 20256, + "epoch": 0.04045487771266971, + "loss/policy_avg": -0.016204383224248886, + "lr": 2.7575728527607362e-06, + "objective/entropy": 94.84222412109375, + "objective/kl": 2.759061574935913, + "objective/non_score_reward": -0.13795308768749237, + "objective/rlhf_reward": -0.8277185261249542, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9032620191574097, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.412109375, + "step": 843, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0042707920074463 + }, + { + "episode": 20280, + "epoch": 0.040502810032234486, + "loss/policy_avg": 1.4259287118911743, + "lr": 2.7572852760736197e-06, + "objective/entropy": 81.55256652832031, + "objective/kl": 3.4377763271331787, + "objective/non_score_reward": -0.1718888133764267, + "objective/rlhf_reward": 2.75424570450012, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.194827079772949, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3671875, + "step": 844, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006351470947266 + }, + { + "episode": 20304, + "epoch": 0.04055074235179926, + "loss/policy_avg": -0.005072722211480141, + "lr": 2.756997699386503e-06, + "objective/entropy": 75.37535095214844, + "objective/kl": 6.954894065856934, + "objective/non_score_reward": -0.3477446734905243, + "objective/rlhf_reward": -0.3520790532334961, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.225583076477051, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3408203125, + "step": 845, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016684532165527 + }, + { + "episode": 20328, + "epoch": 0.040598674671364035, + "loss/policy_avg": 0.0156900305300951, + "lr": 2.7567101226993865e-06, + "objective/entropy": 78.32573699951172, + "objective/kl": 4.871873378753662, + "objective/non_score_reward": -0.24359367787837982, + "objective/rlhf_reward": -1.4615620225667953, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8537607192993164, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.349609375, + "step": 846, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980969429016113 + }, + { + "episode": 20352, + "epoch": 0.040646606990928806, + "loss/policy_avg": -0.03346879407763481, + "lr": 2.75642254601227e-06, + "objective/entropy": 117.28727722167969, + "objective/kl": 4.212024688720703, + "objective/non_score_reward": -0.21060124039649963, + "objective/rlhf_reward": 0.47078157513297425, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.867406845092773, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.619140625, + "step": 847, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.003066062927246 + }, + { + "episode": 20376, + "epoch": 0.040694539310493584, + "loss/policy_avg": 0.07446737587451935, + "lr": 2.7561349693251534e-06, + "objective/entropy": 89.9272689819336, + "objective/kl": 4.069088935852051, + "objective/non_score_reward": -0.2034544199705124, + "objective/rlhf_reward": 1.1003902639795307, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.6569241285324097, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4267578125, + "step": 848, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005970001220703 + }, + { + "episode": 20400, + "epoch": 0.040742471630058355, + "loss/policy_avg": 0.08451761305332184, + "lr": 2.755847392638037e-06, + "objective/entropy": 117.22856903076172, + "objective/kl": 5.92006254196167, + "objective/non_score_reward": -0.296003133058548, + "objective/rlhf_reward": -1.7760187536478043, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.989324569702148, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 849, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9949537515640259 + }, + { + "episode": 20424, + "epoch": 0.04079040394962313, + "loss/policy_avg": 0.05610909312963486, + "lr": 2.7555598159509206e-06, + "objective/entropy": 139.45138549804688, + "objective/kl": 4.134796619415283, + "objective/non_score_reward": -0.20673982799053192, + "objective/rlhf_reward": -1.2404388841241598, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.306093692779541, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.578125, + "step": 850, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976682662963867 + }, + { + "episode": 20448, + "epoch": 0.040838336269187904, + "loss/policy_avg": 0.05914907902479172, + "lr": 2.755272239263804e-06, + "objective/entropy": 108.95722961425781, + "objective/kl": 4.627956390380859, + "objective/non_score_reward": -0.2313978374004364, + "objective/rlhf_reward": -1.3883868604898453, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.375555992126465, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.501953125, + "step": 851, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983952045440674 + }, + { + "episode": 20472, + "epoch": 0.04088626858875268, + "loss/policy_avg": 0.05221937224268913, + "lr": 2.7549846625766875e-06, + "objective/entropy": 109.08580017089844, + "objective/kl": 4.53959846496582, + "objective/non_score_reward": -0.22697994112968445, + "objective/rlhf_reward": 0.9592372487832073, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.036683559417725, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.466796875, + "step": 852, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986205101013184 + }, + { + "episode": 20496, + "epoch": 0.04093420090831745, + "loss/policy_avg": -0.005350551567971706, + "lr": 2.7546970858895705e-06, + "objective/entropy": 101.99287414550781, + "objective/kl": 3.24263072013855, + "objective/non_score_reward": -0.16213154792785645, + "objective/rlhf_reward": 0.7615997075930916, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.1098346710205078, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.48828125, + "step": 853, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001429557800293 + }, + { + "episode": 20520, + "epoch": 0.04098213322788223, + "loss/policy_avg": 0.03330043703317642, + "lr": 2.754409509202454e-06, + "objective/entropy": 120.2259750366211, + "objective/kl": 5.415060043334961, + "objective/non_score_reward": -0.2707529664039612, + "objective/rlhf_reward": -1.624517872929573, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.744842052459717, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.51953125, + "step": 854, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996635913848877 + }, + { + "episode": 20544, + "epoch": 0.041030065547447, + "loss/policy_avg": 0.04016149416565895, + "lr": 2.7541219325153373e-06, + "objective/entropy": 98.10752868652344, + "objective/kl": 5.063037395477295, + "objective/non_score_reward": -0.25315186381340027, + "objective/rlhf_reward": -1.5189112052321434, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6739840507507324, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.421875, + "step": 855, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004093647003174 + }, + { + "episode": 20568, + "epoch": 0.04107799786701178, + "loss/policy_avg": 0.017953116446733475, + "lr": 2.7538343558282208e-06, + "objective/entropy": 105.27764129638672, + "objective/kl": 4.261870384216309, + "objective/non_score_reward": -0.21309351921081543, + "objective/rlhf_reward": -1.2785610035061836, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.015137672424316, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4609375, + "step": 856, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977598190307617 + }, + { + "episode": 20592, + "epoch": 0.04112593018657655, + "loss/policy_avg": 0.03681865334510803, + "lr": 2.753546779141104e-06, + "objective/entropy": 104.25900268554688, + "objective/kl": 2.7987937927246094, + "objective/non_score_reward": -0.13993969559669495, + "objective/rlhf_reward": 1.4814787294317249, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.720607280731201, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.427734375, + "step": 857, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985966682434082 + }, + { + "episode": 20616, + "epoch": 0.04117386250614133, + "loss/policy_avg": 0.21850131452083588, + "lr": 2.7532592024539876e-06, + "objective/entropy": 104.10131072998047, + "objective/kl": 5.344145774841309, + "objective/non_score_reward": -0.2672073245048523, + "objective/rlhf_reward": 2.1823347979170493, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 9.175273895263672, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.431640625, + "step": 858, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996328353881836 + }, + { + "episode": 20640, + "epoch": 0.0412217948257061, + "loss/policy_avg": -0.0008241254836320877, + "lr": 2.752971625766871e-06, + "objective/entropy": 70.7784423828125, + "objective/kl": 5.718829154968262, + "objective/non_score_reward": -0.2859414517879486, + "objective/rlhf_reward": 0.284351386129856, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.1953158378601074, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3056640625, + "step": 859, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989721775054932 + }, + { + "episode": 20664, + "epoch": 0.04126972714527088, + "loss/policy_avg": 0.10311653465032578, + "lr": 2.7526840490797545e-06, + "objective/entropy": 88.22096252441406, + "objective/kl": 6.733785629272461, + "objective/non_score_reward": -0.336689293384552, + "objective/rlhf_reward": -0.02013576403260242, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.44441556930542, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.53125, + "step": 860, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9970378875732422 + }, + { + "episode": 20688, + "epoch": 0.04131765946483565, + "loss/policy_avg": 0.017344467341899872, + "lr": 2.7523964723926383e-06, + "objective/entropy": 121.20269775390625, + "objective/kl": 3.4900290966033936, + "objective/non_score_reward": -0.1745014488697052, + "objective/rlhf_reward": 0.7591713329197202, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.9581027030944824, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5703125, + "step": 861, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000880718231201 + }, + { + "episode": 20712, + "epoch": 0.04136559178440043, + "loss/policy_avg": 0.07152405381202698, + "lr": 2.7521088957055218e-06, + "objective/entropy": 105.45358276367188, + "objective/kl": 6.212447166442871, + "objective/non_score_reward": -0.3106223940849304, + "objective/rlhf_reward": 0.7203251552941295, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.697158098220825, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 862, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001934051513672 + }, + { + "episode": 20736, + "epoch": 0.0414135241039652, + "loss/policy_avg": 0.052584048360586166, + "lr": 2.751821319018405e-06, + "objective/entropy": 122.77789306640625, + "objective/kl": 5.065316200256348, + "objective/non_score_reward": -0.2532657980918884, + "objective/rlhf_reward": 2.265983613668125, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.879781723022461, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.5546875, + "step": 863, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995744228363037 + }, + { + "episode": 20760, + "epoch": 0.041461456423529976, + "loss/policy_avg": 0.5650435090065002, + "lr": 2.7515337423312886e-06, + "objective/entropy": 93.74668884277344, + "objective/kl": 4.2336931228637695, + "objective/non_score_reward": -0.21168462932109833, + "objective/rlhf_reward": -1.2701077312231064, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.202671051025391, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4345703125, + "step": 864, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002694129943848 + }, + { + "episode": 20784, + "epoch": 0.041509388743094754, + "loss/policy_avg": 0.07701190561056137, + "lr": 2.751246165644172e-06, + "objective/entropy": 99.4636459350586, + "objective/kl": 4.484285354614258, + "objective/non_score_reward": -0.22421424090862274, + "objective/rlhf_reward": -1.3452853858470917, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4721755981445312, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.45703125, + "step": 865, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998805522918701 + }, + { + "episode": 20808, + "epoch": 0.041557321062659525, + "loss/policy_avg": 0.06387520581483841, + "lr": 2.7509585889570555e-06, + "objective/entropy": 106.19366455078125, + "objective/kl": 5.599823951721191, + "objective/non_score_reward": -0.2799912095069885, + "objective/rlhf_reward": 0.32005295157432545, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.339226245880127, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4921875, + "step": 866, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978303909301758 + }, + { + "episode": 20832, + "epoch": 0.0416052533822243, + "loss/policy_avg": -0.023269008845090866, + "lr": 2.7506710122699385e-06, + "objective/entropy": 77.35414123535156, + "objective/kl": 6.168684959411621, + "objective/non_score_reward": -0.30843427777290344, + "objective/rlhf_reward": -1.8506055623292923, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.298649787902832, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3525390625, + "step": 867, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995925426483154 + }, + { + "episode": 20856, + "epoch": 0.041653185701789074, + "loss/policy_avg": 0.12372495234012604, + "lr": 2.750383435582822e-06, + "objective/entropy": 80.84477233886719, + "objective/kl": 7.008013725280762, + "objective/non_score_reward": -0.3504006862640381, + "objective/rlhf_reward": -2.102404199540615, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.15028190612793, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3779296875, + "step": 868, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966930150985718 + }, + { + "episode": 20880, + "epoch": 0.04170111802135385, + "loss/policy_avg": 0.05874063819646835, + "lr": 2.7500958588957053e-06, + "objective/entropy": 74.10958862304688, + "objective/kl": 7.9385881423950195, + "objective/non_score_reward": -0.3969293534755707, + "objective/rlhf_reward": 3.6184237897396088, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.260176181793213, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.326171875, + "step": 869, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994597434997559 + }, + { + "episode": 20904, + "epoch": 0.04174905034091862, + "loss/policy_avg": -0.0030971840023994446, + "lr": 2.7498082822085887e-06, + "objective/entropy": 94.70626831054688, + "objective/kl": 5.453081130981445, + "objective/non_score_reward": -0.27265408635139465, + "objective/rlhf_reward": 0.6851924370575908, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 5.65567684173584, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.388671875, + "step": 870, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986525774002075 + }, + { + "episode": 20928, + "epoch": 0.0417969826604834, + "loss/policy_avg": -0.040299758315086365, + "lr": 2.7495207055214726e-06, + "objective/entropy": 88.19462585449219, + "objective/kl": 5.567352294921875, + "objective/non_score_reward": -0.27836763858795166, + "objective/rlhf_reward": -1.6702057048678398, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.536426067352295, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.388671875, + "step": 871, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0045156478881836 + }, + { + "episode": 20952, + "epoch": 0.04184491498004817, + "loss/policy_avg": 0.05515659600496292, + "lr": 2.749233128834356e-06, + "objective/entropy": 98.49591064453125, + "objective/kl": 4.450832366943359, + "objective/non_score_reward": -0.22254163026809692, + "objective/rlhf_reward": -1.3352497704327106, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.523943901062012, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4755859375, + "step": 872, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9960885047912598 + }, + { + "episode": 20976, + "epoch": 0.04189284729961295, + "loss/policy_avg": 0.04976364225149155, + "lr": 2.7489455521472394e-06, + "objective/entropy": 77.69338989257812, + "objective/kl": 2.7088420391082764, + "objective/non_score_reward": -0.1354421228170395, + "objective/rlhf_reward": -0.8126526921987534, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.019434690475464, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3486328125, + "step": 873, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997022151947021 + }, + { + "episode": 21000, + "epoch": 0.04194077961917772, + "loss/policy_avg": 0.17887847125530243, + "lr": 2.748657975460123e-06, + "objective/entropy": 129.4071807861328, + "objective/kl": 5.023492813110352, + "objective/non_score_reward": -0.251174658536911, + "objective/rlhf_reward": 1.0770114866258593, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.061830520629883, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.595703125, + "step": 874, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999417781829834 + }, + { + "episode": 21024, + "epoch": 0.0419887119387425, + "loss/policy_avg": 0.05642791837453842, + "lr": 2.7483703987730063e-06, + "objective/entropy": 111.92089080810547, + "objective/kl": 4.186690330505371, + "objective/non_score_reward": -0.20933453738689423, + "objective/rlhf_reward": 0.5501728707344565, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.5122809410095215, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.513671875, + "step": 875, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013322830200195 + }, + { + "episode": 21048, + "epoch": 0.04203664425830727, + "loss/policy_avg": -0.013104887679219246, + "lr": 2.7480828220858897e-06, + "objective/entropy": 80.44026184082031, + "objective/kl": 5.6122541427612305, + "objective/non_score_reward": -0.2806127071380615, + "objective/rlhf_reward": 0.12250370135319566, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.519102096557617, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.3779296875, + "step": 876, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984514713287354 + }, + { + "episode": 21072, + "epoch": 0.04208457657787205, + "loss/policy_avg": 0.1458892971277237, + "lr": 2.747795245398773e-06, + "objective/entropy": 79.9604721069336, + "objective/kl": 8.001222610473633, + "objective/non_score_reward": -0.40006113052368164, + "objective/rlhf_reward": 0.5996332615613937, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.283113479614258, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3681640625, + "step": 877, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9959279298782349 + }, + { + "episode": 21096, + "epoch": 0.04213250889743682, + "loss/policy_avg": 0.1440894454717636, + "lr": 2.7475076687116566e-06, + "objective/entropy": 122.78379821777344, + "objective/kl": 4.5403642654418945, + "objective/non_score_reward": -0.22701823711395264, + "objective/rlhf_reward": 1.2219499406578036, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.3491005897521973, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.53125, + "step": 878, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0026185512542725 + }, + { + "episode": 21120, + "epoch": 0.042180441217001596, + "loss/policy_avg": 0.16970399022102356, + "lr": 2.74722009202454e-06, + "objective/entropy": 90.9150619506836, + "objective/kl": 4.270197868347168, + "objective/non_score_reward": -0.21350988745689392, + "objective/rlhf_reward": -1.2810591831803322, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6999521255493164, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 879, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988821744918823 + }, + { + "episode": 21144, + "epoch": 0.04222837353656637, + "loss/policy_avg": 0.07936170697212219, + "lr": 2.7469325153374234e-06, + "objective/entropy": 73.20059204101562, + "objective/kl": 4.407593727111816, + "objective/non_score_reward": -0.22037969529628754, + "objective/rlhf_reward": 4.677721846848726, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.7976770401000977, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3525390625, + "step": 880, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000945806503296 + }, + { + "episode": 21168, + "epoch": 0.042276305856131145, + "loss/policy_avg": 0.02518809586763382, + "lr": 2.746644938650307e-06, + "objective/entropy": 118.84789276123047, + "objective/kl": 4.3895463943481445, + "objective/non_score_reward": -0.21947729587554932, + "objective/rlhf_reward": -1.3168637156486511, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.302042484283447, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 881, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982171058654785 + }, + { + "episode": 21192, + "epoch": 0.042324238175695916, + "loss/policy_avg": 0.04045046493411064, + "lr": 2.7463573619631903e-06, + "objective/entropy": 86.51045227050781, + "objective/kl": 5.505293846130371, + "objective/non_score_reward": -0.2752646803855896, + "objective/rlhf_reward": 0.154591846966866, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.1698360443115234, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4619140625, + "step": 882, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002819538116455 + }, + { + "episode": 21216, + "epoch": 0.042372170495260694, + "loss/policy_avg": 0.06933768838644028, + "lr": 2.7460697852760737e-06, + "objective/entropy": 131.65875244140625, + "objective/kl": 3.0303447246551514, + "objective/non_score_reward": -0.15151724219322205, + "objective/rlhf_reward": -0.9091034270823002, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.208834648132324, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.658203125, + "step": 883, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0010170936584473 + }, + { + "episode": 21240, + "epoch": 0.042420102814825465, + "loss/policy_avg": -0.02139214798808098, + "lr": 2.745782208588957e-06, + "objective/entropy": 115.83447265625, + "objective/kl": 3.30128812789917, + "objective/non_score_reward": -0.16506439447402954, + "objective/rlhf_reward": -0.9903864078223705, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.816974639892578, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.47265625, + "step": 884, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015525817871094 + }, + { + "episode": 21264, + "epoch": 0.04246803513439024, + "loss/policy_avg": 0.03548167645931244, + "lr": 2.7454946319018406e-06, + "objective/entropy": 109.49153137207031, + "objective/kl": 8.256359100341797, + "objective/non_score_reward": -0.41281795501708984, + "objective/rlhf_reward": -2.476907655596733, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.971353530883789, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.455078125, + "step": 885, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970200061798096 + }, + { + "episode": 21288, + "epoch": 0.042515967453955014, + "loss/policy_avg": 0.06995643675327301, + "lr": 2.745207055214724e-06, + "objective/entropy": 112.09738159179688, + "objective/kl": 2.380129337310791, + "objective/non_score_reward": -0.11900646984577179, + "objective/rlhf_reward": 1.0203501434893094, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.7031198740005493, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4873046875, + "step": 886, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0048887729644775 + }, + { + "episode": 21312, + "epoch": 0.04256389977351979, + "loss/policy_avg": 0.06821275502443314, + "lr": 2.7449194785276074e-06, + "objective/entropy": 79.43799591064453, + "objective/kl": 4.8454084396362305, + "objective/non_score_reward": -0.24227041006088257, + "objective/rlhf_reward": 0.8674944649983409, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.6262102127075195, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4501953125, + "step": 887, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0008058547973633 + }, + { + "episode": 21336, + "epoch": 0.04261183209308456, + "loss/policy_avg": 0.022584067657589912, + "lr": 2.744631901840491e-06, + "objective/entropy": 114.75713348388672, + "objective/kl": 6.00483512878418, + "objective/non_score_reward": -0.30024176836013794, + "objective/rlhf_reward": -1.8014506474137306, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.433263778686523, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 888, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991891384124756 + }, + { + "episode": 21360, + "epoch": 0.04265976441264934, + "loss/policy_avg": 0.03385358676314354, + "lr": 2.7443443251533743e-06, + "objective/entropy": 102.15078735351562, + "objective/kl": 6.3586344718933105, + "objective/non_score_reward": -0.31793174147605896, + "objective/rlhf_reward": 1.8779881470781974, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.2060935497283936, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46875, + "step": 889, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998732089996338 + }, + { + "episode": 21384, + "epoch": 0.04270769673221411, + "loss/policy_avg": 0.14809367060661316, + "lr": 2.7440567484662577e-06, + "objective/entropy": 113.23397064208984, + "objective/kl": 6.922299861907959, + "objective/non_score_reward": -0.34611496329307556, + "objective/rlhf_reward": -2.0766897797584534, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.452608108520508, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.49609375, + "step": 890, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000066041946411 + }, + { + "episode": 21408, + "epoch": 0.04275562905177889, + "loss/policy_avg": 0.016204480081796646, + "lr": 2.743769171779141e-06, + "objective/entropy": 98.05013275146484, + "objective/kl": 6.41542911529541, + "objective/non_score_reward": -0.3207714557647705, + "objective/rlhf_reward": -1.924628660082817, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.541947364807129, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.52734375, + "step": 891, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975664615631104 + }, + { + "episode": 21432, + "epoch": 0.04280356137134366, + "loss/policy_avg": -0.01099843718111515, + "lr": 2.7434815950920245e-06, + "objective/entropy": 95.92872619628906, + "objective/kl": 4.786355018615723, + "objective/non_score_reward": -0.23931774497032166, + "objective/rlhf_reward": -1.435906432569027, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.46561861038208, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4189453125, + "step": 892, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002164363861084 + }, + { + "episode": 21456, + "epoch": 0.04285149369090844, + "loss/policy_avg": 0.06194370612502098, + "lr": 2.743194018404908e-06, + "objective/entropy": 104.93562316894531, + "objective/kl": 4.976661682128906, + "objective/non_score_reward": -0.2488330900669098, + "objective/rlhf_reward": 4.507001504302025, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.0145277976989746, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4609375, + "step": 893, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994291067123413 + }, + { + "episode": 21480, + "epoch": 0.04289942601047321, + "loss/policy_avg": 0.08915743976831436, + "lr": 2.7429064417177914e-06, + "objective/entropy": 75.71527099609375, + "objective/kl": 0.4236017167568207, + "objective/non_score_reward": -0.02118009328842163, + "objective/rlhf_reward": 1.6790994179786476, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.3862743377685547, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3603515625, + "step": 894, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.005826473236084 + }, + { + "episode": 21504, + "epoch": 0.04294735833003799, + "loss/policy_avg": 0.008110836148262024, + "lr": 2.7426188650306752e-06, + "objective/entropy": 106.16041564941406, + "objective/kl": 5.740138053894043, + "objective/non_score_reward": -0.2870068848133087, + "objective/rlhf_reward": 0.4152018212188614, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 7.944973945617676, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4765625, + "step": 895, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9959440231323242 + }, + { + "episode": 21528, + "epoch": 0.04299529064960276, + "loss/policy_avg": 0.22013719379901886, + "lr": 2.7423312883435587e-06, + "objective/entropy": 107.139892578125, + "objective/kl": 2.643519878387451, + "objective/non_score_reward": -0.132176011800766, + "objective/rlhf_reward": -0.7930559664964676, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8977525234222412, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4912109375, + "step": 896, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009241104125977 + }, + { + "episode": 21552, + "epoch": 0.04304322296916754, + "loss/policy_avg": 0.03527402877807617, + "lr": 2.7420437116564417e-06, + "objective/entropy": 142.33316040039062, + "objective/kl": 6.6685791015625, + "objective/non_score_reward": -0.3334289491176605, + "objective/rlhf_reward": -2.000573769211769, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.058274269104004, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.59765625, + "step": 897, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964004755020142 + }, + { + "episode": 21576, + "epoch": 0.04309115528873231, + "loss/policy_avg": 0.10246305912733078, + "lr": 2.741756134969325e-06, + "objective/entropy": 114.56505584716797, + "objective/kl": 3.857632637023926, + "objective/non_score_reward": -0.19288164377212524, + "objective/rlhf_reward": 0.7354994427851047, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.2964859008789062, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4931640625, + "step": 898, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0021910667419434 + }, + { + "episode": 21600, + "epoch": 0.043139087608297086, + "loss/policy_avg": 0.011130789294838905, + "lr": 2.7414685582822085e-06, + "objective/entropy": 90.21443939208984, + "objective/kl": 0.9975759983062744, + "objective/non_score_reward": -0.049878813326358795, + "objective/rlhf_reward": -0.29927286226302385, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.20232892036438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4228515625, + "step": 899, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0035924911499023 + }, + { + "episode": 21624, + "epoch": 0.04318701992786186, + "loss/policy_avg": 0.039997391402721405, + "lr": 2.741180981595092e-06, + "objective/entropy": 86.077392578125, + "objective/kl": 6.2177557945251465, + "objective/non_score_reward": -0.31088781356811523, + "objective/rlhf_reward": 0.27191631574524766, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.069952964782715, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4130859375, + "step": 900, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984291791915894 + }, + { + "episode": 21648, + "epoch": 0.043234952247426635, + "loss/policy_avg": 0.04747004061937332, + "lr": 2.7408934049079754e-06, + "objective/entropy": 93.36293029785156, + "objective/kl": 6.745440483093262, + "objective/non_score_reward": -0.3372720181941986, + "objective/rlhf_reward": -2.0236320942640305, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.067727565765381, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.404296875, + "step": 901, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021262168884277 + }, + { + "episode": 21672, + "epoch": 0.043282884566991406, + "loss/policy_avg": 0.062298938632011414, + "lr": 2.740605828220859e-06, + "objective/entropy": 158.00497436523438, + "objective/kl": 2.6463863849639893, + "objective/non_score_reward": -0.13231933116912842, + "objective/rlhf_reward": -0.7939159050583839, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.788686990737915, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.642578125, + "step": 902, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008063316345215 + }, + { + "episode": 21696, + "epoch": 0.043330816886556184, + "loss/policy_avg": 0.07696983218193054, + "lr": 2.7403182515337422e-06, + "objective/entropy": 94.35783386230469, + "objective/kl": 4.610112190246582, + "objective/non_score_reward": -0.23050561547279358, + "objective/rlhf_reward": 0.3513553526148878, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.7395734786987305, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4228515625, + "step": 903, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973050355911255 + }, + { + "episode": 21720, + "epoch": 0.043378749206120955, + "loss/policy_avg": 0.037807952612638474, + "lr": 2.7400306748466256e-06, + "objective/entropy": 119.59163665771484, + "objective/kl": 3.666079521179199, + "objective/non_score_reward": -0.18330398201942444, + "objective/rlhf_reward": 0.7063561451972756, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.3346781730651855, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.52734375, + "step": 904, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009615421295166 + }, + { + "episode": 21744, + "epoch": 0.04342668152568573, + "loss/policy_avg": 0.09552240371704102, + "lr": 2.7397430981595095e-06, + "objective/entropy": 96.55064392089844, + "objective/kl": 5.937407493591309, + "objective/non_score_reward": -0.2968703508377075, + "objective/rlhf_reward": -1.781222090125084, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.493988513946533, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.443359375, + "step": 905, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983994960784912 + }, + { + "episode": 21768, + "epoch": 0.043474613845250504, + "loss/policy_avg": 0.08816215395927429, + "lr": 2.739455521472393e-06, + "objective/entropy": 74.68084716796875, + "objective/kl": 4.026675224304199, + "objective/non_score_reward": -0.20133376121520996, + "objective/rlhf_reward": -1.2080025374889374, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1329426765441895, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.416015625, + "step": 906, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007076263427734 + }, + { + "episode": 21792, + "epoch": 0.04352254616481528, + "loss/policy_avg": 0.048463840037584305, + "lr": 2.7391679447852763e-06, + "objective/entropy": 94.76558685302734, + "objective/kl": 3.8665177822113037, + "objective/non_score_reward": -0.19332587718963623, + "objective/rlhf_reward": 1.4241039958955737, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.018582344055176, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4033203125, + "step": 907, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999642372131348 + }, + { + "episode": 21816, + "epoch": 0.04357047848438005, + "loss/policy_avg": 0.046391744166612625, + "lr": 2.7388803680981598e-06, + "objective/entropy": 95.7754135131836, + "objective/kl": 3.305962085723877, + "objective/non_score_reward": -0.16529810428619385, + "objective/rlhf_reward": -0.9917886070907116, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.290882110595703, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4169921875, + "step": 908, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002667903900146 + }, + { + "episode": 21840, + "epoch": 0.04361841080394483, + "loss/policy_avg": 0.022068558260798454, + "lr": 2.738592791411043e-06, + "objective/entropy": 93.92354583740234, + "objective/kl": 7.804844856262207, + "objective/non_score_reward": -0.39024221897125244, + "objective/rlhf_reward": -2.341453403234482, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.566788196563721, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4130859375, + "step": 909, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977188110351562 + }, + { + "episode": 21864, + "epoch": 0.0436663431235096, + "loss/policy_avg": 0.019387660548090935, + "lr": 2.7383052147239266e-06, + "objective/entropy": 97.21346282958984, + "objective/kl": 3.7127041816711426, + "objective/non_score_reward": -0.18563520908355713, + "objective/rlhf_reward": -1.1138112470507622, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1848278045654297, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.513671875, + "step": 910, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000063896179199 + }, + { + "episode": 21888, + "epoch": 0.04371427544307438, + "loss/policy_avg": 0.04289707541465759, + "lr": 2.73801763803681e-06, + "objective/entropy": 122.38936614990234, + "objective/kl": 2.772216320037842, + "objective/non_score_reward": -0.13861083984375, + "objective/rlhf_reward": 0.902724015702375, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.894400119781494, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.6015625, + "step": 911, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984886646270752 + }, + { + "episode": 21912, + "epoch": 0.04376220776263915, + "loss/policy_avg": 0.09565110504627228, + "lr": 2.737730061349693e-06, + "objective/entropy": 74.01140594482422, + "objective/kl": 5.779548645019531, + "objective/non_score_reward": -0.2889774441719055, + "objective/rlhf_reward": -1.7338645905256271, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.471281051635742, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3662109375, + "step": 912, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9966537952423096 + }, + { + "episode": 21936, + "epoch": 0.04381014008220393, + "loss/policy_avg": 0.05133858323097229, + "lr": 2.7374424846625765e-06, + "objective/entropy": 104.79302978515625, + "objective/kl": 6.972027778625488, + "objective/non_score_reward": -0.34860140085220337, + "objective/rlhf_reward": -2.09160840511322, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7017860412597656, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4609375, + "step": 913, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974641799926758 + }, + { + "episode": 21960, + "epoch": 0.0438580724017687, + "loss/policy_avg": 0.16980934143066406, + "lr": 2.73715490797546e-06, + "objective/entropy": 115.5631103515625, + "objective/kl": 7.108046531677246, + "objective/non_score_reward": -0.3554023206233978, + "objective/rlhf_reward": -2.132413923740387, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.97141432762146, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5234375, + "step": 914, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015578269958496 + }, + { + "episode": 21984, + "epoch": 0.04390600472133348, + "loss/policy_avg": 0.025971466675400734, + "lr": 2.7368673312883438e-06, + "objective/entropy": 85.31682586669922, + "objective/kl": 3.013920545578003, + "objective/non_score_reward": -0.1506960093975067, + "objective/rlhf_reward": 1.0958239361643791, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.7238693237304688, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3876953125, + "step": 915, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0028491020202637 + }, + { + "episode": 22008, + "epoch": 0.04395393704089825, + "loss/policy_avg": 0.11167575418949127, + "lr": 2.736579754601227e-06, + "objective/entropy": 104.54998016357422, + "objective/kl": 5.454062461853027, + "objective/non_score_reward": -0.2727031409740448, + "objective/rlhf_reward": 0.6848981316734317, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.703617811203003, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.470703125, + "step": 916, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007400512695312 + }, + { + "episode": 22032, + "epoch": 0.044001869360463026, + "loss/policy_avg": 0.02664884738624096, + "lr": 2.7362921779141106e-06, + "objective/entropy": 127.33946228027344, + "objective/kl": 5.278966426849365, + "objective/non_score_reward": -0.26394835114479065, + "objective/rlhf_reward": -1.5836900174617767, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4439291954040527, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.587890625, + "step": 917, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999948263168335 + }, + { + "episode": 22056, + "epoch": 0.0440498016800278, + "loss/policy_avg": 0.0019025951623916626, + "lr": 2.736004601226994e-06, + "objective/entropy": 85.67864990234375, + "objective/kl": 6.074202537536621, + "objective/non_score_reward": -0.30371010303497314, + "objective/rlhf_reward": -0.01608061442362929, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.1197152137756348, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4140625, + "step": 918, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000668525695801 + }, + { + "episode": 22080, + "epoch": 0.044097733999592575, + "loss/policy_avg": 0.060195442289114, + "lr": 2.7357170245398775e-06, + "objective/entropy": 87.2577133178711, + "objective/kl": 5.802783012390137, + "objective/non_score_reward": -0.2901391386985779, + "objective/rlhf_reward": -1.7408347874879837, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5291924476623535, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.400390625, + "step": 919, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000680685043335 + }, + { + "episode": 22104, + "epoch": 0.044145666319157346, + "loss/policy_avg": 0.018363337963819504, + "lr": 2.735429447852761e-06, + "objective/entropy": 108.52610778808594, + "objective/kl": 6.441878318786621, + "objective/non_score_reward": -0.3220939338207245, + "objective/rlhf_reward": -1.9325635135173798, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.0373382568359375, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.50390625, + "step": 920, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976403713226318 + }, + { + "episode": 22128, + "epoch": 0.044193598638722124, + "loss/policy_avg": -0.016048770397901535, + "lr": 2.7351418711656443e-06, + "objective/entropy": 92.68255615234375, + "objective/kl": 5.644326686859131, + "objective/non_score_reward": -0.28221631050109863, + "objective/rlhf_reward": 0.11288205882323121, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.740767240524292, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.384765625, + "step": 921, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001626014709473 + }, + { + "episode": 22152, + "epoch": 0.0442415309582869, + "loss/policy_avg": 0.32801878452301025, + "lr": 2.7348542944785277e-06, + "objective/entropy": 92.73130798339844, + "objective/kl": 2.943939208984375, + "objective/non_score_reward": -0.1471969485282898, + "objective/rlhf_reward": 1.7008776088418456, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.7797112464904785, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.443359375, + "step": 922, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970462322235107 + }, + { + "episode": 22176, + "epoch": 0.04428946327785167, + "loss/policy_avg": -0.0032882727682590485, + "lr": 2.734566717791411e-06, + "objective/entropy": 97.3709716796875, + "objective/kl": 6.1697916984558105, + "objective/non_score_reward": -0.3084896206855774, + "objective/rlhf_reward": -1.8509375974535942, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.395082473754883, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.466796875, + "step": 923, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997704267501831 + }, + { + "episode": 22200, + "epoch": 0.04433739559741645, + "loss/policy_avg": 0.005330020561814308, + "lr": 2.7342791411042946e-06, + "objective/entropy": 125.12557220458984, + "objective/kl": 2.310811996459961, + "objective/non_score_reward": -0.11554059386253357, + "objective/rlhf_reward": 3.0923349917811565, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.0219082832336426, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.564453125, + "step": 924, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000429630279541 + }, + { + "episode": 22224, + "epoch": 0.04438532791698122, + "loss/policy_avg": 0.0634724497795105, + "lr": 2.733991564417178e-06, + "objective/entropy": 95.837646484375, + "objective/kl": 5.895445823669434, + "objective/non_score_reward": -0.29477232694625854, + "objective/rlhf_reward": -1.7686337158083916, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.011226654052734, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.419921875, + "step": 925, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980305433273315 + }, + { + "episode": 22248, + "epoch": 0.044433260236546, + "loss/policy_avg": -0.010175063274800777, + "lr": 2.7337039877300614e-06, + "objective/entropy": 112.55145263671875, + "objective/kl": 2.3379321098327637, + "objective/non_score_reward": -0.1168965995311737, + "objective/rlhf_reward": 1.1914096821537818, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.4074738025665283, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.51953125, + "step": 926, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998610019683838 + }, + { + "episode": 22272, + "epoch": 0.04448119255611077, + "loss/policy_avg": 0.04643001779913902, + "lr": 2.733416411042945e-06, + "objective/entropy": 104.46885681152344, + "objective/kl": 6.487161159515381, + "objective/non_score_reward": -0.3243580460548401, + "objective/rlhf_reward": -1.9461482912302017, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4030284881591797, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4375, + "step": 927, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0009424686431885 + }, + { + "episode": 22296, + "epoch": 0.04452912487567555, + "loss/policy_avg": 0.11505647003650665, + "lr": 2.7331288343558283e-06, + "objective/entropy": 104.02301025390625, + "objective/kl": 7.606575965881348, + "objective/non_score_reward": -0.38032883405685425, + "objective/rlhf_reward": 3.718027174472809, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.598485946655273, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4638671875, + "step": 928, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995492696762085 + }, + { + "episode": 22320, + "epoch": 0.04457705719524032, + "loss/policy_avg": -0.00015689246356487274, + "lr": 2.7328412576687117e-06, + "objective/entropy": 96.71681213378906, + "objective/kl": 4.243178844451904, + "objective/non_score_reward": -0.2121589481830597, + "objective/rlhf_reward": 0.4614353768423878, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.3629658222198486, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.44140625, + "step": 929, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002902030944824 + }, + { + "episode": 22344, + "epoch": 0.0446249895148051, + "loss/policy_avg": 0.029812797904014587, + "lr": 2.732553680981595e-06, + "objective/entropy": 116.431884765625, + "objective/kl": 3.643251419067383, + "objective/non_score_reward": -0.1821625679731369, + "objective/rlhf_reward": -1.092975340783596, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8700733184814453, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.51953125, + "step": 930, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000304937362671 + }, + { + "episode": 22368, + "epoch": 0.04467292183436987, + "loss/policy_avg": 0.040443792939186096, + "lr": 2.7322661042944786e-06, + "objective/entropy": 88.82817077636719, + "objective/kl": 4.018810272216797, + "objective/non_score_reward": -0.20094048976898193, + "objective/rlhf_reward": 0.7943570837378501, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.0998260974884033, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.38671875, + "step": 931, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024185180664062 + }, + { + "episode": 22392, + "epoch": 0.04472085415393465, + "loss/policy_avg": 0.14331990480422974, + "lr": 2.731978527607362e-06, + "objective/entropy": 107.67217254638672, + "objective/kl": 6.511950492858887, + "objective/non_score_reward": -0.32559752464294434, + "objective/rlhf_reward": -1.9535851925611496, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.49924373626709, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4609375, + "step": 932, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014355182647705 + }, + { + "episode": 22416, + "epoch": 0.04476878647349942, + "loss/policy_avg": 0.01659548096358776, + "lr": 2.7316909509202454e-06, + "objective/entropy": 74.92904663085938, + "objective/kl": 2.761991500854492, + "objective/non_score_reward": -0.1380995810031891, + "objective/rlhf_reward": -0.8285974636673927, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5036778450012207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3603515625, + "step": 933, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002938747406006 + }, + { + "episode": 22440, + "epoch": 0.044816718793064196, + "loss/policy_avg": 0.10943184792995453, + "lr": 2.731403374233129e-06, + "objective/entropy": 97.6327133178711, + "objective/kl": 5.039051055908203, + "objective/non_score_reward": -0.25195252895355225, + "objective/rlhf_reward": -1.5117152519524097, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.2282891273498535, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.458984375, + "step": 934, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999511957168579 + }, + { + "episode": 22464, + "epoch": 0.04486465111262897, + "loss/policy_avg": 0.004865717142820358, + "lr": 2.7311157975460123e-06, + "objective/entropy": 110.07501220703125, + "objective/kl": 3.7247045040130615, + "objective/non_score_reward": -0.1862352341413498, + "objective/rlhf_reward": 0.6887686212898526, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.7141635417938232, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48046875, + "step": 935, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999855399131775 + }, + { + "episode": 22488, + "epoch": 0.044912583432193745, + "loss/policy_avg": 0.008874917402863503, + "lr": 2.7308282208588957e-06, + "objective/entropy": 80.63064575195312, + "objective/kl": 5.575377464294434, + "objective/non_score_reward": -0.2787688374519348, + "objective/rlhf_reward": 0.3273869901895522, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.270630836486816, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3671875, + "step": 936, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991203546524048 + }, + { + "episode": 22512, + "epoch": 0.044960515751758516, + "loss/policy_avg": 0.14564327895641327, + "lr": 2.730540644171779e-06, + "objective/entropy": 88.33494567871094, + "objective/kl": 5.296100616455078, + "objective/non_score_reward": -0.26480501890182495, + "objective/rlhf_reward": 4.411169871687889, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.8610594272613525, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4150390625, + "step": 937, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998091459274292 + }, + { + "episode": 22536, + "epoch": 0.045008448071323294, + "loss/policy_avg": 0.08021347224712372, + "lr": 2.7302530674846626e-06, + "objective/entropy": 104.26788330078125, + "objective/kl": 5.003875255584717, + "objective/non_score_reward": -0.2501937747001648, + "objective/rlhf_reward": 0.4988374449312686, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.324687957763672, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4541015625, + "step": 938, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987668991088867 + }, + { + "episode": 22560, + "epoch": 0.045056380390888065, + "loss/policy_avg": 0.0057596853002905846, + "lr": 2.7299654907975464e-06, + "objective/entropy": 113.08381652832031, + "objective/kl": 6.2460432052612305, + "objective/non_score_reward": -0.3123021721839905, + "objective/rlhf_reward": 0.7102463749410602, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.294472694396973, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5009765625, + "step": 939, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001492977142334 + }, + { + "episode": 22584, + "epoch": 0.04510431271045284, + "loss/policy_avg": 0.036595072597265244, + "lr": 2.72967791411043e-06, + "objective/entropy": 96.92338562011719, + "objective/kl": 3.3484859466552734, + "objective/non_score_reward": -0.1674242913722992, + "objective/rlhf_reward": -1.0045457631349564, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.087923526763916, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.431640625, + "step": 940, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999979734420776 + }, + { + "episode": 22608, + "epoch": 0.045152245030017614, + "loss/policy_avg": 0.012802268378436565, + "lr": 2.729390337423313e-06, + "objective/entropy": 97.29847717285156, + "objective/kl": 4.223211288452148, + "objective/non_score_reward": -0.2111605852842331, + "objective/rlhf_reward": 0.7330365926027297, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.7140288352966309, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46875, + "step": 941, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028557777404785 + }, + { + "episode": 22632, + "epoch": 0.04520017734958239, + "loss/policy_avg": 0.05701259523630142, + "lr": 2.7291027607361963e-06, + "objective/entropy": 100.47308349609375, + "objective/kl": 5.439868927001953, + "objective/non_score_reward": -0.2719934582710266, + "objective/rlhf_reward": -1.6319605261087418, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.008981704711914, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4541015625, + "step": 942, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992172718048096 + }, + { + "episode": 22656, + "epoch": 0.04524810966914716, + "loss/policy_avg": 0.10332086682319641, + "lr": 2.7288151840490797e-06, + "objective/entropy": 119.48175048828125, + "objective/kl": 4.848974704742432, + "objective/non_score_reward": -0.24244873225688934, + "objective/rlhf_reward": -1.4546923860907555, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4161722660064697, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5078125, + "step": 943, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001235008239746 + }, + { + "episode": 22680, + "epoch": 0.04529604198871194, + "loss/policy_avg": 0.010710829868912697, + "lr": 2.728527607361963e-06, + "objective/entropy": 102.04789733886719, + "objective/kl": 9.006404876708984, + "objective/non_score_reward": -0.45032021403312683, + "objective/rlhf_reward": -0.8957413549183572, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.53708028793335, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.455078125, + "step": 944, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997517466545105 + }, + { + "episode": 22704, + "epoch": 0.04534397430827671, + "loss/policy_avg": 0.02535150945186615, + "lr": 2.7282400306748465e-06, + "objective/entropy": 96.99497985839844, + "objective/kl": 4.857267379760742, + "objective/non_score_reward": -0.24286334216594696, + "objective/rlhf_reward": -1.4571800827980042, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.163844585418701, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4697265625, + "step": 945, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978132247924805 + }, + { + "episode": 22728, + "epoch": 0.04539190662784149, + "loss/policy_avg": 0.07130890339612961, + "lr": 2.72795245398773e-06, + "objective/entropy": 111.84847259521484, + "objective/kl": 5.694476127624512, + "objective/non_score_reward": -0.28472381830215454, + "objective/rlhf_reward": 0.2916572242975234, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.7781383991241455, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.50390625, + "step": 946, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99855375289917 + }, + { + "episode": 22752, + "epoch": 0.04543983894740626, + "loss/policy_avg": 0.10589248687028885, + "lr": 2.7276648773006134e-06, + "objective/entropy": 107.23970794677734, + "objective/kl": 5.4178571701049805, + "objective/non_score_reward": -0.270892858505249, + "objective/rlhf_reward": -1.6253572013229132, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.750500202178955, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.498046875, + "step": 947, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.005340576171875 + }, + { + "episode": 22776, + "epoch": 0.04548777126697104, + "loss/policy_avg": 0.03559113293886185, + "lr": 2.727377300613497e-06, + "objective/entropy": 83.23880004882812, + "objective/kl": 6.056788444519043, + "objective/non_score_reward": -0.3028394281864166, + "objective/rlhf_reward": -1.8170365206897259, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8807108402252197, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.34375, + "step": 948, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978594779968262 + }, + { + "episode": 22800, + "epoch": 0.04553570358653581, + "loss/policy_avg": 0.5360040664672852, + "lr": 2.7270897239263807e-06, + "objective/entropy": 89.0067138671875, + "objective/kl": 5.365682125091553, + "objective/non_score_reward": -0.2682841420173645, + "objective/rlhf_reward": 4.390295296907425, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.443664073944092, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.419921875, + "step": 949, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016674995422363 + }, + { + "episode": 22824, + "epoch": 0.04558363590610059, + "loss/policy_avg": 0.006036663427948952, + "lr": 2.726802147239264e-06, + "objective/entropy": 108.69181823730469, + "objective/kl": 4.120975494384766, + "objective/non_score_reward": -0.20604878664016724, + "objective/rlhf_reward": 0.49809621943987237, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.055539846420288, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.44921875, + "step": 950, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999030590057373 + }, + { + "episode": 22848, + "epoch": 0.04563156822566536, + "loss/policy_avg": 0.008449395187199116, + "lr": 2.7265145705521475e-06, + "objective/entropy": 70.66554260253906, + "objective/kl": 5.04306697845459, + "objective/non_score_reward": -0.25215333700180054, + "objective/rlhf_reward": 0.29325996687424516, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.7413932085037231, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3515625, + "step": 951, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99995756149292 + }, + { + "episode": 22872, + "epoch": 0.04567950054523014, + "loss/policy_avg": 0.0286868903785944, + "lr": 2.726226993865031e-06, + "objective/entropy": 72.30919647216797, + "objective/kl": 6.184192657470703, + "objective/non_score_reward": -0.3092096745967865, + "objective/rlhf_reward": -1.8552580028772354, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.661703109741211, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.357421875, + "step": 952, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984246492385864 + }, + { + "episode": 22896, + "epoch": 0.04572743286479491, + "loss/policy_avg": 0.04054219648241997, + "lr": 2.7259394171779144e-06, + "objective/entropy": 82.32579040527344, + "objective/kl": 7.229793071746826, + "objective/non_score_reward": -0.3614896237850189, + "objective/rlhf_reward": 0.15217910069713625, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.4775242805480957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3720703125, + "step": 953, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012311935424805 + }, + { + "episode": 22920, + "epoch": 0.045775365184359686, + "loss/policy_avg": 0.07581532001495361, + "lr": 2.725651840490798e-06, + "objective/entropy": 80.49742889404297, + "objective/kl": 4.760402202606201, + "objective/non_score_reward": -0.23802006244659424, + "objective/rlhf_reward": -1.4281204994767904, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7996206283569336, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.44921875, + "step": 954, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9978868961334229 + }, + { + "episode": 22944, + "epoch": 0.04582329750392446, + "loss/policy_avg": 0.0036409690510481596, + "lr": 2.7253642638036812e-06, + "objective/entropy": 103.03123474121094, + "objective/kl": 4.775422096252441, + "objective/non_score_reward": -0.23877108097076416, + "objective/rlhf_reward": -1.432626597583294, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.40445613861084, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4892578125, + "step": 955, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001413345336914 + }, + { + "episode": 22968, + "epoch": 0.045871229823489235, + "loss/policy_avg": 0.045600250363349915, + "lr": 2.7250766871165642e-06, + "objective/entropy": 107.71067810058594, + "objective/kl": 6.895091533660889, + "objective/non_score_reward": -0.34475457668304443, + "objective/rlhf_reward": -2.068527564406395, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5691802501678467, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.51171875, + "step": 956, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999849796295166 + }, + { + "episode": 22992, + "epoch": 0.045919162143054006, + "loss/policy_avg": 0.08728957921266556, + "lr": 2.7247891104294476e-06, + "objective/entropy": 124.19444274902344, + "objective/kl": 5.5474066734313965, + "objective/non_score_reward": -0.2773703336715698, + "objective/rlhf_reward": 0.6568947705973152, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.9586308002471924, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.564453125, + "step": 957, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998801350593567 + }, + { + "episode": 23016, + "epoch": 0.045967094462618784, + "loss/policy_avg": 0.05592570826411247, + "lr": 2.724501533742331e-06, + "objective/entropy": 71.75582885742188, + "objective/kl": 6.178896903991699, + "objective/non_score_reward": -0.30894485116004944, + "objective/rlhf_reward": -1.853669062256813, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.383726119995117, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4111328125, + "step": 958, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973199367523193 + }, + { + "episode": 23040, + "epoch": 0.046015026782183555, + "loss/policy_avg": -0.01571398787200451, + "lr": 2.7242139570552145e-06, + "objective/entropy": 95.94173431396484, + "objective/kl": 3.679628372192383, + "objective/non_score_reward": -0.18398141860961914, + "objective/rlhf_reward": 1.0333546482433213, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.837796211242676, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.423828125, + "step": 959, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001438856124878 + }, + { + "episode": 23064, + "epoch": 0.04606295910174833, + "loss/policy_avg": -0.023282315582036972, + "lr": 2.7239263803680983e-06, + "objective/entropy": 79.48719787597656, + "objective/kl": 5.302938461303711, + "objective/non_score_reward": -0.265146940946579, + "objective/rlhf_reward": 0.14350743143714295, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.8397903442382812, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3662109375, + "step": 960, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001072883605957 + }, + { + "episode": 23088, + "epoch": 0.046110891421313104, + "loss/policy_avg": 0.058374930173158646, + "lr": 2.7236388036809818e-06, + "objective/entropy": 98.51213073730469, + "objective/kl": 5.004692077636719, + "objective/non_score_reward": -0.25023460388183594, + "objective/rlhf_reward": 0.3913815778187122, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 6.336297988891602, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 961, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996168613433838 + }, + { + "episode": 23112, + "epoch": 0.04615882374087788, + "loss/policy_avg": 0.04081372916698456, + "lr": 2.723351226993865e-06, + "objective/entropy": 110.80862426757812, + "objective/kl": 2.4009406566619873, + "objective/non_score_reward": -0.12004703283309937, + "objective/rlhf_reward": -0.7202822417020798, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.954480171203613, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.51171875, + "step": 962, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9988830089569092 + }, + { + "episode": 23136, + "epoch": 0.04620675606044265, + "loss/policy_avg": 0.117588572204113, + "lr": 2.7230636503067486e-06, + "objective/entropy": 100.67294311523438, + "objective/kl": 3.8533999919891357, + "objective/non_score_reward": -0.19267001748085022, + "objective/rlhf_reward": -1.1560200676321983, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.340381145477295, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4423828125, + "step": 963, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978582859039307 + }, + { + "episode": 23160, + "epoch": 0.04625468838000743, + "loss/policy_avg": 0.09446854889392853, + "lr": 2.722776073619632e-06, + "objective/entropy": 97.16107177734375, + "objective/kl": 2.051748514175415, + "objective/non_score_reward": -0.10258741676807404, + "objective/rlhf_reward": -0.6155244931578636, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.275383472442627, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.435546875, + "step": 964, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0046606063842773 + }, + { + "episode": 23184, + "epoch": 0.0463026206995722, + "loss/policy_avg": -0.001366584561765194, + "lr": 2.7224884969325155e-06, + "objective/entropy": 84.63103485107422, + "objective/kl": 3.1320254802703857, + "objective/non_score_reward": -0.15660127997398376, + "objective/rlhf_reward": 5.060392368584871, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.006709575653076, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.369140625, + "step": 965, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012717247009277 + }, + { + "episode": 23208, + "epoch": 0.04635055301913698, + "loss/policy_avg": 0.006195601541548967, + "lr": 2.722200920245399e-06, + "objective/entropy": 108.6970443725586, + "objective/kl": 4.3008012771606445, + "objective/non_score_reward": -0.21504007279872894, + "objective/rlhf_reward": 1.2938189712526293, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.7301254272460938, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5, + "step": 966, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001582384109497 + }, + { + "episode": 23232, + "epoch": 0.04639848533870175, + "loss/policy_avg": 0.23275716602802277, + "lr": 2.7219133435582823e-06, + "objective/entropy": 84.40180969238281, + "objective/kl": 5.035403251647949, + "objective/non_score_reward": -0.25177013874053955, + "objective/rlhf_reward": 0.2237680286065422, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.6178174018859863, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4208984375, + "step": 967, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004007816314697 + }, + { + "episode": 23256, + "epoch": 0.04644641765826653, + "loss/policy_avg": 0.009572385810315609, + "lr": 2.7216257668711658e-06, + "objective/entropy": 64.4578628540039, + "objective/kl": 4.744684219360352, + "objective/non_score_reward": -0.2372342050075531, + "objective/rlhf_reward": 0.4693840455702152, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.3526954650878906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3037109375, + "step": 968, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988703727722168 + }, + { + "episode": 23280, + "epoch": 0.0464943499778313, + "loss/policy_avg": -0.012998582795262337, + "lr": 2.721338190184049e-06, + "objective/entropy": 83.9383544921875, + "objective/kl": 5.240660667419434, + "objective/non_score_reward": -0.26203304529190063, + "objective/rlhf_reward": 0.5650449924577606, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.325332164764404, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4189453125, + "step": 969, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993977546691895 + }, + { + "episode": 23304, + "epoch": 0.04654228229739608, + "loss/policy_avg": 0.028443288058042526, + "lr": 2.7210506134969326e-06, + "objective/entropy": 118.51365661621094, + "objective/kl": 4.790587902069092, + "objective/non_score_reward": -0.23952937126159668, + "objective/rlhf_reward": 1.5628237202763557, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.440047740936279, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.51171875, + "step": 970, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966790676116943 + }, + { + "episode": 23328, + "epoch": 0.04659021461696085, + "loss/policy_avg": -0.012223450466990471, + "lr": 2.720763036809816e-06, + "objective/entropy": 109.84603881835938, + "objective/kl": 2.6118721961975098, + "objective/non_score_reward": -0.13059358298778534, + "objective/rlhf_reward": -0.7835615277290344, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6468024253845215, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4453125, + "step": 971, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007357597351074 + }, + { + "episode": 23352, + "epoch": 0.046638146936525626, + "loss/policy_avg": -0.009978484362363815, + "lr": 2.7204754601226995e-06, + "objective/entropy": 87.99717712402344, + "objective/kl": 3.5080699920654297, + "objective/non_score_reward": -0.17540352046489716, + "objective/rlhf_reward": 0.7537589257003102, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 7.06591796875, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.400390625, + "step": 972, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000241279602051 + }, + { + "episode": 23376, + "epoch": 0.0466860792560904, + "loss/policy_avg": -0.04778427630662918, + "lr": 2.720187883435583e-06, + "objective/entropy": 93.74858856201172, + "objective/kl": 6.588132381439209, + "objective/non_score_reward": -0.32940658926963806, + "objective/rlhf_reward": 1.0235604718327522, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.795896053314209, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.4248046875, + "step": 973, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999933242797852 + }, + { + "episode": 23400, + "epoch": 0.046734011575655175, + "loss/policy_avg": 0.09068219363689423, + "lr": 2.7199003067484663e-06, + "objective/entropy": 83.57235717773438, + "objective/kl": 3.3302783966064453, + "objective/non_score_reward": -0.16651391983032227, + "objective/rlhf_reward": 0.8070964289249215, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.907102584838867, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3759765625, + "step": 974, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964741468429565 + }, + { + "episode": 23424, + "epoch": 0.046781943895219946, + "loss/policy_avg": -0.011703098192811012, + "lr": 2.7196127300613497e-06, + "objective/entropy": 111.66146850585938, + "objective/kl": 4.023141384124756, + "objective/non_score_reward": -0.2011570781469345, + "objective/rlhf_reward": 0.5992375311793122, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 0.9623205661773682, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4892578125, + "step": 975, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0032670497894287 + }, + { + "episode": 23448, + "epoch": 0.046829876214784724, + "loss/policy_avg": 0.14769582450389862, + "lr": 2.719325153374233e-06, + "objective/entropy": 101.52580261230469, + "objective/kl": 3.952418327331543, + "objective/non_score_reward": -0.1976209282875061, + "objective/rlhf_reward": -1.1857254952192307, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.116534233093262, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.45703125, + "step": 976, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981496334075928 + }, + { + "episode": 23472, + "epoch": 0.046877808534349495, + "loss/policy_avg": 0.011783421970903873, + "lr": 2.7190375766871166e-06, + "objective/entropy": 59.774864196777344, + "objective/kl": 5.560410976409912, + "objective/non_score_reward": -0.27802056074142456, + "objective/rlhf_reward": -1.668123185634613, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.944483995437622, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41015625, + "step": 977, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003912448883057 + }, + { + "episode": 23496, + "epoch": 0.04692574085391427, + "loss/policy_avg": 0.01912693865597248, + "lr": 2.71875e-06, + "objective/entropy": 76.89988708496094, + "objective/kl": 2.7551612854003906, + "objective/non_score_reward": -0.1377580761909485, + "objective/rlhf_reward": 1.1734516248106956, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.9555230140686035, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3662109375, + "step": 978, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963996410369873 + }, + { + "episode": 23520, + "epoch": 0.046973673173479044, + "loss/policy_avg": 0.004157432820647955, + "lr": 2.7184624233128834e-06, + "objective/entropy": 89.18321228027344, + "objective/kl": 3.8379149436950684, + "objective/non_score_reward": -0.1918957531452179, + "objective/rlhf_reward": 0.6548055147172246, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.008542060852051, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4267578125, + "step": 979, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0035862922668457 + }, + { + "episode": 23544, + "epoch": 0.04702160549304382, + "loss/policy_avg": 0.23004812002182007, + "lr": 2.718174846625767e-06, + "objective/entropy": 77.39889526367188, + "objective/kl": 6.054265022277832, + "objective/non_score_reward": -0.3027132451534271, + "objective/rlhf_reward": 4.183720573782921, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.743056535720825, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3564453125, + "step": 980, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002519369125366 + }, + { + "episode": 23568, + "epoch": 0.0470695378126086, + "loss/policy_avg": 0.01676180586218834, + "lr": 2.7178872699386503e-06, + "objective/entropy": 96.66015625, + "objective/kl": 5.910038948059082, + "objective/non_score_reward": -0.2955019176006317, + "objective/rlhf_reward": -1.773011490702629, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9037346839904785, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.46875, + "step": 981, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004220008850098 + }, + { + "episode": 23592, + "epoch": 0.04711747013217337, + "loss/policy_avg": 0.2623444199562073, + "lr": 2.7175996932515337e-06, + "objective/entropy": 106.00523376464844, + "objective/kl": 7.123592376708984, + "objective/non_score_reward": -0.3561796545982361, + "objective/rlhf_reward": -2.137077882885933, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.32297420501709, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.494140625, + "step": 982, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001258373260498 + }, + { + "episode": 23616, + "epoch": 0.04716540245173815, + "loss/policy_avg": 0.015227511525154114, + "lr": 2.7173121165644176e-06, + "objective/entropy": 87.03790283203125, + "objective/kl": 5.42379903793335, + "objective/non_score_reward": -0.27118998765945435, + "objective/rlhf_reward": 0.2656494241646137, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.560935974121094, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.419921875, + "step": 983, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964410066604614 + }, + { + "episode": 23640, + "epoch": 0.04721333477130292, + "loss/policy_avg": 0.03965728357434273, + "lr": 2.717024539877301e-06, + "objective/entropy": 91.1021728515625, + "objective/kl": 4.47096586227417, + "objective/non_score_reward": -0.2235482931137085, + "objective/rlhf_reward": 0.3930991992250763, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.8851051330566406, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.388671875, + "step": 984, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992451667785645 + }, + { + "episode": 23664, + "epoch": 0.0472612670908677, + "loss/policy_avg": -0.008754530921578407, + "lr": 2.7167369631901844e-06, + "objective/entropy": 104.1549301147461, + "objective/kl": 4.998203277587891, + "objective/non_score_reward": -0.2499101758003235, + "objective/rlhf_reward": 1.500538945198059, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.295590877532959, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.484375, + "step": 985, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000474691390991 + }, + { + "episode": 23688, + "epoch": 0.04730919941043247, + "loss/policy_avg": 0.0166005901992321, + "lr": 2.7164493865030674e-06, + "objective/entropy": 80.62799835205078, + "objective/kl": 2.2910280227661133, + "objective/non_score_reward": -0.11455138772726059, + "objective/rlhf_reward": 3.0982702434939555, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.5412615537643433, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3662109375, + "step": 986, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001638650894165 + }, + { + "episode": 23712, + "epoch": 0.04735713172999725, + "loss/policy_avg": 0.018123328685760498, + "lr": 2.716161809815951e-06, + "objective/entropy": 86.35733032226562, + "objective/kl": 4.2451677322387695, + "objective/non_score_reward": -0.21225838363170624, + "objective/rlhf_reward": -1.2735502049326897, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.0672439336776733, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4091796875, + "step": 987, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013692378997803 + }, + { + "episode": 23736, + "epoch": 0.04740506404956202, + "loss/policy_avg": 0.08874578028917313, + "lr": 2.7158742331288343e-06, + "objective/entropy": 103.39131927490234, + "objective/kl": 6.157857418060303, + "objective/non_score_reward": -0.3078928589820862, + "objective/rlhf_reward": 0.152642861008644, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.952267646789551, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.466796875, + "step": 988, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984214305877686 + }, + { + "episode": 23760, + "epoch": 0.047452996369126796, + "loss/policy_avg": 0.04315752163529396, + "lr": 2.7155866564417177e-06, + "objective/entropy": 95.83990478515625, + "objective/kl": 5.688697814941406, + "objective/non_score_reward": -0.2844349145889282, + "objective/rlhf_reward": -1.7066095471382141, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4004085063934326, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4248046875, + "step": 989, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999322891235352 + }, + { + "episode": 23784, + "epoch": 0.04750092868869157, + "loss/policy_avg": 0.006664841435849667, + "lr": 2.715299079754601e-06, + "objective/entropy": 92.47750854492188, + "objective/kl": 6.639688014984131, + "objective/non_score_reward": -0.33198437094688416, + "objective/rlhf_reward": 1.0080937147140503, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.656175136566162, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4169921875, + "step": 990, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003013610839844 + }, + { + "episode": 23808, + "epoch": 0.047548861008256345, + "loss/policy_avg": 0.04151364415884018, + "lr": 2.7150115030674846e-06, + "objective/entropy": 108.64663696289062, + "objective/kl": 4.321229934692383, + "objective/non_score_reward": -0.21606147289276123, + "objective/rlhf_reward": 1.2876905278475972, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 7.940506935119629, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4892578125, + "step": 991, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975425004959106 + }, + { + "episode": 23832, + "epoch": 0.047596793327821116, + "loss/policy_avg": -0.018848204985260963, + "lr": 2.714723926380368e-06, + "objective/entropy": 117.80276489257812, + "objective/kl": 7.126543045043945, + "objective/non_score_reward": -0.35632723569869995, + "objective/rlhf_reward": -0.24517387035576443, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.863253116607666, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.494140625, + "step": 992, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000507354736328 + }, + { + "episode": 23856, + "epoch": 0.047644725647385894, + "loss/policy_avg": 0.0932856947183609, + "lr": 2.7144363496932514e-06, + "objective/entropy": 82.00892639160156, + "objective/kl": 4.429141044616699, + "objective/non_score_reward": -0.22145704925060272, + "objective/rlhf_reward": 0.47743773063433503, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.34073543548584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.353515625, + "step": 993, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0038039684295654 + }, + { + "episode": 23880, + "epoch": 0.047692657966950665, + "loss/policy_avg": 0.043818142265081406, + "lr": 2.7141487730061353e-06, + "objective/entropy": 123.29915618896484, + "objective/kl": 2.8483166694641113, + "objective/non_score_reward": -0.1424158364534378, + "objective/rlhf_reward": -0.8544950112700462, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1818857192993164, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.53515625, + "step": 994, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0017359256744385 + }, + { + "episode": 23904, + "epoch": 0.04774059028651544, + "loss/policy_avg": 0.026942212134599686, + "lr": 2.7138611963190187e-06, + "objective/entropy": 107.86337280273438, + "objective/kl": 8.00693416595459, + "objective/non_score_reward": -0.40034669637680054, + "objective/rlhf_reward": -2.4020799547433853, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.120025873184204, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.462890625, + "step": 995, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0027036666870117 + }, + { + "episode": 23928, + "epoch": 0.047788522606080214, + "loss/policy_avg": 0.1064753532409668, + "lr": 2.713573619631902e-06, + "objective/entropy": 89.39593505859375, + "objective/kl": 6.297635078430176, + "objective/non_score_reward": -0.3148817718029022, + "objective/rlhf_reward": -1.8892906457185745, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.029832601547241, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.419921875, + "step": 996, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998153448104858 + }, + { + "episode": 23952, + "epoch": 0.04783645492564499, + "loss/policy_avg": 0.048496752977371216, + "lr": 2.7132860429447855e-06, + "objective/entropy": 96.38912963867188, + "objective/kl": 7.67226505279541, + "objective/non_score_reward": -0.3836132884025574, + "objective/rlhf_reward": -2.3016794621944427, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.417209625244141, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.49609375, + "step": 997, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982893466949463 + }, + { + "episode": 23976, + "epoch": 0.04788438724520976, + "loss/policy_avg": 0.058566801249980927, + "lr": 2.712998466257669e-06, + "objective/entropy": 91.35702514648438, + "objective/kl": 5.690446376800537, + "objective/non_score_reward": -0.2845223546028137, + "objective/rlhf_reward": -1.707133948802948, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.222789764404297, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4345703125, + "step": 998, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977848529815674 + }, + { + "episode": 24000, + "epoch": 0.04793231956477454, + "loss/policy_avg": 0.09508547186851501, + "lr": 2.7127108895705524e-06, + "objective/entropy": 119.1187744140625, + "objective/kl": 4.543236255645752, + "objective/non_score_reward": -0.22716180980205536, + "objective/rlhf_reward": 1.2210885045291873, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.7275621891021729, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.55078125, + "step": 999, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001678705215454 + }, + { + "episode": 24024, + "epoch": 0.04798025188433931, + "loss/policy_avg": 0.06598925590515137, + "lr": 2.712423312883436e-06, + "objective/entropy": 97.84954833984375, + "objective/kl": 9.23884105682373, + "objective/non_score_reward": -0.46194207668304443, + "objective/rlhf_reward": -0.6344092778454887, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.291071891784668, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4619140625, + "step": 1000, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975736141204834 + }, + { + "episode": 24048, + "epoch": 0.04802818420390409, + "loss/policy_avg": 0.14304128289222717, + "lr": 2.712135736196319e-06, + "objective/entropy": 69.17900085449219, + "objective/kl": 7.928518295288086, + "objective/non_score_reward": -0.3964259624481201, + "objective/rlhf_reward": -0.4857664990731869, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.732501745223999, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3310546875, + "step": 1001, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004940032958984 + }, + { + "episode": 24072, + "epoch": 0.04807611652346886, + "loss/policy_avg": 0.07809796929359436, + "lr": 2.7118481595092022e-06, + "objective/entropy": 89.22886657714844, + "objective/kl": 5.446550369262695, + "objective/non_score_reward": -0.2723275423049927, + "objective/rlhf_reward": -1.6339651197195053, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.716407775878906, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.40234375, + "step": 1002, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979349374771118 + }, + { + "episode": 24096, + "epoch": 0.04812404884303364, + "loss/policy_avg": 0.10415972769260406, + "lr": 2.7115605828220857e-06, + "objective/entropy": 75.09962463378906, + "objective/kl": 4.435895919799805, + "objective/non_score_reward": -0.22179478406906128, + "objective/rlhf_reward": 4.669231301173568, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.1636972427368164, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3564453125, + "step": 1003, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002255439758301 + }, + { + "episode": 24120, + "epoch": 0.04817198116259841, + "loss/policy_avg": 0.029158011078834534, + "lr": 2.7112730061349695e-06, + "objective/entropy": 120.26954650878906, + "objective/kl": 2.2640914916992188, + "objective/non_score_reward": -0.11320458352565765, + "objective/rlhf_reward": 1.3207725398242474, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.3226474523544312, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.53515625, + "step": 1004, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002615451812744 + }, + { + "episode": 24144, + "epoch": 0.04821991348216319, + "loss/policy_avg": -0.007527500856667757, + "lr": 2.710985429447853e-06, + "objective/entropy": 83.14630889892578, + "objective/kl": 5.495266914367676, + "objective/non_score_reward": -0.2747633457183838, + "objective/rlhf_reward": 0.24420924600871463, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.7093753814697266, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.37109375, + "step": 1005, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003782272338867 + }, + { + "episode": 24168, + "epoch": 0.04826784580172796, + "loss/policy_avg": 0.053870946168899536, + "lr": 2.7106978527607364e-06, + "objective/entropy": 87.84416961669922, + "objective/kl": 3.0082812309265137, + "objective/non_score_reward": -0.15041404962539673, + "objective/rlhf_reward": 1.6815750879408808, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.9335345029830933, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4326171875, + "step": 1006, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997098445892334 + }, + { + "episode": 24192, + "epoch": 0.048315778121292736, + "loss/policy_avg": 0.04058392345905304, + "lr": 2.71041027607362e-06, + "objective/entropy": 80.2613525390625, + "objective/kl": 2.9383134841918945, + "objective/non_score_reward": -0.14691567420959473, + "objective/rlhf_reward": 5.118505969643593, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.1952719688415527, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.41015625, + "step": 1007, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990084171295166 + }, + { + "episode": 24216, + "epoch": 0.04836371044085751, + "loss/policy_avg": 0.01754375547170639, + "lr": 2.7101226993865032e-06, + "objective/entropy": 118.96743774414062, + "objective/kl": 3.166090726852417, + "objective/non_score_reward": -0.15830454230308533, + "objective/rlhf_reward": -0.9498271495103836, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.118056774139404, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5390625, + "step": 1008, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973429441452026 + }, + { + "episode": 24240, + "epoch": 0.048411642760422285, + "loss/policy_avg": 0.059708237648010254, + "lr": 2.7098351226993866e-06, + "objective/entropy": 107.4655990600586, + "objective/kl": 5.354775428771973, + "objective/non_score_reward": -0.2677387595176697, + "objective/rlhf_reward": -1.60643251799047, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.092805862426758, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.486328125, + "step": 1009, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996842384338379 + }, + { + "episode": 24264, + "epoch": 0.048459575079987056, + "loss/policy_avg": 0.07357467710971832, + "lr": 2.70954754601227e-06, + "objective/entropy": 84.27162170410156, + "objective/kl": 3.0251669883728027, + "objective/non_score_reward": -0.15125834941864014, + "objective/rlhf_reward": -0.9075500145554543, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.909226894378662, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.380859375, + "step": 1010, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970598220825195 + }, + { + "episode": 24288, + "epoch": 0.048507507399551834, + "loss/policy_avg": -0.042653124779462814, + "lr": 2.7092599693251535e-06, + "objective/entropy": 112.82060241699219, + "objective/kl": 4.3306779861450195, + "objective/non_score_reward": -0.21653388440608978, + "objective/rlhf_reward": -1.2992032915353775, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.10900354385376, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.494140625, + "step": 1011, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002593755722046 + }, + { + "episode": 24312, + "epoch": 0.048555439719116605, + "loss/policy_avg": 0.08248042315244675, + "lr": 2.708972392638037e-06, + "objective/entropy": 67.11446380615234, + "objective/kl": 3.9913511276245117, + "objective/non_score_reward": -0.19956755638122559, + "objective/rlhf_reward": -1.1974053233861923, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.11003303527832, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.505859375, + "step": 1012, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.998565435409546 + }, + { + "episode": 24336, + "epoch": 0.04860337203868138, + "loss/policy_avg": 0.005220565013587475, + "lr": 2.7086848159509203e-06, + "objective/entropy": 112.75480651855469, + "objective/kl": 2.578864336013794, + "objective/non_score_reward": -0.12894323468208313, + "objective/rlhf_reward": 1.032520562166098, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.566116809844971, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4482421875, + "step": 1013, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998550415039062 + }, + { + "episode": 24360, + "epoch": 0.048651304358246154, + "loss/policy_avg": 0.04755939543247223, + "lr": 2.7083972392638038e-06, + "objective/entropy": 119.0550765991211, + "objective/kl": 7.012206077575684, + "objective/non_score_reward": -0.3506103456020355, + "objective/rlhf_reward": -2.103661820292473, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.913430690765381, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.525390625, + "step": 1014, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997126579284668 + }, + { + "episode": 24384, + "epoch": 0.04869923667781093, + "loss/policy_avg": 0.06394179165363312, + "lr": 2.708109662576687e-06, + "objective/entropy": 85.69821166992188, + "objective/kl": 5.094526290893555, + "objective/non_score_reward": -0.2547262907028198, + "objective/rlhf_reward": -1.5283578783273697, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5340023040771484, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.388671875, + "step": 1015, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992547035217285 + }, + { + "episode": 24408, + "epoch": 0.0487471689973757, + "loss/policy_avg": 0.0049768975004553795, + "lr": 2.7078220858895706e-06, + "objective/entropy": 100.70613861083984, + "objective/kl": 3.6095340251922607, + "objective/non_score_reward": -0.18047671020030975, + "objective/rlhf_reward": 1.5011991468431445, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.7452387809753418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.431640625, + "step": 1016, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002347469329834 + }, + { + "episode": 24432, + "epoch": 0.04879510131694048, + "loss/policy_avg": 0.02415352314710617, + "lr": 2.707534509202454e-06, + "objective/entropy": 92.86202239990234, + "objective/kl": 6.751158714294434, + "objective/non_score_reward": -0.33755791187286377, + "objective/rlhf_reward": -0.2909585431321777, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.562203407287598, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 1017, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994796514511108 + }, + { + "episode": 24456, + "epoch": 0.04884303363650525, + "loss/policy_avg": 0.050056565552949905, + "lr": 2.7072469325153375e-06, + "objective/entropy": 90.67337036132812, + "objective/kl": 5.25379753112793, + "objective/non_score_reward": -0.26268988847732544, + "objective/rlhf_reward": 0.7449775795985225, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.1872470378875732, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.435546875, + "step": 1018, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.003338575363159 + }, + { + "episode": 24480, + "epoch": 0.04889096595607003, + "loss/policy_avg": 0.017885159701108932, + "lr": 2.706959355828221e-06, + "objective/entropy": 118.72476196289062, + "objective/kl": 4.744619369506836, + "objective/non_score_reward": -0.23723094165325165, + "objective/rlhf_reward": 4.576614283025265, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.2537102699279785, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.515625, + "step": 1019, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999507188796997 + }, + { + "episode": 24504, + "epoch": 0.0489388982756348, + "loss/policy_avg": 0.6075594425201416, + "lr": 2.7066717791411043e-06, + "objective/entropy": 103.5989990234375, + "objective/kl": 3.5243334770202637, + "objective/non_score_reward": -0.17621664702892303, + "objective/rlhf_reward": 1.5267593824477887, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.2235445976257324, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4345703125, + "step": 1020, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002223491668701 + }, + { + "episode": 24528, + "epoch": 0.04898683059519958, + "loss/policy_avg": 0.0017945989966392517, + "lr": 2.7063842024539878e-06, + "objective/entropy": 109.43583679199219, + "objective/kl": 3.609422445297241, + "objective/non_score_reward": -0.18047115206718445, + "objective/rlhf_reward": 0.7233531733394895, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.899511337280273, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.498046875, + "step": 1021, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972443580627441 + }, + { + "episode": 24552, + "epoch": 0.04903476291476435, + "loss/policy_avg": 0.03668120875954628, + "lr": 2.706096625766871e-06, + "objective/entropy": 132.55810546875, + "objective/kl": 3.104830265045166, + "objective/non_score_reward": -0.15524151921272278, + "objective/rlhf_reward": -0.9314490556716919, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1814181804656982, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.583984375, + "step": 1022, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004368305206299 + }, + { + "episode": 24576, + "epoch": 0.04908269523432913, + "loss/policy_avg": 0.028224727138876915, + "lr": 2.7058090490797546e-06, + "objective/entropy": 114.84757995605469, + "objective/kl": 2.815242290496826, + "objective/non_score_reward": -0.1407621204853058, + "objective/rlhf_reward": -0.8445727415382862, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.610652923583984, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.533203125, + "step": 1023, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997359037399292 + }, + { + "episode": 24600, + "epoch": 0.0491306275538939, + "loss/policy_avg": 0.0691232830286026, + "lr": 2.705521472392638e-06, + "objective/entropy": 73.5706558227539, + "objective/kl": 5.08782958984375, + "objective/non_score_reward": -0.25439149141311646, + "objective/rlhf_reward": -1.5263487994670868, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.89267635345459, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3447265625, + "step": 1024, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002517700195312 + }, + { + "episode": 24624, + "epoch": 0.04917855987345868, + "loss/policy_avg": 0.06508079171180725, + "lr": 2.7052338957055215e-06, + "objective/entropy": 140.537109375, + "objective/kl": 3.1546378135681152, + "objective/non_score_reward": -0.15773187577724457, + "objective/rlhf_reward": -0.9463912770152092, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5689103603363037, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.615234375, + "step": 1025, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0026845932006836 + }, + { + "episode": 24648, + "epoch": 0.04922649219302345, + "loss/policy_avg": 0.024848511442542076, + "lr": 2.704946319018405e-06, + "objective/entropy": 90.95262145996094, + "objective/kl": 3.578584671020508, + "objective/non_score_reward": -0.17892923951148987, + "objective/rlhf_reward": 1.5104839709760638, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.7189772129058838, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3720703125, + "step": 1026, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002112865447998 + }, + { + "episode": 24672, + "epoch": 0.049274424512588226, + "loss/policy_avg": 0.015545201487839222, + "lr": 2.7046587423312883e-06, + "objective/entropy": 110.74850463867188, + "objective/kl": 4.172607421875, + "objective/non_score_reward": -0.20863036811351776, + "objective/rlhf_reward": -1.2517821118235588, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6949081420898438, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.478515625, + "step": 1027, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001539707183838 + }, + { + "episode": 24696, + "epoch": 0.049322356832153, + "loss/policy_avg": 0.04598255828022957, + "lr": 2.704371165644172e-06, + "objective/entropy": 107.20006561279297, + "objective/kl": 2.9018144607543945, + "objective/non_score_reward": -0.1450907289981842, + "objective/rlhf_reward": 5.129455681890249, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.3738057613372803, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.48828125, + "step": 1028, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0033085346221924 + }, + { + "episode": 24720, + "epoch": 0.049370289151717775, + "loss/policy_avg": -0.010371837764978409, + "lr": 2.7040835889570556e-06, + "objective/entropy": 85.39283752441406, + "objective/kl": 3.2707860469818115, + "objective/non_score_reward": -0.163539320230484, + "objective/rlhf_reward": -0.9812358543276787, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6907291412353516, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.529296875, + "step": 1029, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000976085662842 + }, + { + "episode": 24744, + "epoch": 0.049418221471282546, + "loss/policy_avg": 0.08002160489559174, + "lr": 2.7037960122699386e-06, + "objective/entropy": 97.64878845214844, + "objective/kl": 7.287301063537598, + "objective/non_score_reward": -0.3643650710582733, + "objective/rlhf_reward": -0.18619032204151165, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.4693284034729004, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4365234375, + "step": 1030, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002549409866333 + }, + { + "episode": 24768, + "epoch": 0.049466153790847324, + "loss/policy_avg": -0.0019757114350795746, + "lr": 2.703508435582822e-06, + "objective/entropy": 99.55775451660156, + "objective/kl": 5.603240489959717, + "objective/non_score_reward": -0.2801620364189148, + "objective/rlhf_reward": 0.9030872416855785, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.2326644659042358, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4521484375, + "step": 1031, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0006840229034424 + }, + { + "episode": 24792, + "epoch": 0.049514086110412095, + "loss/policy_avg": 0.09300331026315689, + "lr": 2.7032208588957054e-06, + "objective/entropy": 89.08953857421875, + "objective/kl": 5.504295349121094, + "objective/non_score_reward": -0.2752147614955902, + "objective/rlhf_reward": 1.3487114161252975, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.637001991271973, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3974609375, + "step": 1032, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972859621047974 + }, + { + "episode": 24816, + "epoch": 0.04956201842997687, + "loss/policy_avg": 0.06362459063529968, + "lr": 2.702933282208589e-06, + "objective/entropy": 90.6005859375, + "objective/kl": 6.35899543762207, + "objective/non_score_reward": -0.3179497718811035, + "objective/rlhf_reward": -1.9076984971761703, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4063432216644287, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4140625, + "step": 1033, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018343925476074 + }, + { + "episode": 24840, + "epoch": 0.049609950749541644, + "loss/policy_avg": 0.020037805661559105, + "lr": 2.7026457055214723e-06, + "objective/entropy": 98.22341918945312, + "objective/kl": 5.960907936096191, + "objective/non_score_reward": -0.29804539680480957, + "objective/rlhf_reward": -1.7882723361253738, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6099629402160645, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.435546875, + "step": 1034, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001791477203369 + }, + { + "episode": 24864, + "epoch": 0.04965788306910642, + "loss/policy_avg": 0.061475642025470734, + "lr": 2.7023581288343557e-06, + "objective/entropy": 83.22535705566406, + "objective/kl": 6.32804012298584, + "objective/non_score_reward": -0.31640201807022095, + "objective/rlhf_reward": -0.09223190346943999, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.434661388397217, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.451171875, + "step": 1035, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9968898296356201 + }, + { + "episode": 24888, + "epoch": 0.04970581538867119, + "loss/policy_avg": 0.025768093764781952, + "lr": 2.702070552147239e-06, + "objective/entropy": 108.4962158203125, + "objective/kl": 3.771277904510498, + "objective/non_score_reward": -0.18856388330459595, + "objective/rlhf_reward": -1.1313833072781563, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.628310203552246, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.56640625, + "step": 1036, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990639686584473 + }, + { + "episode": 24912, + "epoch": 0.04975374770823597, + "loss/policy_avg": 0.0246554184705019, + "lr": 2.7017829754601226e-06, + "objective/entropy": 99.86882781982422, + "objective/kl": 4.707627773284912, + "objective/non_score_reward": -0.23538139462471008, + "objective/rlhf_reward": 1.58771163970232, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.275207281112671, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.455078125, + "step": 1037, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000839233398438 + }, + { + "episode": 24936, + "epoch": 0.04980168002780075, + "loss/policy_avg": 0.3910715579986572, + "lr": 2.7014953987730064e-06, + "objective/entropy": 90.42269897460938, + "objective/kl": 5.15154504776001, + "objective/non_score_reward": -0.25757724046707153, + "objective/rlhf_reward": 0.18892553373134957, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.051049709320068, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4169921875, + "step": 1038, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993643760681152 + }, + { + "episode": 24960, + "epoch": 0.04984961234736552, + "loss/policy_avg": 0.013361216522753239, + "lr": 2.70120782208589e-06, + "objective/entropy": 68.70500946044922, + "objective/kl": 5.1257429122924805, + "objective/non_score_reward": -0.256287157535553, + "objective/rlhf_reward": -1.5377227813005447, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.506808280944824, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3125, + "step": 1039, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000718593597412 + }, + { + "episode": 24984, + "epoch": 0.0498975446669303, + "loss/policy_avg": 0.04723326861858368, + "lr": 2.7009202453987733e-06, + "objective/entropy": 76.6446533203125, + "objective/kl": 6.629053115844727, + "objective/non_score_reward": -0.3314526677131653, + "objective/rlhf_reward": -0.25432699621760024, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.492891788482666, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3544921875, + "step": 1040, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99686861038208 + }, + { + "episode": 25008, + "epoch": 0.04994547698649507, + "loss/policy_avg": -0.01758612133562565, + "lr": 2.7006326687116567e-06, + "objective/entropy": 100.53507995605469, + "objective/kl": 6.206151485443115, + "objective/non_score_reward": -0.3103075921535492, + "objective/rlhf_reward": 0.2753976442326439, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.9276843070983887, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.419921875, + "step": 1041, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996836185455322 + }, + { + "episode": 25032, + "epoch": 0.04999340930605985, + "loss/policy_avg": 0.06158098205924034, + "lr": 2.70034509202454e-06, + "objective/entropy": 129.49801635742188, + "objective/kl": 6.650812149047852, + "objective/non_score_reward": -0.3325406312942505, + "objective/rlhf_reward": -1.9952437728643417, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.869114398956299, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 1042, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997481346130371 + }, + { + "episode": 25056, + "epoch": 0.05004134162562462, + "loss/policy_avg": 0.012017151340842247, + "lr": 2.7000575153374236e-06, + "objective/entropy": 98.99923706054688, + "objective/kl": 2.9874420166015625, + "objective/non_score_reward": -0.14937210083007812, + "objective/rlhf_reward": -0.8962325714528561, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.47694987058639526, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4228515625, + "step": 1043, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028514862060547 + }, + { + "episode": 25080, + "epoch": 0.050089273945189396, + "loss/policy_avg": 0.01772144064307213, + "lr": 2.699769938650307e-06, + "objective/entropy": 106.39108276367188, + "objective/kl": 4.67110013961792, + "objective/non_score_reward": -0.23355500400066376, + "objective/rlhf_reward": -1.4013300240039825, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5515239238739014, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4912109375, + "step": 1044, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000089168548584 + }, + { + "episode": 25104, + "epoch": 0.05013720626475417, + "loss/policy_avg": 0.025676341727375984, + "lr": 2.69948236196319e-06, + "objective/entropy": 106.17345428466797, + "objective/kl": 3.408492088317871, + "objective/non_score_reward": -0.17042462527751923, + "objective/rlhf_reward": 1.97745231539011, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.1122283935546875, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4697265625, + "step": 1045, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979991912841797 + }, + { + "episode": 25128, + "epoch": 0.050185138584318945, + "loss/policy_avg": -0.0024718251079320908, + "lr": 2.6991947852760734e-06, + "objective/entropy": 87.14985656738281, + "objective/kl": 7.8195343017578125, + "objective/non_score_reward": -0.3909766674041748, + "objective/rlhf_reward": -2.345860183238983, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7266485691070557, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4423828125, + "step": 1046, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0023632049560547 + }, + { + "episode": 25152, + "epoch": 0.050233070903883716, + "loss/policy_avg": -0.031009195372462273, + "lr": 2.698907208588957e-06, + "objective/entropy": 95.13604736328125, + "objective/kl": 5.25629186630249, + "objective/non_score_reward": -0.26281458139419556, + "objective/rlhf_reward": -1.5768874287605286, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.050119161605835, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4140625, + "step": 1047, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995667934417725 + }, + { + "episode": 25176, + "epoch": 0.050281003223448494, + "loss/policy_avg": -0.003023129887878895, + "lr": 2.6986196319018403e-06, + "objective/entropy": 133.37908935546875, + "objective/kl": 6.648040771484375, + "objective/non_score_reward": -0.3324020504951477, + "objective/rlhf_reward": -1.994412213563919, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8278441429138184, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.599609375, + "step": 1048, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994492530822754 + }, + { + "episode": 25200, + "epoch": 0.050328935543013265, + "loss/policy_avg": 0.10148762166500092, + "lr": 2.698332055214724e-06, + "objective/entropy": 103.1484375, + "objective/kl": 6.5511627197265625, + "objective/non_score_reward": -0.32755813002586365, + "objective/rlhf_reward": -1.9653487280011177, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4036879539489746, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4697265625, + "step": 1049, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987576007843018 + }, + { + "episode": 25224, + "epoch": 0.05037686786257804, + "loss/policy_avg": 0.106508269906044, + "lr": 2.6980444785276075e-06, + "objective/entropy": 96.95748901367188, + "objective/kl": 7.169236660003662, + "objective/non_score_reward": -0.35846179723739624, + "objective/rlhf_reward": -2.1507708206772804, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.612119674682617, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.39453125, + "step": 1050, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975073337554932 + }, + { + "episode": 25248, + "epoch": 0.050424800182142814, + "loss/policy_avg": 0.04231148585677147, + "lr": 2.697756901840491e-06, + "objective/entropy": 80.9957275390625, + "objective/kl": 3.820591926574707, + "objective/non_score_reward": -0.1910296082496643, + "objective/rlhf_reward": 0.9910654284466637, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.916405200958252, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.373046875, + "step": 1051, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995100498199463 + }, + { + "episode": 25272, + "epoch": 0.05047273250170759, + "loss/policy_avg": 0.05944293737411499, + "lr": 2.6974693251533744e-06, + "objective/entropy": 85.8700180053711, + "objective/kl": 7.372799873352051, + "objective/non_score_reward": -0.3686399459838867, + "objective/rlhf_reward": -2.211839884519577, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3360795974731445, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4638671875, + "step": 1052, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.000526189804077 + }, + { + "episode": 25296, + "epoch": 0.05052066482127236, + "loss/policy_avg": 0.04855162650346756, + "lr": 2.697181748466258e-06, + "objective/entropy": 147.2095947265625, + "objective/kl": 4.985818386077881, + "objective/non_score_reward": -0.24929091334342957, + "objective/rlhf_reward": 0.8253713987369302, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.028564929962158, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.67578125, + "step": 1053, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992462396621704 + }, + { + "episode": 25320, + "epoch": 0.05056859714083714, + "loss/policy_avg": 0.03917790576815605, + "lr": 2.6968941717791412e-06, + "objective/entropy": 94.20643615722656, + "objective/kl": 6.04376220703125, + "objective/non_score_reward": -0.30218812823295593, + "objective/rlhf_reward": -1.813128650188446, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.913430213928223, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.416015625, + "step": 1054, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9948991537094116 + }, + { + "episode": 25344, + "epoch": 0.05061652946040191, + "loss/policy_avg": 0.2682764232158661, + "lr": 2.6966065950920247e-06, + "objective/entropy": 85.73516845703125, + "objective/kl": 6.388323783874512, + "objective/non_score_reward": -0.3194161653518677, + "objective/rlhf_reward": -0.1103169436215129, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.425605297088623, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.392578125, + "step": 1055, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984767436981201 + }, + { + "episode": 25368, + "epoch": 0.05066446177996669, + "loss/policy_avg": 0.06052929162979126, + "lr": 2.696319018404908e-06, + "objective/entropy": 109.15867614746094, + "objective/kl": 7.656983852386475, + "objective/non_score_reward": -0.38284915685653687, + "objective/rlhf_reward": -2.297094941139221, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.438884735107422, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4716796875, + "step": 1056, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993010759353638 + }, + { + "episode": 25392, + "epoch": 0.05071239409953146, + "loss/policy_avg": -0.060102906078100204, + "lr": 2.6960314417177915e-06, + "objective/entropy": 78.56459045410156, + "objective/kl": 3.284517765045166, + "objective/non_score_reward": -0.16422587633132935, + "objective/rlhf_reward": 5.014644704759121, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.3127694129943848, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.359375, + "step": 1057, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003509998321533 + }, + { + "episode": 25416, + "epoch": 0.05076032641909624, + "loss/policy_avg": -0.02001863159239292, + "lr": 2.695743865030675e-06, + "objective/entropy": 71.29380798339844, + "objective/kl": 4.287959098815918, + "objective/non_score_reward": -0.21439795196056366, + "objective/rlhf_reward": -1.2863877825438976, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6923816204071045, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3310546875, + "step": 1058, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003952980041504 + }, + { + "episode": 25440, + "epoch": 0.05080825873866101, + "loss/policy_avg": 0.06954438984394073, + "lr": 2.6954562883435584e-06, + "objective/entropy": 125.73146057128906, + "objective/kl": 4.117081165313721, + "objective/non_score_reward": -0.20585405826568604, + "objective/rlhf_reward": -1.235124371945858, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0499606132507324, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.5625, + "step": 1059, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011332035064697 + }, + { + "episode": 25464, + "epoch": 0.05085619105822579, + "loss/policy_avg": 0.15793174505233765, + "lr": 2.695168711656442e-06, + "objective/entropy": 93.87025451660156, + "objective/kl": 5.391411781311035, + "objective/non_score_reward": -0.26957058906555176, + "objective/rlhf_reward": -1.617423564195633, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.463604927062988, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4619140625, + "step": 1060, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986833333969116 + }, + { + "episode": 25488, + "epoch": 0.05090412337779056, + "loss/policy_avg": -0.030255354940891266, + "lr": 2.6948811349693252e-06, + "objective/entropy": 82.12006378173828, + "objective/kl": 4.363125801086426, + "objective/non_score_reward": -0.21815630793571472, + "objective/rlhf_reward": -1.3089378625154495, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.6944248080253601, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.369140625, + "step": 1061, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003108501434326 + }, + { + "episode": 25512, + "epoch": 0.050952055697355336, + "loss/policy_avg": 0.1283409297466278, + "lr": 2.6945935582822086e-06, + "objective/entropy": 90.52935791015625, + "objective/kl": 4.869753837585449, + "objective/non_score_reward": -0.24348768591880798, + "objective/rlhf_reward": -1.4609260484576225, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.171123743057251, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.400390625, + "step": 1062, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004133701324463 + }, + { + "episode": 25536, + "epoch": 0.05099998801692011, + "loss/policy_avg": -0.038133323192596436, + "lr": 2.694305981595092e-06, + "objective/entropy": 85.88705444335938, + "objective/kl": 4.34666633605957, + "objective/non_score_reward": -0.21733331680297852, + "objective/rlhf_reward": 2.4815785908085517, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.8542449474334717, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.390625, + "step": 1063, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008718967437744 + }, + { + "episode": 25560, + "epoch": 0.051047920336484885, + "loss/policy_avg": 0.018995456397533417, + "lr": 2.6940184049079755e-06, + "objective/entropy": 102.41753387451172, + "objective/kl": 5.507162094116211, + "objective/non_score_reward": -0.275358110666275, + "objective/rlhf_reward": -1.652148649096489, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0196542739868164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.470703125, + "step": 1064, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991952180862427 + }, + { + "episode": 25584, + "epoch": 0.051095852656049656, + "loss/policy_avg": 0.02189953438937664, + "lr": 2.693730828220859e-06, + "objective/entropy": 97.60025024414062, + "objective/kl": 3.7985644340515137, + "objective/non_score_reward": -0.18992821872234344, + "objective/rlhf_reward": 0.8604307621717452, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.570662498474121, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.42578125, + "step": 1065, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975755214691162 + }, + { + "episode": 25608, + "epoch": 0.051143784975614434, + "loss/policy_avg": 0.09388387948274612, + "lr": 2.6934432515337424e-06, + "objective/entropy": 89.61599731445312, + "objective/kl": 5.730984687805176, + "objective/non_score_reward": -0.2865492105484009, + "objective/rlhf_reward": -1.7192952446639538, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.3976335525512695, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3935546875, + "step": 1066, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970524311065674 + }, + { + "episode": 25632, + "epoch": 0.051191717295179205, + "loss/policy_avg": 0.07741451263427734, + "lr": 2.6931556748466258e-06, + "objective/entropy": 92.00106048583984, + "objective/kl": 7.2181854248046875, + "objective/non_score_reward": -0.3609093129634857, + "objective/rlhf_reward": -2.1654557585716248, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.446036338806152, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3828125, + "step": 1067, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019752979278564 + }, + { + "episode": 25656, + "epoch": 0.05123964961474398, + "loss/policy_avg": 0.0610862672328949, + "lr": 2.692868098159509e-06, + "objective/entropy": 74.00466918945312, + "objective/kl": 7.197944164276123, + "objective/non_score_reward": -0.3598972260951996, + "objective/rlhf_reward": 0.42467599186916083, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.3070895671844482, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.416015625, + "step": 1068, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998591423034668 + }, + { + "episode": 25680, + "epoch": 0.051287581934308754, + "loss/policy_avg": 0.2528974413871765, + "lr": 2.6925805214723926e-06, + "objective/entropy": 124.92976379394531, + "objective/kl": 4.640925407409668, + "objective/non_score_reward": -0.23204627633094788, + "objective/rlhf_reward": -1.3922776728868484, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.168655872344971, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.560546875, + "step": 1069, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976460933685303 + }, + { + "episode": 25704, + "epoch": 0.05133551425387353, + "loss/policy_avg": -0.009870676323771477, + "lr": 2.692292944785276e-06, + "objective/entropy": 90.36968231201172, + "objective/kl": 4.621509552001953, + "objective/non_score_reward": -0.2310754507780075, + "objective/rlhf_reward": -1.3864527195692062, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9489268064498901, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4033203125, + "step": 1070, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0058841705322266 + }, + { + "episode": 25728, + "epoch": 0.0513834465734383, + "loss/policy_avg": -0.004592197015881538, + "lr": 2.6920053680981595e-06, + "objective/entropy": 110.4934310913086, + "objective/kl": 5.948945999145508, + "objective/non_score_reward": -0.2974473237991333, + "objective/rlhf_reward": 0.21531617641448964, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.3166446685791016, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4970703125, + "step": 1071, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002019166946411 + }, + { + "episode": 25752, + "epoch": 0.05143137889300308, + "loss/policy_avg": 0.045978888869285583, + "lr": 2.6917177914110433e-06, + "objective/entropy": 71.1092300415039, + "objective/kl": 4.022435665130615, + "objective/non_score_reward": -0.20112180709838867, + "objective/rlhf_reward": 4.793269246816635, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.0449717044830322, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.33203125, + "step": 1072, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012996196746826 + }, + { + "episode": 25776, + "epoch": 0.05147931121256785, + "loss/policy_avg": -0.021327612921595573, + "lr": 2.6914302147239268e-06, + "objective/entropy": 81.15596771240234, + "objective/kl": 4.029016017913818, + "objective/non_score_reward": -0.20145079493522644, + "objective/rlhf_reward": 2.576873759267967, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.071308612823486, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.365234375, + "step": 1073, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001601219177246 + }, + { + "episode": 25800, + "epoch": 0.05152724353213263, + "loss/policy_avg": 0.08145155012607574, + "lr": 2.69114263803681e-06, + "objective/entropy": 89.00228118896484, + "objective/kl": 6.974478721618652, + "objective/non_score_reward": -0.3487238883972168, + "objective/rlhf_reward": -0.09234352409839641, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.053592681884766, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.37890625, + "step": 1074, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985065460205078 + }, + { + "episode": 25824, + "epoch": 0.0515751758516974, + "loss/policy_avg": 0.05030512809753418, + "lr": 2.690855061349693e-06, + "objective/entropy": 77.50820922851562, + "objective/kl": 3.2328972816467285, + "objective/non_score_reward": -0.1616448611021042, + "objective/rlhf_reward": -0.9698691219091415, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.225308418273926, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.365234375, + "step": 1075, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983296394348145 + }, + { + "episode": 25848, + "epoch": 0.05162310817126218, + "loss/policy_avg": 0.3414855897426605, + "lr": 2.6905674846625766e-06, + "objective/entropy": 80.98123168945312, + "objective/kl": 5.9499006271362305, + "objective/non_score_reward": -0.2974950075149536, + "objective/rlhf_reward": 0.21503002941608418, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.8293230533599854, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3740234375, + "step": 1076, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999202847480774 + }, + { + "episode": 25872, + "epoch": 0.05167104049082695, + "loss/policy_avg": 0.01846831850707531, + "lr": 2.69027990797546e-06, + "objective/entropy": 122.08592224121094, + "objective/kl": 5.0494065284729, + "objective/non_score_reward": -0.25247034430503845, + "objective/rlhf_reward": 4.485178090631962, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.006554126739502, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.572265625, + "step": 1077, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994326829910278 + }, + { + "episode": 25896, + "epoch": 0.05171897281039173, + "loss/policy_avg": 1.162643313407898, + "lr": 2.6899923312883435e-06, + "objective/entropy": 90.00956726074219, + "objective/kl": 8.310735702514648, + "objective/non_score_reward": -0.4155367314815521, + "objective/rlhf_reward": 0.09083891484756201, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.9102622270584106, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.408203125, + "step": 1078, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002535820007324 + }, + { + "episode": 25920, + "epoch": 0.0517669051299565, + "loss/policy_avg": 0.17476975917816162, + "lr": 2.689704754601227e-06, + "objective/entropy": 95.99754333496094, + "objective/kl": 6.232204437255859, + "objective/non_score_reward": -0.31161028146743774, + "objective/rlhf_reward": -1.869661569595337, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.02110481262207, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.447265625, + "step": 1079, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002382755279541 + }, + { + "episode": 25944, + "epoch": 0.05181483744952128, + "loss/policy_avg": 0.1580069214105606, + "lr": 2.6894171779141103e-06, + "objective/entropy": 110.78697967529297, + "objective/kl": 8.150861740112305, + "objective/non_score_reward": -0.40754303336143494, + "objective/rlhf_reward": -0.7108693912728943, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.650873184204102, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.478515625, + "step": 1080, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995501041412354 + }, + { + "episode": 25968, + "epoch": 0.05186276976908605, + "loss/policy_avg": 0.024494625627994537, + "lr": 2.6891296012269937e-06, + "objective/entropy": 83.67759704589844, + "objective/kl": 4.970746040344238, + "objective/non_score_reward": -0.24853728711605072, + "objective/rlhf_reward": -1.4912237226963043, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.145186185836792, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3720703125, + "step": 1081, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012364387512207 + }, + { + "episode": 25992, + "epoch": 0.051910702088650826, + "loss/policy_avg": 0.08320475369691849, + "lr": 2.688842024539877e-06, + "objective/entropy": 143.62094116210938, + "objective/kl": 2.220930814743042, + "objective/non_score_reward": -0.11104654520750046, + "objective/rlhf_reward": -0.666279274970293, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6850529909133911, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.662109375, + "step": 1082, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0029830932617188 + }, + { + "episode": 26016, + "epoch": 0.0519586344082156, + "loss/policy_avg": -0.021369092166423798, + "lr": 2.688554447852761e-06, + "objective/entropy": 116.96939086914062, + "objective/kl": 2.9203994274139404, + "objective/non_score_reward": -0.14601998031139374, + "objective/rlhf_reward": 1.444996968989468, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.263298511505127, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5390625, + "step": 1083, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001739263534546 + }, + { + "episode": 26040, + "epoch": 0.052006566727780375, + "loss/policy_avg": -0.0038502956740558147, + "lr": 2.6882668711656444e-06, + "objective/entropy": 92.00323486328125, + "objective/kl": 5.040329933166504, + "objective/non_score_reward": -0.25201651453971863, + "objective/rlhf_reward": -1.5120989605784416, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.689406394958496, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4423828125, + "step": 1084, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980416297912598 + }, + { + "episode": 26064, + "epoch": 0.052054499047345146, + "loss/policy_avg": 0.12430846691131592, + "lr": 2.687979294478528e-06, + "objective/entropy": 98.50570678710938, + "objective/kl": 5.1425700187683105, + "objective/non_score_reward": -0.25712850689888, + "objective/rlhf_reward": 1.4572290405631065, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.943840980529785, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4267578125, + "step": 1085, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0026111602783203 + }, + { + "episode": 26088, + "epoch": 0.052102431366909924, + "loss/policy_avg": 0.01346190832555294, + "lr": 2.6876917177914113e-06, + "objective/entropy": 76.96609497070312, + "objective/kl": 4.847370147705078, + "objective/non_score_reward": -0.24236845970153809, + "objective/rlhf_reward": -1.4542107284069061, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3451855182647705, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3564453125, + "step": 1086, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996061325073242 + }, + { + "episode": 26112, + "epoch": 0.052150363686474695, + "loss/policy_avg": -0.004887872375547886, + "lr": 2.6874041411042947e-06, + "objective/entropy": 130.09368896484375, + "objective/kl": 3.8717565536499023, + "objective/non_score_reward": -0.19358782470226288, + "objective/rlhf_reward": 1.4225323555232974, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.3307621479034424, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.556640625, + "step": 1087, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016374588012695 + }, + { + "episode": 26136, + "epoch": 0.05219829600603947, + "loss/policy_avg": 0.005219799000769854, + "lr": 2.687116564417178e-06, + "objective/entropy": 115.71604919433594, + "objective/kl": 4.252038955688477, + "objective/non_score_reward": -0.2126019448041916, + "objective/rlhf_reward": 0.45877728908217774, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.4403825998306274, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.51953125, + "step": 1088, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0029852390289307 + }, + { + "episode": 26160, + "epoch": 0.052246228325604244, + "loss/policy_avg": 0.03626570478081703, + "lr": 2.6868289877300616e-06, + "objective/entropy": 102.53549194335938, + "objective/kl": 2.6790547370910645, + "objective/non_score_reward": -0.13395272195339203, + "objective/rlhf_reward": -0.8037163764238358, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.475315093994141, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4619140625, + "step": 1089, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000112771987915 + }, + { + "episode": 26184, + "epoch": 0.05229416064516902, + "loss/policy_avg": -0.01193223800510168, + "lr": 2.6865414110429446e-06, + "objective/entropy": 85.09180450439453, + "objective/kl": 5.377470016479492, + "objective/non_score_reward": -0.26887354254722595, + "objective/rlhf_reward": 0.524002012651099, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.344847202301025, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.400390625, + "step": 1090, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995840787887573 + }, + { + "episode": 26208, + "epoch": 0.05234209296473379, + "loss/policy_avg": 0.05091739445924759, + "lr": 2.686253834355828e-06, + "objective/entropy": 78.49052429199219, + "objective/kl": 4.861946105957031, + "objective/non_score_reward": -0.24309733510017395, + "objective/rlhf_reward": 1.541416086256504, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.2298853397369385, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3662109375, + "step": 1091, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0057835578918457 + }, + { + "episode": 26232, + "epoch": 0.05239002528429857, + "loss/policy_avg": 0.05987296998500824, + "lr": 2.6859662576687114e-06, + "objective/entropy": 102.1220703125, + "objective/kl": 2.9255664348602295, + "objective/non_score_reward": -0.14627832174301147, + "objective/rlhf_reward": 0.9285100658775601, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.5727765560150146, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4775390625, + "step": 1092, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984405040740967 + }, + { + "episode": 26256, + "epoch": 0.05243795760386334, + "loss/policy_avg": 0.09344521164894104, + "lr": 2.6856786809815953e-06, + "objective/entropy": 137.17372131347656, + "objective/kl": 4.935346603393555, + "objective/non_score_reward": -0.2467673271894455, + "objective/rlhf_reward": -1.480603851377964, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3327267169952393, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.61328125, + "step": 1093, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0004453659057617 + }, + { + "episode": 26280, + "epoch": 0.05248588992342812, + "loss/policy_avg": 0.037197284400463104, + "lr": 2.6853911042944787e-06, + "objective/entropy": 85.84927368164062, + "objective/kl": 2.4550423622131348, + "objective/non_score_reward": -0.1227521300315857, + "objective/rlhf_reward": -0.736512765288353, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2548646926879883, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 1094, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028016567230225 + }, + { + "episode": 26304, + "epoch": 0.0525338222429929, + "loss/policy_avg": 0.0020785462111234665, + "lr": 2.685103527607362e-06, + "objective/entropy": 80.61239624023438, + "objective/kl": 3.826831340789795, + "objective/non_score_reward": -0.1913415640592575, + "objective/rlhf_reward": -1.1480493396520615, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.8170492649078369, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3642578125, + "step": 1095, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0076863765716553 + }, + { + "episode": 26328, + "epoch": 0.05258175456255767, + "loss/policy_avg": 0.030912555754184723, + "lr": 2.6848159509202456e-06, + "objective/entropy": 107.50244140625, + "objective/kl": 3.698716878890991, + "objective/non_score_reward": -0.18493585288524628, + "objective/rlhf_reward": -1.1096149906516075, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6073464155197144, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4951171875, + "step": 1096, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007123947143555 + }, + { + "episode": 26352, + "epoch": 0.052629686882122446, + "loss/policy_avg": 0.0211525559425354, + "lr": 2.684528374233129e-06, + "objective/entropy": 123.37950897216797, + "objective/kl": 5.552650451660156, + "objective/non_score_reward": -0.27763253450393677, + "objective/rlhf_reward": 2.1197834485155753, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.138249397277832, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.55078125, + "step": 1097, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989380836486816 + }, + { + "episode": 26376, + "epoch": 0.05267761920168722, + "loss/policy_avg": -0.00902971625328064, + "lr": 2.6842407975460124e-06, + "objective/entropy": 93.1274642944336, + "objective/kl": 4.2247748374938965, + "objective/non_score_reward": -0.21123874187469482, + "objective/rlhf_reward": 0.5387475897909436, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.1872198581695557, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4228515625, + "step": 1098, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003018379211426 + }, + { + "episode": 26400, + "epoch": 0.052725551521251995, + "loss/policy_avg": 0.008544385433197021, + "lr": 2.683953220858896e-06, + "objective/entropy": 122.2210922241211, + "objective/kl": 2.2896101474761963, + "objective/non_score_reward": -0.11448051780462265, + "objective/rlhf_reward": 1.3131169192492962, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.886326551437378, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.501953125, + "step": 1099, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006933212280273 + }, + { + "episode": 26424, + "epoch": 0.052773483840816766, + "loss/policy_avg": 0.10443119704723358, + "lr": 2.6836656441717793e-06, + "objective/entropy": 47.938575744628906, + "objective/kl": 4.679109573364258, + "objective/non_score_reward": -0.2339554727077484, + "objective/rlhf_reward": 0.4890564244678821, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.006369590759277, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3447265625, + "step": 1100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982452392578125 + }, + { + "episode": 26448, + "epoch": 0.052821416160381544, + "loss/policy_avg": 0.039556995034217834, + "lr": 2.6833780674846627e-06, + "objective/entropy": 103.51300048828125, + "objective/kl": 5.946491241455078, + "objective/non_score_reward": -0.29732459783554077, + "objective/rlhf_reward": 0.353295654844178, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.277250289916992, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4833984375, + "step": 1101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001726150512695 + }, + { + "episode": 26472, + "epoch": 0.052869348479946315, + "loss/policy_avg": -0.015102000907063484, + "lr": 2.683090490797546e-06, + "objective/entropy": 99.54130554199219, + "objective/kl": 6.767150402069092, + "objective/non_score_reward": -0.338357537984848, + "objective/rlhf_reward": -2.030145138502121, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.551590919494629, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.453125, + "step": 1102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983646869659424 + }, + { + "episode": 26496, + "epoch": 0.05291728079951109, + "loss/policy_avg": 0.04424537718296051, + "lr": 2.6828029141104295e-06, + "objective/entropy": 110.62435150146484, + "objective/kl": 4.049128532409668, + "objective/non_score_reward": -0.20245644450187683, + "objective/rlhf_reward": -1.2147385627031326, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8717929124832153, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4697265625, + "step": 1103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996132850646973 + }, + { + "episode": 26520, + "epoch": 0.052965213119075864, + "loss/policy_avg": 0.05446460843086243, + "lr": 2.682515337423313e-06, + "objective/entropy": 102.2375717163086, + "objective/kl": 1.9513758420944214, + "objective/non_score_reward": -0.09756879508495331, + "objective/rlhf_reward": -0.5854127258062363, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.141634464263916, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4501953125, + "step": 1104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008771419525146 + }, + { + "episode": 26544, + "epoch": 0.05301314543864064, + "loss/policy_avg": 0.03195279464125633, + "lr": 2.6822277607361964e-06, + "objective/entropy": 99.55656433105469, + "objective/kl": 5.64494514465332, + "objective/non_score_reward": -0.28224727511405945, + "objective/rlhf_reward": 1.3065164238214493, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.392418384552002, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4765625, + "step": 1105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979453086853027 + }, + { + "episode": 26568, + "epoch": 0.05306107775820541, + "loss/policy_avg": 0.12074029445648193, + "lr": 2.68194018404908e-06, + "objective/entropy": 119.58946228027344, + "objective/kl": 3.6619327068328857, + "objective/non_score_reward": -0.18309663236141205, + "objective/rlhf_reward": 0.6358091413871132, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.9358670711517334, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.501953125, + "step": 1106, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997448444366455 + }, + { + "episode": 26592, + "epoch": 0.05310901007777019, + "loss/policy_avg": 0.0242460910230875, + "lr": 2.6816526073619632e-06, + "objective/entropy": 113.30186462402344, + "objective/kl": 4.510379791259766, + "objective/non_score_reward": -0.22551898658275604, + "objective/rlhf_reward": 1.6468861028552055, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.70161247253418, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48046875, + "step": 1107, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997785091400146 + }, + { + "episode": 26616, + "epoch": 0.05315694239733496, + "loss/policy_avg": -0.022288450971245766, + "lr": 2.6813650306748467e-06, + "objective/entropy": 86.7301025390625, + "objective/kl": 5.560698509216309, + "objective/non_score_reward": -0.27803492546081543, + "objective/rlhf_reward": 0.652907387499905, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.94580078125, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.361328125, + "step": 1108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014841556549072 + }, + { + "episode": 26640, + "epoch": 0.05320487471689974, + "loss/policy_avg": 0.2012617588043213, + "lr": 2.68107745398773e-06, + "objective/entropy": 98.65434265136719, + "objective/kl": 7.60493278503418, + "objective/non_score_reward": -0.380246639251709, + "objective/rlhf_reward": -0.2814799249172212, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.425183296203613, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.439453125, + "step": 1109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989205598831177 + }, + { + "episode": 26664, + "epoch": 0.05325280703646451, + "loss/policy_avg": 0.09516517072916031, + "lr": 2.6807898773006135e-06, + "objective/entropy": 94.5146713256836, + "objective/kl": 4.2626190185546875, + "objective/non_score_reward": -0.21313093602657318, + "objective/rlhf_reward": -1.2787856832146645, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.326416254043579, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4072265625, + "step": 1110, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0017504692077637 + }, + { + "episode": 26688, + "epoch": 0.05330073935602929, + "loss/policy_avg": -0.04179363325238228, + "lr": 2.680502300613497e-06, + "objective/entropy": 96.63059997558594, + "objective/kl": 4.44108772277832, + "objective/non_score_reward": -0.2220543622970581, + "objective/rlhf_reward": -1.3323261439800262, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6641474962234497, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.416015625, + "step": 1111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015130043029785 + }, + { + "episode": 26712, + "epoch": 0.05334867167559406, + "loss/policy_avg": 0.07445335388183594, + "lr": 2.6802147239263804e-06, + "objective/entropy": 88.36076354980469, + "objective/kl": 6.752433776855469, + "objective/non_score_reward": -0.33762168884277344, + "objective/rlhf_reward": -0.2913413018091835, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 7.508903980255127, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3994140625, + "step": 1112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9955856800079346 + }, + { + "episode": 26736, + "epoch": 0.05339660399515884, + "loss/policy_avg": 0.03495807200670242, + "lr": 2.679927147239264e-06, + "objective/entropy": 91.1797103881836, + "objective/kl": 4.596096515655518, + "objective/non_score_reward": -0.2298048436641693, + "objective/rlhf_reward": -1.3788288980722427, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.204251527786255, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4091796875, + "step": 1113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996941089630127 + }, + { + "episode": 26760, + "epoch": 0.05344453631472361, + "loss/policy_avg": 0.08032204210758209, + "lr": 2.6796395705521472e-06, + "objective/entropy": 78.4005355834961, + "objective/kl": 5.9305033683776855, + "objective/non_score_reward": -0.29652518033981323, + "objective/rlhf_reward": -1.7791510671377182, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.47076940536499, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3603515625, + "step": 1114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.993044376373291 + }, + { + "episode": 26784, + "epoch": 0.05349246863428839, + "loss/policy_avg": -0.030530108138918877, + "lr": 2.6793519938650306e-06, + "objective/entropy": 106.04249572753906, + "objective/kl": 6.004731178283691, + "objective/non_score_reward": -0.3002365827560425, + "objective/rlhf_reward": -1.8014194937422872, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.333747148513794, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.45703125, + "step": 1115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997634887695312 + }, + { + "episode": 26808, + "epoch": 0.05354040095385316, + "loss/policy_avg": 0.02031274512410164, + "lr": 2.679064417177914e-06, + "objective/entropy": 91.51631927490234, + "objective/kl": 5.170792579650879, + "objective/non_score_reward": -0.2585396468639374, + "objective/rlhf_reward": 1.4487621933221817, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.692028045654297, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4306640625, + "step": 1116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001706123352051 + }, + { + "episode": 26832, + "epoch": 0.053588333273417936, + "loss/policy_avg": 0.01536665391176939, + "lr": 2.678776840490798e-06, + "objective/entropy": 106.1983642578125, + "objective/kl": 4.398663520812988, + "objective/non_score_reward": -0.2199331521987915, + "objective/rlhf_reward": 2.4659796156865768, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.739922046661377, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4677734375, + "step": 1117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988198280334473 + }, + { + "episode": 26856, + "epoch": 0.05363626559298271, + "loss/policy_avg": 0.07479061186313629, + "lr": 2.6784892638036813e-06, + "objective/entropy": 95.84239196777344, + "objective/kl": 6.744388103485107, + "objective/non_score_reward": -0.3372194170951843, + "objective/rlhf_reward": -2.023316316306591, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.844968795776367, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4482421875, + "step": 1118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9939875602722168 + }, + { + "episode": 26880, + "epoch": 0.053684197912547485, + "loss/policy_avg": -0.03654705733060837, + "lr": 2.6782016871165644e-06, + "objective/entropy": 88.05001068115234, + "objective/kl": 4.1610236167907715, + "objective/non_score_reward": -0.2080511748790741, + "objective/rlhf_reward": 1.7516929656267166, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.5429580211639404, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.3876953125, + "step": 1119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001499652862549 + }, + { + "episode": 26904, + "epoch": 0.053732130232112256, + "loss/policy_avg": 0.0602584034204483, + "lr": 2.6779141104294478e-06, + "objective/entropy": 94.37205505371094, + "objective/kl": 4.892537593841553, + "objective/non_score_reward": -0.24462687969207764, + "objective/rlhf_reward": 0.26662773190892564, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.0389404296875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4130859375, + "step": 1120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975436925888062 + }, + { + "episode": 26928, + "epoch": 0.053780062551677034, + "loss/policy_avg": 0.03952522948384285, + "lr": 2.677626533742331e-06, + "objective/entropy": 114.91151428222656, + "objective/kl": 4.518158912658691, + "objective/non_score_reward": -0.22590792179107666, + "objective/rlhf_reward": -1.3554475903511047, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.820559024810791, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4951171875, + "step": 1121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984760284423828 + }, + { + "episode": 26952, + "epoch": 0.053827994871241805, + "loss/policy_avg": 0.05496615916490555, + "lr": 2.6773389570552146e-06, + "objective/entropy": 85.8456039428711, + "objective/kl": 5.286839485168457, + "objective/non_score_reward": -0.26434198021888733, + "objective/rlhf_reward": -1.586051806807518, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.544083595275879, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.361328125, + "step": 1122, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993293285369873 + }, + { + "episode": 26976, + "epoch": 0.05387592719080658, + "loss/policy_avg": 0.06733623892068863, + "lr": 2.677051380368098e-06, + "objective/entropy": 91.79418182373047, + "objective/kl": 2.3960418701171875, + "objective/non_score_reward": -0.1198020800948143, + "objective/rlhf_reward": 1.0873674934150013, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 9.045648574829102, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.41796875, + "step": 1123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9932656288146973 + }, + { + "episode": 27000, + "epoch": 0.053923859510371354, + "loss/policy_avg": 0.0771905928850174, + "lr": 2.6767638036809815e-06, + "objective/entropy": 63.82981872558594, + "objective/kl": 4.332212448120117, + "objective/non_score_reward": -0.2166105955839157, + "objective/rlhf_reward": -1.2996636182069778, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.904680013656616, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.376953125, + "step": 1124, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001800060272217 + }, + { + "episode": 27024, + "epoch": 0.05397179182993613, + "loss/policy_avg": -0.03417348861694336, + "lr": 2.676476226993865e-06, + "objective/entropy": 97.75364685058594, + "objective/kl": 6.129705429077148, + "objective/non_score_reward": -0.30648529529571533, + "objective/rlhf_reward": 0.2983313508738411, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.262725830078125, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.42578125, + "step": 1125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999388456344604 + }, + { + "episode": 27048, + "epoch": 0.0540197241495009, + "loss/policy_avg": -0.025505676865577698, + "lr": 2.6761886503067483e-06, + "objective/entropy": 106.66728210449219, + "objective/kl": 3.0582921504974365, + "objective/non_score_reward": -0.1529145985841751, + "objective/rlhf_reward": -0.917487621307373, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.767441987991333, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4619140625, + "step": 1126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 27072, + "epoch": 0.05406765646906568, + "loss/policy_avg": 0.17864881455898285, + "lr": 2.675901073619632e-06, + "objective/entropy": 120.01412200927734, + "objective/kl": 3.30631422996521, + "objective/non_score_reward": -0.16531570255756378, + "objective/rlhf_reward": -0.9918941929936409, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7364017963409424, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53125, + "step": 1127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0026094913482666 + }, + { + "episode": 27096, + "epoch": 0.05411558878863045, + "loss/policy_avg": 0.0003708861768245697, + "lr": 2.6756134969325156e-06, + "objective/entropy": 95.44573974609375, + "objective/kl": 6.2458014488220215, + "objective/non_score_reward": -0.3122900724411011, + "objective/rlhf_reward": 1.1262594982981682, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.6454837322235107, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4150390625, + "step": 1128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002362728118896 + }, + { + "episode": 27120, + "epoch": 0.05416352110819523, + "loss/policy_avg": 0.06501474231481552, + "lr": 2.675325920245399e-06, + "objective/entropy": 93.83169555664062, + "objective/kl": 4.886845111846924, + "objective/non_score_reward": -0.24434226751327515, + "objective/rlhf_reward": 0.533946432173252, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.5892629623413086, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4072265625, + "step": 1129, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007100105285645 + }, + { + "episode": 27144, + "epoch": 0.05421145342776, + "loss/policy_avg": 0.028068795800209045, + "lr": 2.6750383435582825e-06, + "objective/entropy": 96.39513397216797, + "objective/kl": 8.28425121307373, + "objective/non_score_reward": -0.41421255469322205, + "objective/rlhf_reward": -2.48527529835701, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.161108016967773, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4091796875, + "step": 1130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995183944702148 + }, + { + "episode": 27168, + "epoch": 0.05425938574732478, + "loss/policy_avg": 0.059878990054130554, + "lr": 2.674750766871166e-06, + "objective/entropy": 105.95393371582031, + "objective/kl": 8.452995300292969, + "objective/non_score_reward": -0.4226498007774353, + "objective/rlhf_reward": -2.5358985662460327, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.729820251464844, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4970703125, + "step": 1131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985740184783936 + }, + { + "episode": 27192, + "epoch": 0.05430731806688955, + "loss/policy_avg": 0.11399880051612854, + "lr": 2.6744631901840493e-06, + "objective/entropy": 90.37095642089844, + "objective/kl": 2.693025588989258, + "objective/non_score_reward": -0.1346512883901596, + "objective/rlhf_reward": -0.8079077266156673, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9193320274353027, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3759765625, + "step": 1132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996883869171143 + }, + { + "episode": 27216, + "epoch": 0.05435525038645433, + "loss/policy_avg": 0.0996660590171814, + "lr": 2.6741756134969327e-06, + "objective/entropy": 89.13509368896484, + "objective/kl": 6.152263164520264, + "objective/non_score_reward": -0.30761316418647766, + "objective/rlhf_reward": 0.4754378433872226, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.595724105834961, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3896484375, + "step": 1133, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000275135040283 + }, + { + "episode": 27240, + "epoch": 0.0544031827060191, + "loss/policy_avg": 0.01150241307914257, + "lr": 2.6738880368098157e-06, + "objective/entropy": 74.48603057861328, + "objective/kl": 3.3848493099212646, + "objective/non_score_reward": -0.1692424714565277, + "objective/rlhf_reward": 0.8773345735362377, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.069715976715088, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3564453125, + "step": 1134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001410961151123 + }, + { + "episode": 27264, + "epoch": 0.05445111502558388, + "loss/policy_avg": 0.03166067600250244, + "lr": 2.673600460122699e-06, + "objective/entropy": 93.8587875366211, + "objective/kl": 3.831500768661499, + "objective/non_score_reward": -0.1915750503540039, + "objective/rlhf_reward": 0.6567296793104443, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.58781099319458, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4267578125, + "step": 1135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998103141784668 + }, + { + "episode": 27288, + "epoch": 0.05449904734514865, + "loss/policy_avg": 0.01161002367734909, + "lr": 2.6733128834355826e-06, + "objective/entropy": 89.36381530761719, + "objective/kl": 4.230252265930176, + "objective/non_score_reward": -0.21151261031627655, + "objective/rlhf_reward": 0.8681674309481514, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.5447715520858765, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4501953125, + "step": 1136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001729965209961 + }, + { + "episode": 27312, + "epoch": 0.054546979664713426, + "loss/policy_avg": 0.08293555676937103, + "lr": 2.6730253067484664e-06, + "objective/entropy": 83.92723083496094, + "objective/kl": 6.325356960296631, + "objective/non_score_reward": -0.31626784801483154, + "objective/rlhf_reward": 1.8879714482409171, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.28829288482666, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.41796875, + "step": 1137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998842477798462 + }, + { + "episode": 27336, + "epoch": 0.0545949119842782, + "loss/policy_avg": 0.04554343223571777, + "lr": 2.67273773006135e-06, + "objective/entropy": 97.24337005615234, + "objective/kl": 5.163900852203369, + "objective/non_score_reward": -0.25819501280784607, + "objective/rlhf_reward": 1.0348892715932818, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.4818319082260132, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.43359375, + "step": 1138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024900436401367 + }, + { + "episode": 27360, + "epoch": 0.054642844303842975, + "loss/policy_avg": 0.10421346127986908, + "lr": 2.6724501533742333e-06, + "objective/entropy": 121.57249450683594, + "objective/kl": 4.407256603240967, + "objective/non_score_reward": -0.2203628122806549, + "objective/rlhf_reward": -1.3221768364310265, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7852377891540527, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.521484375, + "step": 1139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990224838256836 + }, + { + "episode": 27384, + "epoch": 0.054690776623407746, + "loss/policy_avg": 0.0725453570485115, + "lr": 2.6721625766871167e-06, + "objective/entropy": 113.88622283935547, + "objective/kl": 2.6006157398223877, + "objective/non_score_reward": -0.1300307959318161, + "objective/rlhf_reward": -0.7801847793161869, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.894242286682129, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.546875, + "step": 1140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997913122177124 + }, + { + "episode": 27408, + "epoch": 0.054738708942972523, + "loss/policy_avg": 0.10692949593067169, + "lr": 2.671875e-06, + "objective/entropy": 80.08686828613281, + "objective/kl": 5.211799144744873, + "objective/non_score_reward": -0.26058995723724365, + "objective/rlhf_reward": 0.7575771223355297, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.168432235717773, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3564453125, + "step": 1141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987369775772095 + }, + { + "episode": 27432, + "epoch": 0.054786641262537294, + "loss/policy_avg": 0.11978423595428467, + "lr": 2.6715874233128836e-06, + "objective/entropy": 100.02444458007812, + "objective/kl": 4.100367546081543, + "objective/non_score_reward": -0.20501834154129028, + "objective/rlhf_reward": 4.769889973104, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.5303272008895874, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4306640625, + "step": 1142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010287761688232 + }, + { + "episode": 27456, + "epoch": 0.05483457358210207, + "loss/policy_avg": 0.13152985274791718, + "lr": 2.671299846625767e-06, + "objective/entropy": 109.86874389648438, + "objective/kl": 5.124700546264648, + "objective/non_score_reward": -0.2562350034713745, + "objective/rlhf_reward": -1.53741005808115, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.635559320449829, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.484375, + "step": 1143, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994277954101562 + }, + { + "episode": 27480, + "epoch": 0.05488250590166684, + "loss/policy_avg": -0.01842358522117138, + "lr": 2.6710122699386504e-06, + "objective/entropy": 97.14021301269531, + "objective/kl": 7.593907356262207, + "objective/non_score_reward": -0.3796953856945038, + "objective/rlhf_reward": -2.2781722843647003, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.597567081451416, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 1144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991778135299683 + }, + { + "episode": 27504, + "epoch": 0.05493043822123162, + "loss/policy_avg": 0.027250809594988823, + "lr": 2.670724693251534e-06, + "objective/entropy": 86.82876586914062, + "objective/kl": 5.854022026062012, + "objective/non_score_reward": -0.2927010953426361, + "objective/rlhf_reward": 2.0293720238787345, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.9635722637176514, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.396484375, + "step": 1145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981366395950317 + }, + { + "episode": 27528, + "epoch": 0.05497837054079639, + "loss/policy_avg": 0.007476774975657463, + "lr": 2.6704371165644173e-06, + "objective/entropy": 125.54287719726562, + "objective/kl": 2.6274185180664062, + "objective/non_score_reward": -0.1313709318637848, + "objective/rlhf_reward": -0.7882255539298058, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.637908458709717, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.568359375, + "step": 1146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0001416206359863 + }, + { + "episode": 27552, + "epoch": 0.05502630286036117, + "loss/policy_avg": 0.005840647965669632, + "lr": 2.6701495398773007e-06, + "objective/entropy": 80.70919036865234, + "objective/kl": 2.5856428146362305, + "objective/non_score_reward": -0.129282146692276, + "objective/rlhf_reward": 3.009885674802702, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.64048171043396, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4150390625, + "step": 1147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001910924911499 + }, + { + "episode": 27576, + "epoch": 0.05507423517992594, + "loss/policy_avg": -0.020214717835187912, + "lr": 2.669861963190184e-06, + "objective/entropy": 90.07586669921875, + "objective/kl": 5.335107326507568, + "objective/non_score_reward": -0.2667553424835205, + "objective/rlhf_reward": 0.13385679497278558, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 7.320379257202148, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4169921875, + "step": 1148, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000408172607422 + }, + { + "episode": 27600, + "epoch": 0.05512216749949072, + "loss/policy_avg": 0.039229054003953934, + "lr": 2.6695743865030676e-06, + "objective/entropy": 83.39730834960938, + "objective/kl": 5.491415977478027, + "objective/non_score_reward": -0.2745707929134369, + "objective/rlhf_reward": 4.352575302124023, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.4964402914047241, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3720703125, + "step": 1149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014398097991943 + }, + { + "episode": 27624, + "epoch": 0.05517009981905549, + "loss/policy_avg": 0.006023553665727377, + "lr": 2.669286809815951e-06, + "objective/entropy": 81.18807983398438, + "objective/kl": 4.099208354949951, + "objective/non_score_reward": -0.204960435628891, + "objective/rlhf_reward": 0.9074805982817543, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.8608381748199463, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.408203125, + "step": 1150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9991472959518433 + }, + { + "episode": 27648, + "epoch": 0.05521803213862027, + "loss/policy_avg": 0.03251512348651886, + "lr": 2.6689992331288344e-06, + "objective/entropy": 112.71952819824219, + "objective/kl": 3.425093412399292, + "objective/non_score_reward": -0.17125467956066132, + "objective/rlhf_reward": -1.0275279842317104, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.531571388244629, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.607421875, + "step": 1151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9997931718826294 + }, + { + "episode": 27672, + "epoch": 0.05526596445818504, + "loss/policy_avg": 0.01777615025639534, + "lr": 2.668711656441718e-06, + "objective/entropy": 112.26338195800781, + "objective/kl": 4.395780563354492, + "objective/non_score_reward": -0.21978901326656342, + "objective/rlhf_reward": 0.6812659278512, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.8384833335876465, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.49609375, + "step": 1152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981086254119873 + }, + { + "episode": 27696, + "epoch": 0.05531389677774982, + "loss/policy_avg": 0.07646454870700836, + "lr": 2.6684240797546013e-06, + "objective/entropy": 146.47323608398438, + "objective/kl": 3.389132261276245, + "objective/non_score_reward": -0.1694566309452057, + "objective/rlhf_reward": 1.5673196801157685, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.5567519664764404, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.654296875, + "step": 1153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977970123291016 + }, + { + "episode": 27720, + "epoch": 0.055361829097314595, + "loss/policy_avg": 0.1364010125398636, + "lr": 2.6681365030674847e-06, + "objective/entropy": 107.58038330078125, + "objective/kl": 1.6906406879425049, + "objective/non_score_reward": -0.08453203737735748, + "objective/rlhf_reward": 1.492807798087597, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.7190680503845215, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4970703125, + "step": 1154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996492862701416 + }, + { + "episode": 27744, + "epoch": 0.055409761416879366, + "loss/policy_avg": 0.08620058000087738, + "lr": 2.667848926380368e-06, + "objective/entropy": 96.89375305175781, + "objective/kl": 3.417637348175049, + "objective/non_score_reward": -0.17088185250759125, + "objective/rlhf_reward": 0.7090978130594574, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.964858293533325, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4560546875, + "step": 1155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973666667938232 + }, + { + "episode": 27768, + "epoch": 0.055457693736444144, + "loss/policy_avg": 0.059395574033260345, + "lr": 2.6675613496932515e-06, + "objective/entropy": 83.75469970703125, + "objective/kl": 5.590975761413574, + "objective/non_score_reward": -0.2795487940311432, + "objective/rlhf_reward": -1.677292674779892, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.860408306121826, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3759765625, + "step": 1156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99812912940979 + }, + { + "episode": 27792, + "epoch": 0.055505626056008915, + "loss/policy_avg": 0.058242298662662506, + "lr": 2.667273773006135e-06, + "objective/entropy": 88.4250717163086, + "objective/kl": 4.7617998123168945, + "objective/non_score_reward": -0.23808996379375458, + "objective/rlhf_reward": 0.46424945560010333, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.7721972465515137, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3916015625, + "step": 1157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985663890838623 + }, + { + "episode": 27816, + "epoch": 0.05555355837557369, + "loss/policy_avg": 0.007366602774709463, + "lr": 2.6669861963190184e-06, + "objective/entropy": 80.23500061035156, + "objective/kl": 4.870950698852539, + "objective/non_score_reward": -0.24354752898216248, + "objective/rlhf_reward": -1.4612852036952972, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9633209705352783, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.41796875, + "step": 1158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9998345375061035 + }, + { + "episode": 27840, + "epoch": 0.055601490695138464, + "loss/policy_avg": 0.04662128537893295, + "lr": 2.666698619631902e-06, + "objective/entropy": 77.71585083007812, + "objective/kl": 6.85030460357666, + "objective/non_score_reward": -0.342515230178833, + "objective/rlhf_reward": 0.2660255219388965, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 9.878190994262695, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.341796875, + "step": 1159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.995159387588501 + }, + { + "episode": 27864, + "epoch": 0.05564942301470324, + "loss/policy_avg": 0.04583994299173355, + "lr": 2.6664110429447852e-06, + "objective/entropy": 99.98128509521484, + "objective/kl": 4.004051208496094, + "objective/non_score_reward": -0.2002025544643402, + "objective/rlhf_reward": 2.5843632989508323, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.163872718811035, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.458984375, + "step": 1160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980669021606445 + }, + { + "episode": 27888, + "epoch": 0.05569735533426801, + "loss/policy_avg": 0.03873520344495773, + "lr": 2.666123466257669e-06, + "objective/entropy": 92.34793090820312, + "objective/kl": 4.986799240112305, + "objective/non_score_reward": -0.2493399828672409, + "objective/rlhf_reward": 4.503960117697716, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.5075056552886963, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4091796875, + "step": 1161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001192569732666 + }, + { + "episode": 27912, + "epoch": 0.05574528765383279, + "loss/policy_avg": -0.005000388249754906, + "lr": 2.6658358895705525e-06, + "objective/entropy": 78.02603912353516, + "objective/kl": 5.831294059753418, + "objective/non_score_reward": -0.2915647029876709, + "objective/rlhf_reward": 0.5717288042951587, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.266764163970947, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3740234375, + "step": 1162, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990018606185913 + }, + { + "episode": 27936, + "epoch": 0.05579321997339756, + "loss/policy_avg": 0.01324117835611105, + "lr": 2.665548312883436e-06, + "objective/entropy": 87.77718353271484, + "objective/kl": 5.045506000518799, + "objective/non_score_reward": -0.2522753179073334, + "objective/rlhf_reward": -1.5136517211794853, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.577069878578186, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.435546875, + "step": 1163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0017969608306885 + }, + { + "episode": 27960, + "epoch": 0.05584115229296234, + "loss/policy_avg": 0.024114342406392097, + "lr": 2.665260736196319e-06, + "objective/entropy": 88.68128967285156, + "objective/kl": 4.947149753570557, + "objective/non_score_reward": -0.24735748767852783, + "objective/rlhf_reward": -1.484144801273942, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6320457458496094, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3994140625, + "step": 1164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001357078552246 + }, + { + "episode": 27984, + "epoch": 0.05588908461252711, + "loss/policy_avg": 0.027663497254252434, + "lr": 2.6649731595092024e-06, + "objective/entropy": 78.7529525756836, + "objective/kl": 4.376445293426514, + "objective/non_score_reward": -0.21882224082946777, + "objective/rlhf_reward": -1.3129335045814514, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5460267066955566, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.341796875, + "step": 1165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002506971359253 + }, + { + "episode": 28008, + "epoch": 0.05593701693209189, + "loss/policy_avg": 0.14389236271381378, + "lr": 2.664685582822086e-06, + "objective/entropy": 97.23067474365234, + "objective/kl": 2.5956039428710938, + "objective/non_score_reward": -0.12978020310401917, + "objective/rlhf_reward": -0.7786810956895351, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5695700645446777, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.431640625, + "step": 1166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000123977661133 + }, + { + "episode": 28032, + "epoch": 0.05598494925165666, + "loss/policy_avg": 0.15681564807891846, + "lr": 2.6643980061349692e-06, + "objective/entropy": 87.3509521484375, + "objective/kl": 5.827916622161865, + "objective/non_score_reward": -0.29139578342437744, + "objective/rlhf_reward": -1.7483748346567154, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.654561996459961, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4501953125, + "step": 1167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9975271224975586 + }, + { + "episode": 28056, + "epoch": 0.05603288157122144, + "loss/policy_avg": 0.06072466820478439, + "lr": 2.6641104294478526e-06, + "objective/entropy": 120.56087493896484, + "objective/kl": 4.093770980834961, + "objective/non_score_reward": -0.20468854904174805, + "objective/rlhf_reward": -1.2281312718987465, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.832676649093628, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.53515625, + "step": 1168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003864288330078 + }, + { + "episode": 28080, + "epoch": 0.05608081389078621, + "loss/policy_avg": 0.0215420201420784, + "lr": 2.663822852760736e-06, + "objective/entropy": 85.42818450927734, + "objective/kl": 6.5486907958984375, + "objective/non_score_reward": -0.3274345397949219, + "objective/rlhf_reward": -1.96460722386837, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.732733964920044, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.431640625, + "step": 1169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9999141693115234 + }, + { + "episode": 28104, + "epoch": 0.05612874621035099, + "loss/policy_avg": -0.0006338693201541901, + "lr": 2.6635352760736195e-06, + "objective/entropy": 125.00828552246094, + "objective/kl": 4.396636009216309, + "objective/non_score_reward": -0.21983179450035095, + "objective/rlhf_reward": -1.318990807980299, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.203152060508728, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.515625, + "step": 1170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0049400329589844 + }, + { + "episode": 28128, + "epoch": 0.05617667852991576, + "loss/policy_avg": 0.04888657480478287, + "lr": 2.6632476993865034e-06, + "objective/entropy": 130.97093200683594, + "objective/kl": 4.540463447570801, + "objective/non_score_reward": -0.2270231544971466, + "objective/rlhf_reward": -1.3621388748288155, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8380043506622314, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 1171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0028505325317383 + }, + { + "episode": 28152, + "epoch": 0.056224610849480536, + "loss/policy_avg": 0.10968983918428421, + "lr": 2.6629601226993868e-06, + "objective/entropy": 98.93994140625, + "objective/kl": 7.529016017913818, + "objective/non_score_reward": -0.3764508366584778, + "objective/rlhf_reward": 0.32535455200690955, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.66267204284668, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.404296875, + "step": 1172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980343580245972 + }, + { + "episode": 28176, + "epoch": 0.05627254316904531, + "loss/policy_avg": -0.009219329804182053, + "lr": 2.66267254601227e-06, + "objective/entropy": 94.8162841796875, + "objective/kl": 5.397879600524902, + "objective/non_score_reward": -0.26989397406578064, + "objective/rlhf_reward": 0.7017530884195331, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 5.415735721588135, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.42578125, + "step": 1173, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980649948120117 + }, + { + "episode": 28200, + "epoch": 0.056320475488610085, + "loss/policy_avg": 0.11069285869598389, + "lr": 2.6623849693251536e-06, + "objective/entropy": 137.881591796875, + "objective/kl": 3.291466474533081, + "objective/non_score_reward": -0.16457334160804749, + "objective/rlhf_reward": 1.1498031456430091, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.7789411544799805, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.65234375, + "step": 1174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0020415782928467 + }, + { + "episode": 28224, + "epoch": 0.056368407808174856, + "loss/policy_avg": 0.1428847461938858, + "lr": 2.662097392638037e-06, + "objective/entropy": 98.12935638427734, + "objective/kl": 3.938546657562256, + "objective/non_score_reward": -0.19692735373973846, + "objective/rlhf_reward": 2.604014421342056, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.0037732124328613, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.412109375, + "step": 1175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001410722732544 + }, + { + "episode": 28248, + "epoch": 0.056416340127739634, + "loss/policy_avg": 0.01627153344452381, + "lr": 2.6618098159509205e-06, + "objective/entropy": 64.60018157958984, + "objective/kl": 4.74695348739624, + "objective/non_score_reward": -0.23734769225120544, + "objective/rlhf_reward": 1.575914017856121, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.1768012046813965, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.37890625, + "step": 1176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9973962306976318 + }, + { + "episode": 28272, + "epoch": 0.056464272447304405, + "loss/policy_avg": -0.03618495538830757, + "lr": 2.661522239263804e-06, + "objective/entropy": 119.80192565917969, + "objective/kl": 5.215415000915527, + "objective/non_score_reward": -0.2607707679271698, + "objective/rlhf_reward": 0.43537549674510945, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.9520022869110107, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.6640625, + "step": 1177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0018014907836914 + }, + { + "episode": 28296, + "epoch": 0.05651220476686918, + "loss/policy_avg": -0.012625349685549736, + "lr": 2.6612346625766873e-06, + "objective/entropy": 112.26144409179688, + "objective/kl": 2.1360063552856445, + "objective/non_score_reward": -0.10680033266544342, + "objective/rlhf_reward": -0.6408019531518221, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6824402809143066, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.52734375, + "step": 1178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000779390335083 + }, + { + "episode": 28320, + "epoch": 0.056560137086433954, + "loss/policy_avg": -0.03699297457933426, + "lr": 2.6609470858895703e-06, + "objective/entropy": 97.54267883300781, + "objective/kl": 4.99289608001709, + "objective/non_score_reward": -0.24964480102062225, + "objective/rlhf_reward": -1.4978688601404428, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.7862565517425537, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4638671875, + "step": 1179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982776641845703 + }, + { + "episode": 28344, + "epoch": 0.05660806940599873, + "loss/policy_avg": 0.05514891445636749, + "lr": 2.6606595092024538e-06, + "objective/entropy": 102.04390716552734, + "objective/kl": 6.983734130859375, + "objective/non_score_reward": -0.3491867184638977, + "objective/rlhf_reward": -2.095120094716549, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.2256221771240234, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.462890625, + "step": 1180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999751329421997 + }, + { + "episode": 28368, + "epoch": 0.0566560017255635, + "loss/policy_avg": 0.05543017387390137, + "lr": 2.660371932515337e-06, + "objective/entropy": 93.04561614990234, + "objective/kl": 4.886442184448242, + "objective/non_score_reward": -0.24432210624217987, + "objective/rlhf_reward": -1.465932548046112, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3926234245300293, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.41796875, + "step": 1181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993884563446045 + }, + { + "episode": 28392, + "epoch": 0.05670393404512828, + "loss/policy_avg": 0.06946446001529694, + "lr": 2.660084355828221e-06, + "objective/entropy": 134.31735229492188, + "objective/kl": 4.089544296264648, + "objective/non_score_reward": -0.2044772356748581, + "objective/rlhf_reward": 1.7731365822255611, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.7681013345718384, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.630859375, + "step": 1182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000748634338379 + }, + { + "episode": 28416, + "epoch": 0.05675186636469305, + "loss/policy_avg": 0.09049968421459198, + "lr": 2.6597967791411045e-06, + "objective/entropy": 100.19247436523438, + "objective/kl": 2.641049861907959, + "objective/non_score_reward": -0.13205251097679138, + "objective/rlhf_reward": -0.7923149541020393, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.2779417037963867, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.662109375, + "step": 1183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0003724098205566 + }, + { + "episode": 28440, + "epoch": 0.05679979868425783, + "loss/policy_avg": 0.0019521701615303755, + "lr": 2.659509202453988e-06, + "objective/entropy": 115.8849105834961, + "objective/kl": 4.649982929229736, + "objective/non_score_reward": -0.2324991673231125, + "objective/rlhf_reward": 4.605005130171776, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.305290699005127, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.556640625, + "step": 1184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0009500980377197 + }, + { + "episode": 28464, + "epoch": 0.0568477310038226, + "loss/policy_avg": -0.04213211312890053, + "lr": 2.6592216257668713e-06, + "objective/entropy": 100.4990234375, + "objective/kl": 7.465394973754883, + "objective/non_score_reward": -0.3732697367668152, + "objective/rlhf_reward": -2.2396183907985687, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.749096870422363, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4921875, + "step": 1185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972479343414307 + }, + { + "episode": 28488, + "epoch": 0.05689566332338738, + "loss/policy_avg": -0.013026927597820759, + "lr": 2.6589340490797547e-06, + "objective/entropy": 91.71714782714844, + "objective/kl": 2.4615631103515625, + "objective/non_score_reward": -0.12307815253734589, + "objective/rlhf_reward": 1.2615310922265053, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 1.2195324897766113, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.400390625, + "step": 1186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0056614875793457 + }, + { + "episode": 28512, + "epoch": 0.05694359564295215, + "loss/policy_avg": 0.06713936477899551, + "lr": 2.658646472392638e-06, + "objective/entropy": 109.15403747558594, + "objective/kl": 2.895627737045288, + "objective/non_score_reward": -0.14478138089179993, + "objective/rlhf_reward": -0.8686882518231869, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3260698318481445, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.45703125, + "step": 1187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001437187194824 + }, + { + "episode": 28536, + "epoch": 0.05699152796251693, + "loss/policy_avg": 0.024431779980659485, + "lr": 2.6583588957055216e-06, + "objective/entropy": 93.95329284667969, + "objective/kl": 1.7340772151947021, + "objective/non_score_reward": -0.08670388162136078, + "objective/rlhf_reward": -0.5202232152223587, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.378361701965332, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.41015625, + "step": 1188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965698719024658 + }, + { + "episode": 28560, + "epoch": 0.0570394602820817, + "loss/policy_avg": 0.0604439377784729, + "lr": 2.658071319018405e-06, + "objective/entropy": 94.61737060546875, + "objective/kl": 5.3064165115356445, + "objective/non_score_reward": -0.2653208374977112, + "objective/rlhf_reward": -1.5919249951839447, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.421147346496582, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4150390625, + "step": 1189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984915256500244 + }, + { + "episode": 28584, + "epoch": 0.057087392601646476, + "loss/policy_avg": 0.07448774576187134, + "lr": 2.6577837423312884e-06, + "objective/entropy": 98.87156677246094, + "objective/kl": 5.316677093505859, + "objective/non_score_reward": -0.26583385467529297, + "objective/rlhf_reward": -1.5950031019747257, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.412245750427246, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4423828125, + "step": 1190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9949787855148315 + }, + { + "episode": 28608, + "epoch": 0.05713532492121125, + "loss/policy_avg": 0.030409488826990128, + "lr": 2.657496165644172e-06, + "objective/entropy": 88.15553283691406, + "objective/kl": 6.066204071044922, + "objective/non_score_reward": -0.30331021547317505, + "objective/rlhf_reward": 1.9657171242815665, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.10440731048584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3994140625, + "step": 1191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0001344680786133 + }, + { + "episode": 28632, + "epoch": 0.057183257240776025, + "loss/policy_avg": 0.03474830090999603, + "lr": 2.6572085889570553e-06, + "objective/entropy": 75.75563049316406, + "objective/kl": 7.164586544036865, + "objective/non_score_reward": -0.3582293391227722, + "objective/rlhf_reward": -2.14937587082386, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.1649770736694336, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3603515625, + "step": 1192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014123916625977 + }, + { + "episode": 28656, + "epoch": 0.057231189560340796, + "loss/policy_avg": 0.02136962115764618, + "lr": 2.6569210122699387e-06, + "objective/entropy": 92.1910400390625, + "objective/kl": 4.985260963439941, + "objective/non_score_reward": -0.24926301836967468, + "objective/rlhf_reward": -1.4955781251192093, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.114431858062744, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.408203125, + "step": 1193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999223232269287 + }, + { + "episode": 28680, + "epoch": 0.057279121879905574, + "loss/policy_avg": 0.06472158432006836, + "lr": 2.656633435582822e-06, + "objective/entropy": 94.45437622070312, + "objective/kl": 5.873802185058594, + "objective/non_score_reward": -0.29369014501571655, + "objective/rlhf_reward": 1.2378591299057007, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.3117523193359375, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4130859375, + "step": 1194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998021125793457 + }, + { + "episode": 28704, + "epoch": 0.057327054199470345, + "loss/policy_avg": 0.061276551336050034, + "lr": 2.6563458588957056e-06, + "objective/entropy": 100.33500671386719, + "objective/kl": 6.152198791503906, + "objective/non_score_reward": -0.3076099157333374, + "objective/rlhf_reward": -1.8456595912575722, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.522089958190918, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.43359375, + "step": 1195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990705251693726 + }, + { + "episode": 28728, + "epoch": 0.05737498651903512, + "loss/policy_avg": 0.27513980865478516, + "lr": 2.656058282208589e-06, + "objective/entropy": 91.89706420898438, + "objective/kl": 6.332284927368164, + "objective/non_score_reward": -0.3166142702102661, + "objective/rlhf_reward": -1.8996854051947594, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.526993274688721, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4541015625, + "step": 1196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9994009733200073 + }, + { + "episode": 28752, + "epoch": 0.057422918838599894, + "loss/policy_avg": 0.05305163562297821, + "lr": 2.6557707055214724e-06, + "objective/entropy": 99.92655944824219, + "objective/kl": 3.6894545555114746, + "objective/non_score_reward": -0.1844727247953415, + "objective/rlhf_reward": -1.1068362966179848, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.768837928771973, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4609375, + "step": 1197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970364570617676 + }, + { + "episode": 28776, + "epoch": 0.05747085115816467, + "loss/policy_avg": 0.05872901529073715, + "lr": 2.655483128834356e-06, + "objective/entropy": 102.54901123046875, + "objective/kl": 4.1227874755859375, + "objective/non_score_reward": -0.20613938570022583, + "objective/rlhf_reward": 0.9004067860950363, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.2446699142456055, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.603515625, + "step": 1198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9952796697616577 + }, + { + "episode": 28800, + "epoch": 0.05751878347772944, + "loss/policy_avg": 0.08320063352584839, + "lr": 2.6551955521472393e-06, + "objective/entropy": 120.05440521240234, + "objective/kl": 2.7737433910369873, + "objective/non_score_reward": -0.13868717849254608, + "objective/rlhf_reward": 1.3051200814951789, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.8109562397003174, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 1199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982588291168213 + }, + { + "episode": 28824, + "epoch": 0.05756671579729422, + "loss/policy_avg": 0.06002422422170639, + "lr": 2.6549079754601227e-06, + "objective/entropy": 89.57402801513672, + "objective/kl": 4.8851704597473145, + "objective/non_score_reward": -0.24425852298736572, + "objective/rlhf_reward": 2.3200273686033897, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 5.091567516326904, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.421875, + "step": 1200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99673593044281 + }, + { + "episode": 28848, + "epoch": 0.05761464811685899, + "loss/policy_avg": 0.017391083762049675, + "lr": 2.654620398773006e-06, + "objective/entropy": 85.96940612792969, + "objective/kl": 6.035163879394531, + "objective/non_score_reward": -0.3017582297325134, + "objective/rlhf_reward": -0.004369210696097858, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.0855910778045654, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 1201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002541542053223 + }, + { + "episode": 28872, + "epoch": 0.05766258043642377, + "loss/policy_avg": 0.026785043999552727, + "lr": 2.6543328220858896e-06, + "objective/entropy": 102.72750091552734, + "objective/kl": 4.55810546875, + "objective/non_score_reward": -0.2279052734375, + "objective/rlhf_reward": -1.3674316704273224, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.47432804107666, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4208984375, + "step": 1202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003695487976074 + }, + { + "episode": 28896, + "epoch": 0.05771051275598854, + "loss/policy_avg": 0.02931400015950203, + "lr": 2.654045245398773e-06, + "objective/entropy": 119.33341979980469, + "objective/kl": 5.620911598205566, + "objective/non_score_reward": -0.2810456156730652, + "objective/rlhf_reward": 1.3137264996767044, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.19083833694458, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.505859375, + "step": 1203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998694658279419 + }, + { + "episode": 28920, + "epoch": 0.05775844507555332, + "loss/policy_avg": 0.1632137894630432, + "lr": 2.6537576687116564e-06, + "objective/entropy": 76.3973159790039, + "objective/kl": 2.327392339706421, + "objective/non_score_reward": -0.1163695901632309, + "objective/rlhf_reward": -0.6982175596058369, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6114909648895264, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3203125, + "step": 1204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0020623207092285 + }, + { + "episode": 28944, + "epoch": 0.05780637739511809, + "loss/policy_avg": 0.05908843129873276, + "lr": 2.6534700920245403e-06, + "objective/entropy": 81.12391662597656, + "objective/kl": 5.749754905700684, + "objective/non_score_reward": -0.28748777508735657, + "objective/rlhf_reward": 0.08125342776787614, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.296062469482422, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3896484375, + "step": 1205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000792980194092 + }, + { + "episode": 28968, + "epoch": 0.05785430971468287, + "loss/policy_avg": 8.978372573852539, + "lr": 2.6531825153374237e-06, + "objective/entropy": 89.58897399902344, + "objective/kl": 7.408227443695068, + "objective/non_score_reward": -0.37041139602661133, + "objective/rlhf_reward": -2.222468487918377, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.139487266540527, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3662109375, + "step": 1206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0062217712402344 + }, + { + "episode": 28992, + "epoch": 0.05790224203424764, + "loss/policy_avg": 0.3653755187988281, + "lr": 2.652894938650307e-06, + "objective/entropy": 100.13902282714844, + "objective/kl": 2.825559139251709, + "objective/non_score_reward": -0.1412779688835144, + "objective/rlhf_reward": -0.8476677723228931, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.7101006507873535, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.525390625, + "step": 1207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971712827682495 + }, + { + "episode": 29016, + "epoch": 0.05795017435381242, + "loss/policy_avg": 0.044803570955991745, + "lr": 2.65260736196319e-06, + "objective/entropy": 85.99272155761719, + "objective/kl": 6.800496578216553, + "objective/non_score_reward": -0.34002482891082764, + "objective/rlhf_reward": -2.040148973464966, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4364218711853027, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3974609375, + "step": 1208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004234313964844 + }, + { + "episode": 29040, + "epoch": 0.05799810667337719, + "loss/policy_avg": -0.04962504655122757, + "lr": 2.6523197852760735e-06, + "objective/entropy": 89.02210998535156, + "objective/kl": 4.381872177124023, + "objective/non_score_reward": -0.21909360587596893, + "objective/rlhf_reward": 1.0065553161848548, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.812077045440674, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3798828125, + "step": 1209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001352548599243 + }, + { + "episode": 29064, + "epoch": 0.058046038992941966, + "loss/policy_avg": 0.11477906256914139, + "lr": 2.652032208588957e-06, + "objective/entropy": 112.53289031982422, + "objective/kl": 7.037132263183594, + "objective/non_score_reward": -0.35185664892196655, + "objective/rlhf_reward": 1.6744387620073966, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.303236961364746, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.521484375, + "step": 1210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989571571350098 + }, + { + "episode": 29088, + "epoch": 0.058093971312506744, + "loss/policy_avg": 0.023646986111998558, + "lr": 2.6517446319018404e-06, + "objective/entropy": 128.9694061279297, + "objective/kl": 4.763720512390137, + "objective/non_score_reward": -0.2381860464811325, + "objective/rlhf_reward": 1.1549431738616915, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.9897232055664062, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.560546875, + "step": 1211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994573593139648 + }, + { + "episode": 29112, + "epoch": 0.058141903632071515, + "loss/policy_avg": 0.03716292604804039, + "lr": 2.651457055214724e-06, + "objective/entropy": 112.16728210449219, + "objective/kl": 4.699753284454346, + "objective/non_score_reward": -0.23498766124248505, + "objective/rlhf_reward": -1.4099259749054909, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.0660219192504883, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.5, + "step": 1212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.997970461845398 + }, + { + "episode": 29136, + "epoch": 0.05818983595163629, + "loss/policy_avg": 0.18581008911132812, + "lr": 2.6511694785276072e-06, + "objective/entropy": 118.82444763183594, + "objective/kl": 3.8538529872894287, + "objective/non_score_reward": -0.19269266724586487, + "objective/rlhf_reward": -1.1561559587717056, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.2636746168136597, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.525390625, + "step": 1213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000020742416382 + }, + { + "episode": 29160, + "epoch": 0.058237768271201064, + "loss/policy_avg": 0.02174682356417179, + "lr": 2.6508819018404907e-06, + "objective/entropy": 89.56654357910156, + "objective/kl": 1.3962382078170776, + "objective/non_score_reward": -0.06981191784143448, + "objective/rlhf_reward": -0.4188714884221554, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.087489128112793, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4541015625, + "step": 1214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000943899154663 + }, + { + "episode": 29184, + "epoch": 0.05828570059076584, + "loss/policy_avg": 0.12123532593250275, + "lr": 2.650594325153374e-06, + "objective/entropy": 92.65281677246094, + "objective/kl": 4.150641918182373, + "objective/non_score_reward": -0.2075320929288864, + "objective/rlhf_reward": -1.2451925463974476, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5602774620056152, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4267578125, + "step": 1215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993728399276733 + }, + { + "episode": 29208, + "epoch": 0.05833363291033061, + "loss/policy_avg": -0.03289597108960152, + "lr": 2.650306748466258e-06, + "objective/entropy": 98.12457275390625, + "objective/kl": 5.346020698547363, + "objective/non_score_reward": -0.2673010528087616, + "objective/rlhf_reward": -1.6038061901926994, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.445807456970215, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4501953125, + "step": 1216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003800868988037 + }, + { + "episode": 29232, + "epoch": 0.05838156522989539, + "loss/policy_avg": 0.05858566612005234, + "lr": 2.6500191717791414e-06, + "objective/entropy": 109.56645202636719, + "objective/kl": 6.104514122009277, + "objective/non_score_reward": -0.305225670337677, + "objective/rlhf_reward": -0.02517407784449721, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 5.522624492645264, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4873046875, + "step": 1217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977205991744995 + }, + { + "episode": 29256, + "epoch": 0.05842949754946016, + "loss/policy_avg": 0.5560678839683533, + "lr": 2.649731595092025e-06, + "objective/entropy": 99.99008178710938, + "objective/kl": 6.790863037109375, + "objective/non_score_reward": -0.3395431637763977, + "objective/rlhf_reward": -2.0372589081525803, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.75335693359375, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4375, + "step": 1218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000133514404297 + }, + { + "episode": 29280, + "epoch": 0.05847742986902494, + "loss/policy_avg": 0.09920933842658997, + "lr": 2.6494440184049082e-06, + "objective/entropy": 108.1783676147461, + "objective/kl": 4.313632011413574, + "objective/non_score_reward": -0.21568159759044647, + "objective/rlhf_reward": 0.8431534998525513, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.5492777824401855, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4501953125, + "step": 1219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004942417144775 + }, + { + "episode": 29304, + "epoch": 0.05852536218858971, + "loss/policy_avg": 0.05826311931014061, + "lr": 2.6491564417177916e-06, + "objective/entropy": 82.17739868164062, + "objective/kl": 8.678319931030273, + "objective/non_score_reward": -0.4339159429073334, + "objective/rlhf_reward": 1.1820828416330031, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.383094787597656, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.400390625, + "step": 1220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974349737167358 + }, + { + "episode": 29328, + "epoch": 0.05857329450815449, + "loss/policy_avg": -0.014260968193411827, + "lr": 2.648868865030675e-06, + "objective/entropy": 84.69461059570312, + "objective/kl": 4.681558132171631, + "objective/non_score_reward": -0.23407790064811707, + "objective/rlhf_reward": -1.4044673964381218, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1119322776794434, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.373046875, + "step": 1221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992032051086426 + }, + { + "episode": 29352, + "epoch": 0.05862122682771926, + "loss/policy_avg": 0.032606713473796844, + "lr": 2.6485812883435585e-06, + "objective/entropy": 68.54422760009766, + "objective/kl": 5.44432258605957, + "objective/non_score_reward": -0.27221614122390747, + "objective/rlhf_reward": 1.3667031861841679, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.6263177394866943, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.31640625, + "step": 1222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999701976776123 + }, + { + "episode": 29376, + "epoch": 0.05866915914728404, + "loss/policy_avg": -0.0010087895207107067, + "lr": 2.6482937116564415e-06, + "objective/entropy": 100.46569061279297, + "objective/kl": 4.534698486328125, + "objective/non_score_reward": -0.226734921336174, + "objective/rlhf_reward": 0.9607073414672378, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.930721759796143, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.44140625, + "step": 1223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997867226600647 + }, + { + "episode": 29400, + "epoch": 0.05871709146684881, + "loss/policy_avg": -0.029738977551460266, + "lr": 2.648006134969325e-06, + "objective/entropy": 75.62324523925781, + "objective/kl": 6.660384654998779, + "objective/non_score_reward": -0.3330192267894745, + "objective/rlhf_reward": 1.0018846988677979, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.809900760650635, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3603515625, + "step": 1224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982290267944336 + }, + { + "episode": 29424, + "epoch": 0.05876502378641359, + "loss/policy_avg": 0.015697792172431946, + "lr": 2.6477185582822084e-06, + "objective/entropy": 92.45037078857422, + "objective/kl": 8.138596534729004, + "objective/non_score_reward": -0.4069298207759857, + "objective/rlhf_reward": -0.12046218555679289, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.7199394702911377, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4150390625, + "step": 1225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994384050369263 + }, + { + "episode": 29448, + "epoch": 0.05881295610597836, + "loss/policy_avg": 0.000506207812577486, + "lr": 2.647430981595092e-06, + "objective/entropy": 100.52769470214844, + "objective/kl": 5.975987911224365, + "objective/non_score_reward": -0.29879939556121826, + "objective/rlhf_reward": -1.7927963435649872, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.745494842529297, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4287109375, + "step": 1226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015039443969727 + }, + { + "episode": 29472, + "epoch": 0.058860888425543136, + "loss/policy_avg": 0.05871029943227768, + "lr": 2.6471434049079756e-06, + "objective/entropy": 102.02964782714844, + "objective/kl": 5.1560773849487305, + "objective/non_score_reward": -0.25780388712882996, + "objective/rlhf_reward": -1.5468232333660126, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.869965553283691, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4609375, + "step": 1227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966416358947754 + }, + { + "episode": 29496, + "epoch": 0.05890882074510791, + "loss/policy_avg": 0.06944174319505692, + "lr": 2.646855828220859e-06, + "objective/entropy": 88.83494567871094, + "objective/kl": 4.03519344329834, + "objective/non_score_reward": -0.20175963640213013, + "objective/rlhf_reward": 0.9266852297295464, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 1.4279921054840088, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4130859375, + "step": 1228, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014021396636963 + }, + { + "episode": 29520, + "epoch": 0.058956753064672685, + "loss/policy_avg": 0.007997616194188595, + "lr": 2.6465682515337425e-06, + "objective/entropy": 109.395263671875, + "objective/kl": 4.555168151855469, + "objective/non_score_reward": -0.22775839269161224, + "objective/rlhf_reward": 0.9545665170598987, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.7987277507781982, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.474609375, + "step": 1229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0000622272491455 + }, + { + "episode": 29544, + "epoch": 0.059004685384237456, + "loss/policy_avg": 0.25971540808677673, + "lr": 2.646280674846626e-06, + "objective/entropy": 104.27333068847656, + "objective/kl": 6.1825079917907715, + "objective/non_score_reward": -0.3091253936290741, + "objective/rlhf_reward": -1.85475230589509, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9198098182678223, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.478515625, + "step": 1230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019867420196533 + }, + { + "episode": 29568, + "epoch": 0.05905261770380223, + "loss/policy_avg": -0.009754132479429245, + "lr": 2.6459930981595093e-06, + "objective/entropy": 75.54130554199219, + "objective/kl": 2.526587724685669, + "objective/non_score_reward": -0.12632939219474792, + "objective/rlhf_reward": 1.5631405200410846, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.5232279300689697, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3271484375, + "step": 1231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019235610961914 + }, + { + "episode": 29592, + "epoch": 0.059100550023367004, + "loss/policy_avg": 0.054684095084667206, + "lr": 2.6457055214723928e-06, + "objective/entropy": 101.93836975097656, + "objective/kl": 4.381407737731934, + "objective/non_score_reward": -0.21907037496566772, + "objective/rlhf_reward": 0.6855777353048323, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.616822242736816, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.462890625, + "step": 1232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998699426651001 + }, + { + "episode": 29616, + "epoch": 0.05914848234293178, + "loss/policy_avg": 0.05777949094772339, + "lr": 2.645417944785276e-06, + "objective/entropy": 120.25897216796875, + "objective/kl": 4.3532233238220215, + "objective/non_score_reward": -0.21766118705272675, + "objective/rlhf_reward": 1.6940329000353813, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.632150173187256, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.53125, + "step": 1233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998938798904419 + }, + { + "episode": 29640, + "epoch": 0.05919641466249655, + "loss/policy_avg": -0.0014326106756925583, + "lr": 2.6451303680981596e-06, + "objective/entropy": 100.34651947021484, + "objective/kl": 4.791038513183594, + "objective/non_score_reward": -0.23955194652080536, + "objective/rlhf_reward": -1.437311664223671, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.691720485687256, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.431640625, + "step": 1234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006186962127686 + }, + { + "episode": 29664, + "epoch": 0.05924434698206133, + "loss/policy_avg": 0.05804755911231041, + "lr": 2.644842791411043e-06, + "objective/entropy": 111.99299621582031, + "objective/kl": 4.462949275970459, + "objective/non_score_reward": -0.223147451877594, + "objective/rlhf_reward": 4.66111521422863, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.0721529722213745, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.46875, + "step": 1235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003129005432129 + }, + { + "episode": 29688, + "epoch": 0.0592922793016261, + "loss/policy_avg": -0.009273736737668514, + "lr": 2.6445552147239265e-06, + "objective/entropy": 118.33258056640625, + "objective/kl": 6.019331932067871, + "objective/non_score_reward": -0.3009665906429291, + "objective/rlhf_reward": 0.3314437054504287, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.281630039215088, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.505859375, + "step": 1236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999351143836975 + }, + { + "episode": 29712, + "epoch": 0.05934021162119088, + "loss/policy_avg": -8.117221295833588e-05, + "lr": 2.64426763803681e-06, + "objective/entropy": 93.2914047241211, + "objective/kl": 3.922013998031616, + "objective/non_score_reward": -0.19610071182250977, + "objective/rlhf_reward": 1.1445126413900617, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.3741273880004883, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4072265625, + "step": 1237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999988079071045 + }, + { + "episode": 29736, + "epoch": 0.05938814394075565, + "loss/policy_avg": -0.013865543529391289, + "lr": 2.6439800613496933e-06, + "objective/entropy": 101.11526489257812, + "objective/kl": 7.448060035705566, + "objective/non_score_reward": -0.37240299582481384, + "objective/rlhf_reward": -0.5000288382276215, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 1.8987425565719604, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 1238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002232551574707 + }, + { + "episode": 29760, + "epoch": 0.05943607626032043, + "loss/policy_avg": 0.005838888231664896, + "lr": 2.6436924846625767e-06, + "objective/entropy": 104.99156188964844, + "objective/kl": 6.26297664642334, + "objective/non_score_reward": -0.3131488263607025, + "objective/rlhf_reward": -1.8788928836584091, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.794278621673584, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4990234375, + "step": 1239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999380111694336 + }, + { + "episode": 29784, + "epoch": 0.0594840085798852, + "loss/policy_avg": 0.3220541477203369, + "lr": 2.64340490797546e-06, + "objective/entropy": 75.70513153076172, + "objective/kl": 4.064106464385986, + "objective/non_score_reward": -0.20320531725883484, + "objective/rlhf_reward": 0.9180112414468659, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.209226369857788, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4921875, + "step": 1240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 2.0068416595458984 + }, + { + "episode": 29808, + "epoch": 0.05953194089944998, + "loss/policy_avg": 0.2844838500022888, + "lr": 2.6431173312883436e-06, + "objective/entropy": 79.36664581298828, + "objective/kl": 5.069700241088867, + "objective/non_score_reward": -0.2534850239753723, + "objective/rlhf_reward": 0.6163330905546082, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 6.359570503234863, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.44140625, + "step": 1241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9983208179473877 + }, + { + "episode": 29832, + "epoch": 0.05957987321901475, + "loss/policy_avg": 0.05368570238351822, + "lr": 2.642829754601227e-06, + "objective/entropy": 80.18741607666016, + "objective/kl": 7.121912956237793, + "objective/non_score_reward": -0.35609564185142517, + "objective/rlhf_reward": 0.4474855271341297, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.333188056945801, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3544921875, + "step": 1242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016722679138184 + }, + { + "episode": 29856, + "epoch": 0.05962780553857953, + "loss/policy_avg": 0.017026904970407486, + "lr": 2.6425421779141104e-06, + "objective/entropy": 99.38130950927734, + "objective/kl": 2.7516610622406006, + "objective/non_score_reward": -0.13758304715156555, + "objective/rlhf_reward": 5.17450176179409, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.91127872467041, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.41796875, + "step": 1243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0026743412017822 + }, + { + "episode": 29880, + "epoch": 0.0596757378581443, + "loss/policy_avg": 0.01985914073884487, + "lr": 2.642254601226994e-06, + "objective/entropy": 75.1976318359375, + "objective/kl": 6.857329845428467, + "objective/non_score_reward": -0.3428665101528168, + "objective/rlhf_reward": 0.08004431505097276, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 4.842294692993164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3623046875, + "step": 1244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982798099517822 + }, + { + "episode": 29904, + "epoch": 0.059723670177709076, + "loss/policy_avg": 0.009879359044134617, + "lr": 2.6419670245398773e-06, + "objective/entropy": 74.6874008178711, + "objective/kl": 6.56013298034668, + "objective/non_score_reward": -0.32800665497779846, + "objective/rlhf_reward": -1.9680399224162102, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7009425163269043, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.359375, + "step": 1245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001523971557617 + }, + { + "episode": 29928, + "epoch": 0.05977160249727385, + "loss/policy_avg": 0.013538003899157047, + "lr": 2.6416794478527607e-06, + "objective/entropy": 95.51702880859375, + "objective/kl": 4.280305862426758, + "objective/non_score_reward": -0.21401529014110565, + "objective/rlhf_reward": -1.2840916998684406, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.630150079727173, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.435546875, + "step": 1246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030219554901123 + }, + { + "episode": 29952, + "epoch": 0.059819534816838625, + "loss/policy_avg": 0.025295238941907883, + "lr": 2.641391871165644e-06, + "objective/entropy": 101.56228637695312, + "objective/kl": 5.252302646636963, + "objective/non_score_reward": -0.2626151442527771, + "objective/rlhf_reward": -1.5756906419992447, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.718876361846924, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4638671875, + "step": 1247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982210397720337 + }, + { + "episode": 29976, + "epoch": 0.059867467136403396, + "loss/policy_avg": 0.02570401132106781, + "lr": 2.6411042944785276e-06, + "objective/entropy": 73.74658203125, + "objective/kl": 6.423552513122559, + "objective/non_score_reward": -0.32117760181427, + "objective/rlhf_reward": 1.8585128658396415, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 4.188689708709717, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.34375, + "step": 1248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997225284576416 + }, + { + "episode": 30000, + "epoch": 0.059915399455968174, + "loss/policy_avg": 0.048067279160022736, + "lr": 2.640816717791411e-06, + "objective/entropy": 97.57502746582031, + "objective/kl": 7.697453498840332, + "objective/non_score_reward": -0.3848726749420166, + "objective/rlhf_reward": -2.3092361092567444, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.671239852905273, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4453125, + "step": 1249, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998400092124939 + }, + { + "episode": 30024, + "epoch": 0.059963331775532945, + "loss/policy_avg": 0.007597873918712139, + "lr": 2.640529141104295e-06, + "objective/entropy": 71.52914428710938, + "objective/kl": 6.561728477478027, + "objective/non_score_reward": -0.3280864357948303, + "objective/rlhf_reward": 0.1687246606160534, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.766164541244507, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4248046875, + "step": 1250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9989380836486816 + }, + { + "episode": 30048, + "epoch": 0.06001126409509772, + "loss/policy_avg": 0.07281266897916794, + "lr": 2.6402415644171783e-06, + "objective/entropy": 67.02214813232422, + "objective/kl": 6.658837795257568, + "objective/non_score_reward": -0.3329418897628784, + "objective/rlhf_reward": -1.9976513534784317, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.283979654312134, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3349609375, + "step": 1251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998497486114502 + }, + { + "episode": 30072, + "epoch": 0.060059196414662494, + "loss/policy_avg": -0.0017945508006960154, + "lr": 2.6399539877300617e-06, + "objective/entropy": 78.59877014160156, + "objective/kl": 4.784793376922607, + "objective/non_score_reward": -0.2392396777868271, + "objective/rlhf_reward": -1.4354379922151566, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.129552364349365, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.333984375, + "step": 1252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993078708648682 + }, + { + "episode": 30096, + "epoch": 0.06010712873422727, + "loss/policy_avg": 0.01549900695681572, + "lr": 2.6396664110429447e-06, + "objective/entropy": 124.70177459716797, + "objective/kl": 4.1480712890625, + "objective/non_score_reward": -0.20740358531475067, + "objective/rlhf_reward": -1.2444214075803757, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.877127170562744, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.51171875, + "step": 1253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994802474975586 + }, + { + "episode": 30120, + "epoch": 0.06015506105379204, + "loss/policy_avg": 0.05091322958469391, + "lr": 2.639378834355828e-06, + "objective/entropy": 66.44577026367188, + "objective/kl": 5.331276893615723, + "objective/non_score_reward": -0.2665638327598572, + "objective/rlhf_reward": -1.5993828922510147, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.274434804916382, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.421875, + "step": 1254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9996073246002197 + }, + { + "episode": 30144, + "epoch": 0.06020299337335682, + "loss/policy_avg": 0.036335572600364685, + "lr": 2.6390912576687116e-06, + "objective/entropy": 86.39290618896484, + "objective/kl": 9.593180656433105, + "objective/non_score_reward": -0.4796590209007263, + "objective/rlhf_reward": -2.8779540956020355, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.454470634460449, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3837890625, + "step": 1255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995596408843994 + }, + { + "episode": 30168, + "epoch": 0.06025092569292159, + "loss/policy_avg": 0.14603076875209808, + "lr": 2.638803680981595e-06, + "objective/entropy": 100.7198486328125, + "objective/kl": 5.315140724182129, + "objective/non_score_reward": -0.2657570540904999, + "objective/rlhf_reward": -1.5945422053337097, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.185164213180542, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.431640625, + "step": 1256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003919839859009 + }, + { + "episode": 30192, + "epoch": 0.06029885801248637, + "loss/policy_avg": 0.048006922006607056, + "lr": 2.6385161042944784e-06, + "objective/entropy": 84.79606628417969, + "objective/kl": 5.0454912185668945, + "objective/non_score_reward": -0.2522745728492737, + "objective/rlhf_reward": 0.8074694510150913, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 6.753301620483398, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4677734375, + "step": 1257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9947903156280518 + }, + { + "episode": 30216, + "epoch": 0.06034679033205114, + "loss/policy_avg": 0.08817506581544876, + "lr": 2.638228527607362e-06, + "objective/entropy": 108.73330688476562, + "objective/kl": 6.700601100921631, + "objective/non_score_reward": -0.33503007888793945, + "objective/rlhf_reward": -2.0101804435253143, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.657122611999512, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.552734375, + "step": 1258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.99497389793396 + }, + { + "episode": 30240, + "epoch": 0.06039472265161592, + "loss/policy_avg": 0.2853406071662903, + "lr": 2.6379409509202453e-06, + "objective/entropy": 86.0921859741211, + "objective/kl": 8.009794235229492, + "objective/non_score_reward": -0.4004896581172943, + "objective/rlhf_reward": -2.402937948703766, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.492002487182617, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.39453125, + "step": 1259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00118350982666 + }, + { + "episode": 30264, + "epoch": 0.06044265497118069, + "loss/policy_avg": 0.04098299890756607, + "lr": 2.637653374233129e-06, + "objective/entropy": 97.52312469482422, + "objective/kl": 5.943971633911133, + "objective/non_score_reward": -0.2971985936164856, + "objective/rlhf_reward": 0.5379252891589168, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.8446109294891357, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.453125, + "step": 1260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9993518590927124 + }, + { + "episode": 30288, + "epoch": 0.06049058729074547, + "loss/policy_avg": 0.1583286076784134, + "lr": 2.6373657975460125e-06, + "objective/entropy": 109.25798034667969, + "objective/kl": 5.351530075073242, + "objective/non_score_reward": -0.26757651567459106, + "objective/rlhf_reward": 2.18011944228236, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.498321056365967, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.486328125, + "step": 1261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0034494400024414 + }, + { + "episode": 30312, + "epoch": 0.06053851961031024, + "loss/policy_avg": 0.0636022537946701, + "lr": 2.637078220858896e-06, + "objective/entropy": 112.6253662109375, + "objective/kl": 4.8543620109558105, + "objective/non_score_reward": -0.24271810054779053, + "objective/rlhf_reward": -1.4563086032867432, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.645208358764648, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.466796875, + "step": 1262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995927810668945 + }, + { + "episode": 30336, + "epoch": 0.06058645192987502, + "loss/policy_avg": 0.027983125299215317, + "lr": 2.6367906441717794e-06, + "objective/entropy": 103.0263900756836, + "objective/kl": 4.0744171142578125, + "objective/non_score_reward": -0.2037208378314972, + "objective/rlhf_reward": 1.0987918164182666, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.491295337677002, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4599609375, + "step": 1263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976515769958496 + }, + { + "episode": 30360, + "epoch": 0.06063438424943979, + "loss/policy_avg": 0.07633619755506516, + "lr": 2.636503067484663e-06, + "objective/entropy": 88.58677673339844, + "objective/kl": 6.168828964233398, + "objective/non_score_reward": -0.3084414601325989, + "objective/rlhf_reward": 0.1493512988090514, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.920061111450195, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.390625, + "step": 1264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976003170013428 + }, + { + "episode": 30384, + "epoch": 0.060682316569004566, + "loss/policy_avg": 0.05535813048481941, + "lr": 2.6362154907975462e-06, + "objective/entropy": 101.90641784667969, + "objective/kl": 3.4455108642578125, + "objective/non_score_reward": -0.172275573015213, + "objective/rlhf_reward": 0.8591359045794811, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.591311454772949, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.44140625, + "step": 1265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018153190612793 + }, + { + "episode": 30408, + "epoch": 0.06073024888856934, + "loss/policy_avg": 0.035253673791885376, + "lr": 2.6359279141104297e-06, + "objective/entropy": 96.16905212402344, + "objective/kl": 3.8817126750946045, + "objective/non_score_reward": -0.19408564269542694, + "objective/rlhf_reward": 0.6416661587895188, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 0.9929551482200623, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.546875, + "step": 1266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0007975101470947 + }, + { + "episode": 30432, + "epoch": 0.060778181208134115, + "loss/policy_avg": -0.028469692915678024, + "lr": 2.6356403374233127e-06, + "objective/entropy": 112.10400390625, + "objective/kl": 1.8783366680145264, + "objective/non_score_reward": -0.09391682595014572, + "objective/rlhf_reward": 5.436499051749706, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.2666940689086914, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.474609375, + "step": 1267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.005197763442993 + }, + { + "episode": 30456, + "epoch": 0.06082611352769889, + "loss/policy_avg": 0.033774495124816895, + "lr": 2.635352760736196e-06, + "objective/entropy": 132.95416259765625, + "objective/kl": 3.7230498790740967, + "objective/non_score_reward": -0.18615250289440155, + "objective/rlhf_reward": 1.0203281872381105, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.9240684509277344, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.58203125, + "step": 1268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9993730783462524 + }, + { + "episode": 30480, + "epoch": 0.060874045847263664, + "loss/policy_avg": 0.015706613659858704, + "lr": 2.6350651840490795e-06, + "objective/entropy": 103.51777648925781, + "objective/kl": 5.187046051025391, + "objective/non_score_reward": -0.25935232639312744, + "objective/rlhf_reward": 0.17827495484507905, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.7083945274353027, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4970703125, + "step": 1269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988631010055542 + }, + { + "episode": 30504, + "epoch": 0.06092197816682844, + "loss/policy_avg": 0.10592511296272278, + "lr": 2.634777607361963e-06, + "objective/entropy": 86.88468933105469, + "objective/kl": 3.8242626190185547, + "objective/non_score_reward": -0.19121313095092773, + "objective/rlhf_reward": 1.8527212142944336, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.973290205001831, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.400390625, + "step": 1270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00368595123291 + }, + { + "episode": 30528, + "epoch": 0.06096991048639321, + "loss/policy_avg": 0.013892039656639099, + "lr": 2.634490030674847e-06, + "objective/entropy": 111.4290771484375, + "objective/kl": 3.5696322917938232, + "objective/non_score_reward": -0.17848162353038788, + "objective/rlhf_reward": -1.0708897709846497, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8769989013671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 1271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.000134229660034 + }, + { + "episode": 30552, + "epoch": 0.06101784280595799, + "loss/policy_avg": 0.036902520805597305, + "lr": 2.6342024539877302e-06, + "objective/entropy": 137.57647705078125, + "objective/kl": 2.8742682933807373, + "objective/non_score_reward": -0.14371341466903687, + "objective/rlhf_reward": -0.8622805327177048, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9444527626037598, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.62890625, + "step": 1272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011491775512695 + }, + { + "episode": 30576, + "epoch": 0.06106577512552276, + "loss/policy_avg": 0.13180658221244812, + "lr": 2.6339148773006136e-06, + "objective/entropy": 97.6256332397461, + "objective/kl": 7.9880523681640625, + "objective/non_score_reward": -0.3994026184082031, + "objective/rlhf_reward": -2.396415889263153, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5858471393585205, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4404296875, + "step": 1273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992955923080444 + }, + { + "episode": 30600, + "epoch": 0.06111370744508754, + "loss/policy_avg": 1.0696864128112793, + "lr": 2.633627300613497e-06, + "objective/entropy": 75.33988952636719, + "objective/kl": 6.3835272789001465, + "objective/non_score_reward": -0.3191763758659363, + "objective/rlhf_reward": -1.915058210492134, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3862085342407227, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.404296875, + "step": 1274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0023956298828125 + }, + { + "episode": 30624, + "epoch": 0.06116163976465231, + "loss/policy_avg": 0.016743093729019165, + "lr": 2.6333397239263805e-06, + "objective/entropy": 97.49541473388672, + "objective/kl": 5.5933356285095215, + "objective/non_score_reward": -0.2796667814254761, + "objective/rlhf_reward": 0.6431162293601993, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.212965250015259, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.46875, + "step": 1275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998884916305542 + }, + { + "episode": 30648, + "epoch": 0.06120957208421709, + "loss/policy_avg": 0.10479310154914856, + "lr": 2.633052147239264e-06, + "objective/entropy": 123.9197006225586, + "objective/kl": 3.595510482788086, + "objective/non_score_reward": -0.17977553606033325, + "objective/rlhf_reward": -1.0786532312631607, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.076220989227295, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.560546875, + "step": 1276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9982576370239258 + }, + { + "episode": 30672, + "epoch": 0.06125750440378186, + "loss/policy_avg": 0.02849559299647808, + "lr": 2.6327645705521474e-06, + "objective/entropy": 99.50321960449219, + "objective/kl": 4.050270080566406, + "objective/non_score_reward": -0.20251350104808807, + "objective/rlhf_reward": -1.2150810733437538, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.023453235626221, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.44140625, + "step": 1277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997307300567627 + }, + { + "episode": 30696, + "epoch": 0.06130543672334664, + "loss/policy_avg": 0.00297454628162086, + "lr": 2.6324769938650308e-06, + "objective/entropy": 86.14889526367188, + "objective/kl": 4.612610340118408, + "objective/non_score_reward": -0.2306305170059204, + "objective/rlhf_reward": -1.3837831430137157, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.397305965423584, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.392578125, + "step": 1278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003467559814453 + }, + { + "episode": 30720, + "epoch": 0.06135336904291141, + "loss/policy_avg": -0.04590277373790741, + "lr": 2.632189417177914e-06, + "objective/entropy": 114.02983093261719, + "objective/kl": 6.715298652648926, + "objective/non_score_reward": -0.33576497435569763, + "objective/rlhf_reward": -2.014589622616768, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.206883430480957, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.50390625, + "step": 1279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000990390777588 + }, + { + "episode": 30744, + "epoch": 0.061401301362476186, + "loss/policy_avg": 0.001244666171260178, + "lr": 2.6319018404907976e-06, + "objective/entropy": 132.50204467773438, + "objective/kl": 7.319005489349365, + "objective/non_score_reward": -0.3659503161907196, + "objective/rlhf_reward": -0.4613128125771202, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 3.2086169719696045, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.5625, + "step": 1280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0004477500915527 + }, + { + "episode": 30768, + "epoch": 0.06144923368204096, + "loss/policy_avg": 0.11278479546308517, + "lr": 2.631614263803681e-06, + "objective/entropy": 85.9473876953125, + "objective/kl": 3.218918561935425, + "objective/non_score_reward": -0.16094593703746796, + "objective/rlhf_reward": 1.6183837634684535, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.715317726135254, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.380859375, + "step": 1281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002328872680664 + }, + { + "episode": 30792, + "epoch": 0.061497166001605735, + "loss/policy_avg": 0.007079457864165306, + "lr": 2.6313266871165645e-06, + "objective/entropy": 98.15834045410156, + "objective/kl": 4.637694835662842, + "objective/non_score_reward": -0.23188471794128418, + "objective/rlhf_reward": 1.192751085496137, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.2672624588012695, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4599609375, + "step": 1282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9980947971343994 + }, + { + "episode": 30816, + "epoch": 0.061545098321170506, + "loss/policy_avg": 0.011546523310244083, + "lr": 2.631039110429448e-06, + "objective/entropy": 76.63227844238281, + "objective/kl": 4.8079938888549805, + "objective/non_score_reward": -0.24039970338344574, + "objective/rlhf_reward": 0.8787187050629619, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.119614601135254, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.33984375, + "step": 1283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016605854034424 + }, + { + "episode": 30840, + "epoch": 0.061593030640735284, + "loss/policy_avg": 0.06368912756443024, + "lr": 2.6307515337423313e-06, + "objective/entropy": 86.0552978515625, + "objective/kl": 5.876132488250732, + "objective/non_score_reward": -0.29380664229393005, + "objective/rlhf_reward": -1.7628397196531296, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.6020874977111816, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4462890625, + "step": 1284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001136302947998 + }, + { + "episode": 30864, + "epoch": 0.061640962960300055, + "loss/policy_avg": 0.10269075632095337, + "lr": 2.6304639570552148e-06, + "objective/entropy": 107.4333724975586, + "objective/kl": 4.901505470275879, + "objective/non_score_reward": -0.24507522583007812, + "objective/rlhf_reward": -1.4704513847827911, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.152580738067627, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.47265625, + "step": 1285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9965415000915527 + }, + { + "episode": 30888, + "epoch": 0.06168889527986483, + "loss/policy_avg": 0.033195652067661285, + "lr": 2.630176380368098e-06, + "objective/entropy": 93.68502044677734, + "objective/kl": 4.580684661865234, + "objective/non_score_reward": -0.2290342152118683, + "objective/rlhf_reward": 0.6257947832345961, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.8155035972595215, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3974609375, + "step": 1286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994356632232666 + }, + { + "episode": 30912, + "epoch": 0.061736827599429604, + "loss/policy_avg": -0.0020293667912483215, + "lr": 2.6298888036809816e-06, + "objective/entropy": 87.36702728271484, + "objective/kl": 6.195121765136719, + "objective/non_score_reward": -0.3097561001777649, + "objective/rlhf_reward": 0.2787065811861885, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 5.108069896697998, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4091796875, + "step": 1287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989197254180908 + }, + { + "episode": 30936, + "epoch": 0.06178475991899438, + "loss/policy_avg": 0.03421325236558914, + "lr": 2.629601226993865e-06, + "objective/entropy": 102.81011199951172, + "objective/kl": 6.11998176574707, + "objective/non_score_reward": -0.30599913001060486, + "objective/rlhf_reward": -1.835994653403759, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1797893047332764, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.435546875, + "step": 1288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987670183181763 + }, + { + "episode": 30960, + "epoch": 0.06183269223855915, + "loss/policy_avg": -0.01368163526058197, + "lr": 2.6293136503067485e-06, + "objective/entropy": 133.3533935546875, + "objective/kl": 5.996671199798584, + "objective/non_score_reward": -0.2998335659503937, + "objective/rlhf_reward": -1.7990013659000397, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.5067172050476074, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.6015625, + "step": 1289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002115488052368 + }, + { + "episode": 30984, + "epoch": 0.06188062455812393, + "loss/policy_avg": 0.021971747279167175, + "lr": 2.629026073619632e-06, + "objective/entropy": 91.6839828491211, + "objective/kl": 6.835235595703125, + "objective/non_score_reward": -0.3417617976665497, + "objective/rlhf_reward": 1.7350078546387366, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.9175801277160645, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.419921875, + "step": 1290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9990298748016357 + }, + { + "episode": 31008, + "epoch": 0.0619285568776887, + "loss/policy_avg": 0.05354639142751694, + "lr": 2.6287384969325153e-06, + "objective/entropy": 78.44295501708984, + "objective/kl": 6.006072044372559, + "objective/non_score_reward": -0.3003036379814148, + "objective/rlhf_reward": 0.09096758183749576, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.29415225982666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.369140625, + "step": 1291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999279975891113 + }, + { + "episode": 31032, + "epoch": 0.06197648919725348, + "loss/policy_avg": 0.24019573628902435, + "lr": 2.6284509202453987e-06, + "objective/entropy": 84.76194763183594, + "objective/kl": 4.448638439178467, + "objective/non_score_reward": -0.22243189811706543, + "objective/rlhf_reward": -1.3345914334058762, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4835078716278076, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.3701171875, + "step": 1292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992804527282715 + }, + { + "episode": 31056, + "epoch": 0.06202442151681825, + "loss/policy_avg": 0.11521132290363312, + "lr": 2.628163343558282e-06, + "objective/entropy": 97.0101547241211, + "objective/kl": 4.353082180023193, + "objective/non_score_reward": -0.21765410900115967, + "objective/rlhf_reward": -1.3059246391057968, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.6260218620300293, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.43359375, + "step": 1293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0020172595977783 + }, + { + "episode": 31080, + "epoch": 0.06207235383638303, + "loss/policy_avg": 0.13969764113426208, + "lr": 2.627875766871166e-06, + "objective/entropy": 94.04832458496094, + "objective/kl": 4.021669387817383, + "objective/non_score_reward": -0.20108351111412048, + "objective/rlhf_reward": 0.7934989221394061, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 5.036968231201172, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.435546875, + "step": 1294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981741905212402 + }, + { + "episode": 31104, + "epoch": 0.0621202861559478, + "loss/policy_avg": 0.09459611773490906, + "lr": 2.6275881901840494e-06, + "objective/entropy": 105.90458679199219, + "objective/kl": 7.214730739593506, + "objective/non_score_reward": -0.36073654890060425, + "objective/rlhf_reward": -2.1644192077219486, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.424013137817383, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.427734375, + "step": 1295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979636669158936 + }, + { + "episode": 31128, + "epoch": 0.06216821847551258, + "loss/policy_avg": 0.028577398508787155, + "lr": 2.627300613496933e-06, + "objective/entropy": 115.89093017578125, + "objective/kl": 8.283430099487305, + "objective/non_score_reward": -0.4141715168952942, + "objective/rlhf_reward": 0.514970988035202, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.7317628860473633, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.48046875, + "step": 1296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000555515289307 + }, + { + "episode": 31152, + "epoch": 0.06221615079507735, + "loss/policy_avg": 0.08469686657190323, + "lr": 2.627013036809816e-06, + "objective/entropy": 111.52372741699219, + "objective/kl": 4.112702369689941, + "objective/non_score_reward": -0.20563510060310364, + "objective/rlhf_reward": 2.5517679848653487, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 6.884979248046875, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.494140625, + "step": 1297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972834587097168 + }, + { + "episode": 31176, + "epoch": 0.06226408311464213, + "loss/policy_avg": -0.01399120595306158, + "lr": 2.6267254601226993e-06, + "objective/entropy": 91.07359313964844, + "objective/kl": 5.073991775512695, + "objective/non_score_reward": -0.25369957089424133, + "objective/rlhf_reward": 0.2839826026351485, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.8276896476745605, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3994140625, + "step": 1298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0048136711120605 + }, + { + "episode": 31200, + "epoch": 0.0623120154342069, + "loss/policy_avg": 0.058353014290332794, + "lr": 2.6264378834355827e-06, + "objective/entropy": 86.643310546875, + "objective/kl": 3.0692005157470703, + "objective/non_score_reward": -0.1534600555896759, + "objective/rlhf_reward": -0.9207603000104427, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7661783695220947, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.37109375, + "step": 1299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984130859375 + }, + { + "episode": 31224, + "epoch": 0.062359947753771676, + "loss/policy_avg": 0.0034101400524377823, + "lr": 2.626150306748466e-06, + "objective/entropy": 101.17491149902344, + "objective/kl": 2.6600091457366943, + "objective/non_score_reward": -0.1330004632472992, + "objective/rlhf_reward": 1.5231140601981643, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 0.9461568593978882, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.443359375, + "step": 1300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002474069595337 + }, + { + "episode": 31248, + "epoch": 0.06240788007333645, + "loss/policy_avg": 0.1010250672698021, + "lr": 2.6258627300613496e-06, + "objective/entropy": 87.9678955078125, + "objective/kl": 6.178799629211426, + "objective/non_score_reward": -0.30893996357917786, + "objective/rlhf_reward": -1.8536397479474545, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.657759189605713, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4111328125, + "step": 1301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0002217292785645 + }, + { + "episode": 31272, + "epoch": 0.062455812392901225, + "loss/policy_avg": 0.019837992265820503, + "lr": 2.625575153374233e-06, + "objective/entropy": 126.2974853515625, + "objective/kl": 3.6274402141571045, + "objective/non_score_reward": -0.1813720166683197, + "objective/rlhf_reward": 1.2328847806502345, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.6120457649230957, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.560546875, + "step": 1302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999045729637146 + }, + { + "episode": 31296, + "epoch": 0.062503744712466, + "loss/policy_avg": 0.251190185546875, + "lr": 2.6252875766871164e-06, + "objective/entropy": 107.15347290039062, + "objective/kl": 7.8888373374938965, + "objective/non_score_reward": -0.3944419026374817, + "objective/rlhf_reward": -0.5604712779282298, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.3867684602737427, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.466796875, + "step": 1303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0034842491149902 + }, + { + "episode": 31320, + "epoch": 0.06255167703203077, + "loss/policy_avg": 0.021041549742221832, + "lr": 2.6250000000000003e-06, + "objective/entropy": 84.87272644042969, + "objective/kl": 4.023943901062012, + "objective/non_score_reward": -0.20119720697402954, + "objective/rlhf_reward": -1.2071831971406937, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.879412889480591, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.396484375, + "step": 1304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992496967315674 + }, + { + "episode": 31344, + "epoch": 0.06259960935159554, + "loss/policy_avg": -0.0015676207840442657, + "lr": 2.6247124233128837e-06, + "objective/entropy": 80.72772216796875, + "objective/kl": 5.632658004760742, + "objective/non_score_reward": -0.2816329300403595, + "objective/rlhf_reward": 0.11638246824753617, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.7179765701293945, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.34375, + "step": 1305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000487804412842 + }, + { + "episode": 31368, + "epoch": 0.06264754167116032, + "loss/policy_avg": 0.08662106096744537, + "lr": 2.624424846625767e-06, + "objective/entropy": 107.35481262207031, + "objective/kl": 5.859217643737793, + "objective/non_score_reward": -0.29296088218688965, + "objective/rlhf_reward": -1.7577651888132095, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.5789804458618164, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.466796875, + "step": 1306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987292289733887 + }, + { + "episode": 31392, + "epoch": 0.0626954739907251, + "loss/policy_avg": 0.32961779832839966, + "lr": 2.6241372699386506e-06, + "objective/entropy": 76.22236633300781, + "objective/kl": 5.38968563079834, + "objective/non_score_reward": -0.2694842517375946, + "objective/rlhf_reward": -1.6169055625796318, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.010683059692383, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.314453125, + "step": 1307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9961729049682617 + }, + { + "episode": 31416, + "epoch": 0.06274340631028986, + "loss/policy_avg": 0.03886047750711441, + "lr": 2.623849693251534e-06, + "objective/entropy": 85.28175354003906, + "objective/kl": 3.8588616847991943, + "objective/non_score_reward": -0.19294308125972748, + "objective/rlhf_reward": -1.1576584577560425, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3929553031921387, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3779296875, + "step": 1308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998598098754883 + }, + { + "episode": 31440, + "epoch": 0.06279133862985464, + "loss/policy_avg": 0.10237645357847214, + "lr": 2.6235621165644174e-06, + "objective/entropy": 87.01766967773438, + "objective/kl": 4.1336894035339355, + "objective/non_score_reward": -0.20668447017669678, + "objective/rlhf_reward": 0.6526823763242569, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.5352988243103027, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.380859375, + "step": 1309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010643005371094 + }, + { + "episode": 31464, + "epoch": 0.06283927094941942, + "loss/policy_avg": 0.010629785247147083, + "lr": 2.623274539877301e-06, + "objective/entropy": 115.5570068359375, + "objective/kl": 3.3171586990356445, + "objective/non_score_reward": -0.1658579558134079, + "objective/rlhf_reward": 1.1420954548229112, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.911489248275757, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.53125, + "step": 1310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001227855682373 + }, + { + "episode": 31488, + "epoch": 0.0628872032689842, + "loss/policy_avg": 0.12851667404174805, + "lr": 2.6229869631901843e-06, + "objective/entropy": 99.6744384765625, + "objective/kl": 6.659506797790527, + "objective/non_score_reward": -0.3329753279685974, + "objective/rlhf_reward": -1.997852012515068, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.800962448120117, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4228515625, + "step": 1311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998960256576538 + }, + { + "episode": 31512, + "epoch": 0.06293513558854896, + "loss/policy_avg": -0.02554449997842312, + "lr": 2.6226993865030673e-06, + "objective/entropy": 91.17036437988281, + "objective/kl": 4.15183162689209, + "objective/non_score_reward": -0.20759157836437225, + "objective/rlhf_reward": 0.6472398352316227, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.5766266584396362, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.40234375, + "step": 1312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022263526916504 + }, + { + "episode": 31536, + "epoch": 0.06298306790811374, + "loss/policy_avg": 0.049026913940906525, + "lr": 2.6224118098159507e-06, + "objective/entropy": 126.49070739746094, + "objective/kl": 4.298995494842529, + "objective/non_score_reward": -0.21494978666305542, + "objective/rlhf_reward": -1.2896987199783325, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.045077323913574, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.55078125, + "step": 1313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001396894454956 + }, + { + "episode": 31560, + "epoch": 0.06303100022767852, + "loss/policy_avg": 0.0069281794130802155, + "lr": 2.622124233128834e-06, + "objective/entropy": 105.83184051513672, + "objective/kl": 4.732832908630371, + "objective/non_score_reward": -0.23664167523384094, + "objective/rlhf_reward": -1.4198499023914337, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7624119520187378, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4638671875, + "step": 1314, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999711275100708 + }, + { + "episode": 31584, + "epoch": 0.0630789325472433, + "loss/policy_avg": 0.01620551198720932, + "lr": 2.621836656441718e-06, + "objective/entropy": 103.5484619140625, + "objective/kl": 4.810503959655762, + "objective/non_score_reward": -0.24052521586418152, + "objective/rlhf_reward": -1.4431512020528316, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.045754432678223, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4501953125, + "step": 1315, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998381495475769 + }, + { + "episode": 31608, + "epoch": 0.06312686486680806, + "loss/policy_avg": 0.0338604561984539, + "lr": 2.6215490797546014e-06, + "objective/entropy": 85.34927368164062, + "objective/kl": 3.584808349609375, + "objective/non_score_reward": -0.17924043536186218, + "objective/rlhf_reward": -1.0754425264894962, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3207685947418213, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.376953125, + "step": 1316, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013058185577393 + }, + { + "episode": 31632, + "epoch": 0.06317479718637284, + "loss/policy_avg": 0.018853578716516495, + "lr": 2.621261503067485e-06, + "objective/entropy": 98.8065185546875, + "objective/kl": 3.4422607421875, + "objective/non_score_reward": -0.1721130609512329, + "objective/rlhf_reward": 0.7735015840621028, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.0222713947296143, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4326171875, + "step": 1317, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0029296875 + }, + { + "episode": 31656, + "epoch": 0.06322272950593762, + "loss/policy_avg": -0.005348498001694679, + "lr": 2.6209739263803682e-06, + "objective/entropy": 90.06038665771484, + "objective/kl": 6.013578414916992, + "objective/non_score_reward": -0.30067893862724304, + "objective/rlhf_reward": -1.8040736243128777, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.101161003112793, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4267578125, + "step": 1318, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011146068573 + }, + { + "episode": 31680, + "epoch": 0.0632706618255024, + "loss/policy_avg": 0.017841560766100883, + "lr": 2.6206863496932517e-06, + "objective/entropy": 78.63997650146484, + "objective/kl": 1.9099787473678589, + "objective/non_score_reward": -0.0954989343881607, + "objective/rlhf_reward": 2.4270064122974873, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.019682765007019, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.369140625, + "step": 1319, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002100944519043 + }, + { + "episode": 31704, + "epoch": 0.06331859414506716, + "loss/policy_avg": 0.031576067209243774, + "lr": 2.620398773006135e-06, + "objective/entropy": 122.40117645263672, + "objective/kl": 3.7251715660095215, + "objective/non_score_reward": -0.18625859916210175, + "objective/rlhf_reward": 1.4665077981712313, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.082371473312378, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.529296875, + "step": 1320, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001410961151123 + }, + { + "episode": 31728, + "epoch": 0.06336652646463194, + "loss/policy_avg": 0.03753375634551048, + "lr": 2.6201111963190185e-06, + "objective/entropy": 133.3799285888672, + "objective/kl": 4.4826579093933105, + "objective/non_score_reward": -0.22413289546966553, + "objective/rlhf_reward": 1.2392619979741069, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 6.109475612640381, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.61328125, + "step": 1321, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964308738708496 + }, + { + "episode": 31752, + "epoch": 0.06341445878419671, + "loss/policy_avg": 0.25050216913223267, + "lr": 2.619823619631902e-06, + "objective/entropy": 90.46748352050781, + "objective/kl": 6.7949538230896, + "objective/non_score_reward": -0.33974772691726685, + "objective/rlhf_reward": 0.5455730763437244, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.8930341005325317, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.4140625, + "step": 1322, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003387451171875 + }, + { + "episode": 31776, + "epoch": 0.06346239110376149, + "loss/policy_avg": 0.06490830332040787, + "lr": 2.6195360429447854e-06, + "objective/entropy": 104.20800018310547, + "objective/kl": 4.124238014221191, + "objective/non_score_reward": -0.2062118947505951, + "objective/rlhf_reward": 1.7627286165952682, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.9848151206970215, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.447265625, + "step": 1323, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002680778503418 + }, + { + "episode": 31800, + "epoch": 0.06351032342332626, + "loss/policy_avg": 0.07538659125566483, + "lr": 2.619248466257669e-06, + "objective/entropy": 104.61325073242188, + "objective/kl": 4.579162120819092, + "objective/non_score_reward": -0.2289581000804901, + "objective/rlhf_reward": -1.3737485483288765, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8186737298965454, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4345703125, + "step": 1324, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0023574829101562 + }, + { + "episode": 31824, + "epoch": 0.06355825574289103, + "loss/policy_avg": 0.05497276410460472, + "lr": 2.6189608895705522e-06, + "objective/entropy": 118.74777221679688, + "objective/kl": 4.251770973205566, + "objective/non_score_reward": -0.21258853375911713, + "objective/rlhf_reward": 0.5306488114760551, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.8387434482574463, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4990234375, + "step": 1325, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995976686477661 + }, + { + "episode": 31848, + "epoch": 0.06360618806245581, + "loss/policy_avg": 0.13328129053115845, + "lr": 2.6186733128834357e-06, + "objective/entropy": 123.41172790527344, + "objective/kl": 3.4677138328552246, + "objective/non_score_reward": -0.17338570952415466, + "objective/rlhf_reward": -1.0403141863644123, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.713648796081543, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.568359375, + "step": 1326, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997122287750244 + }, + { + "episode": 31872, + "epoch": 0.06365412038202059, + "loss/policy_avg": 0.045900363475084305, + "lr": 2.618385736196319e-06, + "objective/entropy": 89.92949676513672, + "objective/kl": 5.706336975097656, + "objective/non_score_reward": -0.2853168249130249, + "objective/rlhf_reward": 1.2880989909172058, + "objective/scores": 0.5, + "policy/approxkl_avg": 5.258171081542969, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4130859375, + "step": 1327, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9971601963043213 + }, + { + "episode": 31896, + "epoch": 0.06370205270158537, + "loss/policy_avg": 0.01600593887269497, + "lr": 2.6180981595092025e-06, + "objective/entropy": 87.49591064453125, + "objective/kl": 4.676831245422363, + "objective/non_score_reward": -0.23384158313274384, + "objective/rlhf_reward": -1.403049424290657, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7037806510925293, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.357421875, + "step": 1328, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004018783569336 + }, + { + "episode": 31920, + "epoch": 0.06374998502115013, + "loss/policy_avg": -0.002781609073281288, + "lr": 2.617810582822086e-06, + "objective/entropy": 78.98455047607422, + "objective/kl": 6.831892013549805, + "objective/non_score_reward": -0.3415946364402771, + "objective/rlhf_reward": -2.0495677292346954, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.592105865478516, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.380859375, + "step": 1329, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999955177307129 + }, + { + "episode": 31944, + "epoch": 0.06379791734071491, + "loss/policy_avg": 0.05236777290701866, + "lr": 2.6175230061349694e-06, + "objective/entropy": 109.90806579589844, + "objective/kl": 3.058973789215088, + "objective/non_score_reward": -0.15294867753982544, + "objective/rlhf_reward": 0.8884878863931928, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.456194281578064, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.513671875, + "step": 1330, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0014312267303467 + }, + { + "episode": 31968, + "epoch": 0.06384584966027969, + "loss/policy_avg": 0.05938855931162834, + "lr": 2.6172354294478528e-06, + "objective/entropy": 86.82719421386719, + "objective/kl": 3.2658050060272217, + "objective/non_score_reward": -0.16329024732112885, + "objective/rlhf_reward": -0.9797415118664503, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.39345121383667, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.40625, + "step": 1331, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9973094463348389 + }, + { + "episode": 31992, + "epoch": 0.06389378197984447, + "loss/policy_avg": 0.04624219983816147, + "lr": 2.616947852760736e-06, + "objective/entropy": 119.986572265625, + "objective/kl": 4.142152786254883, + "objective/non_score_reward": -0.20710763335227966, + "objective/rlhf_reward": 1.0784709985900882, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.816052436828613, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.546875, + "step": 1332, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9966527223587036 + }, + { + "episode": 32016, + "epoch": 0.06394171429940923, + "loss/policy_avg": -0.021209102123975754, + "lr": 2.6166602760736196e-06, + "objective/entropy": 93.91766357421875, + "objective/kl": 0.7230293154716492, + "objective/non_score_reward": -0.036151476204395294, + "objective/rlhf_reward": -0.21690884977579117, + "objective/scores": 0.0, + "policy/approxkl_avg": 0.2034161537885666, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.552734375, + "step": 1333, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0025794506073 + }, + { + "episode": 32040, + "epoch": 0.06398964661897401, + "loss/policy_avg": 0.04857039079070091, + "lr": 2.616372699386503e-06, + "objective/entropy": 136.14280700683594, + "objective/kl": 2.9734792709350586, + "objective/non_score_reward": -0.14867396652698517, + "objective/rlhf_reward": 1.4290730516959194, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.349377155303955, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.658203125, + "step": 1334, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001087188720703 + }, + { + "episode": 32064, + "epoch": 0.06403757893853879, + "loss/policy_avg": 0.029037337750196457, + "lr": 2.6160851226993865e-06, + "objective/entropy": 125.15169525146484, + "objective/kl": 2.984203338623047, + "objective/non_score_reward": -0.1492101550102234, + "objective/rlhf_reward": 2.104738999158144, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.274479627609253, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5546875, + "step": 1335, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989924430847168 + }, + { + "episode": 32088, + "epoch": 0.06408551125810356, + "loss/policy_avg": 0.00806344673037529, + "lr": 2.61579754601227e-06, + "objective/entropy": 109.20976257324219, + "objective/kl": 5.471077919006348, + "objective/non_score_reward": -0.27355390787124634, + "objective/rlhf_reward": 0.3586767315864562, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 4.071638584136963, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.466796875, + "step": 1336, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013790130615234 + }, + { + "episode": 32112, + "epoch": 0.06413344357766833, + "loss/policy_avg": 0.018808046355843544, + "lr": 2.6155099693251533e-06, + "objective/entropy": 108.39665222167969, + "objective/kl": 6.902549743652344, + "objective/non_score_reward": -0.34512752294540405, + "objective/rlhf_reward": 0.2503518845487598, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 5.139015197753906, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.478515625, + "step": 1337, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991569519042969 + }, + { + "episode": 32136, + "epoch": 0.0641813758972331, + "loss/policy_avg": -0.00836876779794693, + "lr": 2.615222392638037e-06, + "objective/entropy": 162.54791259765625, + "objective/kl": 3.4662399291992188, + "objective/non_score_reward": -0.1733119934797287, + "objective/rlhf_reward": -1.0398719310760498, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1635124683380127, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.734375, + "step": 1338, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000192165374756 + }, + { + "episode": 32160, + "epoch": 0.06422930821679788, + "loss/policy_avg": 0.05636361986398697, + "lr": 2.6149348159509206e-06, + "objective/entropy": 111.45677947998047, + "objective/kl": 4.491605281829834, + "objective/non_score_reward": -0.22458025813102722, + "objective/rlhf_reward": 0.45869847735178804, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.8798139095306396, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.462890625, + "step": 1339, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984983205795288 + }, + { + "episode": 32184, + "epoch": 0.06427724053636266, + "loss/policy_avg": 0.6200167536735535, + "lr": 2.614647239263804e-06, + "objective/entropy": 99.4378662109375, + "objective/kl": 4.405259132385254, + "objective/non_score_reward": -0.22026294469833374, + "objective/rlhf_reward": 0.6784223318099974, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.9261891841888428, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4365234375, + "step": 1340, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012271404266357 + }, + { + "episode": 32208, + "epoch": 0.06432517285592743, + "loss/policy_avg": 0.21199296414852142, + "lr": 2.6143596625766875e-06, + "objective/entropy": 123.98519134521484, + "objective/kl": 10.1226806640625, + "objective/non_score_reward": -0.506134033203125, + "objective/rlhf_reward": -3.0368041396141052, + "objective/scores": 0.0, + "policy/approxkl_avg": 10.78635025024414, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5234375, + "step": 1341, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9949941635131836 + }, + { + "episode": 32232, + "epoch": 0.0643731051754922, + "loss/policy_avg": 0.058306917548179626, + "lr": 2.6140720858895705e-06, + "objective/entropy": 96.624755859375, + "objective/kl": 3.163856029510498, + "objective/non_score_reward": -0.15819278359413147, + "objective/rlhf_reward": 1.371960141842461, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.9169621467590332, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4541015625, + "step": 1342, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.00203800201416 + }, + { + "episode": 32256, + "epoch": 0.06442103749505698, + "loss/policy_avg": 0.035618070513010025, + "lr": 2.613784509202454e-06, + "objective/entropy": 88.25248718261719, + "objective/kl": 3.8675074577331543, + "objective/non_score_reward": -0.1933753490447998, + "objective/rlhf_reward": -1.1602521762251854, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.4284942150115967, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.396484375, + "step": 1343, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999298095703125 + }, + { + "episode": 32280, + "epoch": 0.06446896981462176, + "loss/policy_avg": 0.057793304324150085, + "lr": 2.6134969325153373e-06, + "objective/entropy": 102.86869812011719, + "objective/kl": 7.027665615081787, + "objective/non_score_reward": -0.3513832986354828, + "objective/rlhf_reward": -2.1082996129989624, + "objective/scores": 0.0, + "policy/approxkl_avg": 9.475981712341309, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.5234375, + "step": 1344, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9958958625793457 + }, + { + "episode": 32304, + "epoch": 0.06451690213418652, + "loss/policy_avg": 0.035924140363931656, + "lr": 2.6132093558282207e-06, + "objective/entropy": 103.43757629394531, + "objective/kl": 5.779067039489746, + "objective/non_score_reward": -0.28895333409309387, + "objective/rlhf_reward": -1.7337199077010155, + "objective/scores": 0.0, + "policy/approxkl_avg": 8.031234741210938, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4375, + "step": 1345, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984400272369385 + }, + { + "episode": 32328, + "epoch": 0.0645648344537513, + "loss/policy_avg": 0.054011404514312744, + "lr": 2.612921779141104e-06, + "objective/entropy": 93.07918548583984, + "objective/kl": 7.582632064819336, + "objective/non_score_reward": -0.37913161516189575, + "objective/rlhf_reward": -2.274789586663246, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9715237617492676, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4462890625, + "step": 1346, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9981980323791504 + }, + { + "episode": 32352, + "epoch": 0.06461276677331608, + "loss/policy_avg": 0.009033524431288242, + "lr": 2.6126342024539876e-06, + "objective/entropy": 137.55072021484375, + "objective/kl": 3.8759264945983887, + "objective/non_score_reward": -0.1937963217496872, + "objective/rlhf_reward": 1.8372221738100052, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.7780193090438843, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.630859375, + "step": 1347, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001786708831787 + }, + { + "episode": 32376, + "epoch": 0.06466069909288086, + "loss/policy_avg": 0.02512560412287712, + "lr": 2.612346625766871e-06, + "objective/entropy": 86.81053161621094, + "objective/kl": 5.729152679443359, + "objective/non_score_reward": -0.2864576578140259, + "objective/rlhf_reward": 0.2812541127204894, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.1790661811828613, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4091796875, + "step": 1348, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989333152770996 + }, + { + "episode": 32400, + "epoch": 0.06470863141244562, + "loss/policy_avg": 0.03531811386346817, + "lr": 2.612059049079755e-06, + "objective/entropy": 131.86485290527344, + "objective/kl": 3.8705596923828125, + "objective/non_score_reward": -0.19352799654006958, + "objective/rlhf_reward": -1.1611679792404175, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.137498617172241, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.58203125, + "step": 1349, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.99871826171875 + }, + { + "episode": 32424, + "epoch": 0.0647565637320104, + "loss/policy_avg": 0.021569104865193367, + "lr": 2.6117714723926383e-06, + "objective/entropy": 98.12850952148438, + "objective/kl": 6.652683258056641, + "objective/non_score_reward": -0.33263418078422546, + "objective/rlhf_reward": 0.3253117885042194, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.402549743652344, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4130859375, + "step": 1350, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010039806365967 + }, + { + "episode": 32448, + "epoch": 0.06480449605157518, + "loss/policy_avg": 0.28330090641975403, + "lr": 2.6114838957055217e-06, + "objective/entropy": 90.2380599975586, + "objective/kl": 4.557397842407227, + "objective/non_score_reward": -0.22786986827850342, + "objective/rlhf_reward": -1.367219254374504, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.238430023193359, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3818359375, + "step": 1351, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998949766159058 + }, + { + "episode": 32472, + "epoch": 0.06485242837113996, + "loss/policy_avg": 0.07446356117725372, + "lr": 2.611196319018405e-06, + "objective/entropy": 106.80180358886719, + "objective/kl": 2.6467394828796387, + "objective/non_score_reward": -0.13233698904514313, + "objective/rlhf_reward": 1.0987673487952556, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 2.1421008110046387, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.470703125, + "step": 1352, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003883838653564 + }, + { + "episode": 32496, + "epoch": 0.06490036069070472, + "loss/policy_avg": 0.07111359387636185, + "lr": 2.6109087423312886e-06, + "objective/entropy": 95.77000427246094, + "objective/kl": 6.172967910766602, + "objective/non_score_reward": -0.3086484372615814, + "objective/rlhf_reward": -1.851890504360199, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.224421977996826, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.439453125, + "step": 1353, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9974708557128906 + }, + { + "episode": 32520, + "epoch": 0.0649482930102695, + "loss/policy_avg": 0.2179809808731079, + "lr": 2.610621165644172e-06, + "objective/entropy": 127.98265075683594, + "objective/kl": 2.3054966926574707, + "objective/non_score_reward": -0.1152748316526413, + "objective/rlhf_reward": 5.308351095765829, + "objective/scores": 1.0, + "policy/approxkl_avg": 1.7118573188781738, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.568359375, + "step": 1354, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006680488586426 + }, + { + "episode": 32544, + "epoch": 0.06499622532983428, + "loss/policy_avg": 0.0656900405883789, + "lr": 2.6103335889570554e-06, + "objective/entropy": 103.17464447021484, + "objective/kl": 5.044975280761719, + "objective/non_score_reward": -0.2522487938404083, + "objective/rlhf_reward": 0.4865073561668395, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.187143325805664, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.48046875, + "step": 1355, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997802734375 + }, + { + "episode": 32568, + "epoch": 0.06504415764939905, + "loss/policy_avg": 0.03854365646839142, + "lr": 2.6100460122699384e-06, + "objective/entropy": 114.8079833984375, + "objective/kl": 2.7882227897644043, + "objective/non_score_reward": -0.13941113650798798, + "objective/rlhf_reward": -0.8364667892456055, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.049149513244629, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.56640625, + "step": 1356, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9971120357513428 + }, + { + "episode": 32592, + "epoch": 0.06509208996896382, + "loss/policy_avg": -0.001616935827769339, + "lr": 2.609758435582822e-06, + "objective/entropy": 97.03948974609375, + "objective/kl": 3.8948066234588623, + "objective/non_score_reward": -0.19474029541015625, + "objective/rlhf_reward": 4.831558130681515, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.813641309738159, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4306640625, + "step": 1357, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011754035949707 + }, + { + "episode": 32616, + "epoch": 0.0651400222885286, + "loss/policy_avg": 0.05270025134086609, + "lr": 2.6094708588957053e-06, + "objective/entropy": 78.02069091796875, + "objective/kl": 5.840737819671631, + "objective/non_score_reward": -0.29203692078590393, + "objective/rlhf_reward": 4.247778534889221, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.0287604331970215, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4091796875, + "step": 1358, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998793601989746 + }, + { + "episode": 32640, + "epoch": 0.06518795460809337, + "loss/policy_avg": 0.03918418288230896, + "lr": 2.609183282208589e-06, + "objective/entropy": 91.18402099609375, + "objective/kl": 3.162820816040039, + "objective/non_score_reward": -0.15814104676246643, + "objective/rlhf_reward": 1.6352131162943335, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.7373623847961426, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4375, + "step": 1359, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0020952224731445 + }, + { + "episode": 32664, + "epoch": 0.06523588692765815, + "loss/policy_avg": 0.051568225026130676, + "lr": 2.6088957055214726e-06, + "objective/entropy": 90.292724609375, + "objective/kl": 5.999013900756836, + "objective/non_score_reward": -0.2999506890773773, + "objective/rlhf_reward": -1.799704149365425, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8458778858184814, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.400390625, + "step": 1360, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000713586807251 + }, + { + "episode": 32688, + "epoch": 0.06528381924722292, + "loss/policy_avg": 0.12354453653097153, + "lr": 2.608608128834356e-06, + "objective/entropy": 102.63201904296875, + "objective/kl": 8.554819107055664, + "objective/non_score_reward": -0.42774099111557007, + "objective/rlhf_reward": -0.6736566151985321, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 3.8814802169799805, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4326171875, + "step": 1361, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989420175552368 + }, + { + "episode": 32712, + "epoch": 0.0653317515667877, + "loss/policy_avg": 0.04366031289100647, + "lr": 2.6083205521472394e-06, + "objective/entropy": 85.3880844116211, + "objective/kl": 5.355445861816406, + "objective/non_score_reward": -0.2677723169326782, + "objective/rlhf_reward": 0.9774254989983532, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.9072072505950928, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.396484375, + "step": 1362, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999511241912842 + }, + { + "episode": 32736, + "epoch": 0.06537968388635247, + "loss/policy_avg": 0.19177278876304626, + "lr": 2.608032975460123e-06, + "objective/entropy": 98.9887924194336, + "objective/kl": 6.5472612380981445, + "objective/non_score_reward": -0.3273630440235138, + "objective/rlhf_reward": -1.9641781896352768, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3017561435699463, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.515625, + "step": 1363, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984064102172852 + }, + { + "episode": 32760, + "epoch": 0.06542761620591725, + "loss/policy_avg": 0.11050555855035782, + "lr": 2.6077453987730063e-06, + "objective/entropy": 91.56581115722656, + "objective/kl": 5.335423946380615, + "objective/non_score_reward": -0.26677119731903076, + "objective/rlhf_reward": 1.399372909218073, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.432878017425537, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4052734375, + "step": 1364, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000199317932129 + }, + { + "episode": 32784, + "epoch": 0.06547554852548201, + "loss/policy_avg": 0.03586283326148987, + "lr": 2.6074578220858897e-06, + "objective/entropy": 87.25493621826172, + "objective/kl": 5.117606163024902, + "objective/non_score_reward": -0.25588032603263855, + "objective/rlhf_reward": -1.535281926393509, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.5879364013671875, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4033203125, + "step": 1365, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996549367904663 + }, + { + "episode": 32808, + "epoch": 0.06552348084504679, + "loss/policy_avg": 0.1283959001302719, + "lr": 2.607170245398773e-06, + "objective/entropy": 67.06230163574219, + "objective/kl": 2.7471330165863037, + "objective/non_score_reward": -0.13735665380954742, + "objective/rlhf_reward": -0.8241398632526398, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.779158115386963, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3154296875, + "step": 1366, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010030269622803 + }, + { + "episode": 32832, + "epoch": 0.06557141316461157, + "loss/policy_avg": 0.17031389474868774, + "lr": 2.6068826687116565e-06, + "objective/entropy": 106.9666748046875, + "objective/kl": 6.674169063568115, + "objective/non_score_reward": -0.3337084650993347, + "objective/rlhf_reward": -2.0022506713867188, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.116431713104248, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4677734375, + "step": 1367, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016651153564453 + }, + { + "episode": 32856, + "epoch": 0.06561934548417635, + "loss/policy_avg": 0.19892564415931702, + "lr": 2.60659509202454e-06, + "objective/entropy": 68.68580627441406, + "objective/kl": 4.998177528381348, + "objective/non_score_reward": -0.24990886449813843, + "objective/rlhf_reward": -1.4994531497359276, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.094000816345215, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3974609375, + "step": 1368, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999838948249817 + }, + { + "episode": 32880, + "epoch": 0.06566727780374111, + "loss/policy_avg": 0.09912502020597458, + "lr": 2.6063075153374234e-06, + "objective/entropy": 85.97239685058594, + "objective/kl": 4.520744323730469, + "objective/non_score_reward": -0.2260371893644333, + "objective/rlhf_reward": -1.356223151087761, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.475698709487915, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.3720703125, + "step": 1369, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000351905822754 + }, + { + "episode": 32904, + "epoch": 0.06571521012330589, + "loss/policy_avg": 0.2040978968143463, + "lr": 2.606019938650307e-06, + "objective/entropy": 117.50780487060547, + "objective/kl": 2.3438656330108643, + "objective/non_score_reward": -0.11719328165054321, + "objective/rlhf_reward": 3.082418809173744, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 1.860076904296875, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.5009765625, + "step": 1370, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002192974090576 + }, + { + "episode": 32928, + "epoch": 0.06576314244287067, + "loss/policy_avg": 0.0888609066605568, + "lr": 2.6057323619631902e-06, + "objective/entropy": 94.08699798583984, + "objective/kl": 5.301280498504639, + "objective/non_score_reward": -0.2650640606880188, + "objective/rlhf_reward": 0.5468589224327934, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.3727593421936035, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4091796875, + "step": 1371, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996204376220703 + }, + { + "episode": 32952, + "epoch": 0.06581107476243545, + "loss/policy_avg": 0.09069764614105225, + "lr": 2.6054447852760737e-06, + "objective/entropy": 102.09246826171875, + "objective/kl": 7.268484115600586, + "objective/non_score_reward": -0.36342424154281616, + "objective/rlhf_reward": 1.605033206282299, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.817713499069214, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4580078125, + "step": 1372, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979534149169922 + }, + { + "episode": 32976, + "epoch": 0.06585900708200021, + "loss/policy_avg": 0.014489313587546349, + "lr": 2.605157208588957e-06, + "objective/entropy": 84.86322021484375, + "objective/kl": 3.3609704971313477, + "objective/non_score_reward": -0.16804853081703186, + "objective/rlhf_reward": 1.5757682156922312, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.127777099609375, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3994140625, + "step": 1373, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.003589153289795 + }, + { + "episode": 33000, + "epoch": 0.06590693940156499, + "loss/policy_avg": -0.011523153632879257, + "lr": 2.6048696319018405e-06, + "objective/entropy": 72.83537292480469, + "objective/kl": 5.563187599182129, + "objective/non_score_reward": -0.27815932035446167, + "objective/rlhf_reward": -1.6689559668302536, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.060041904449463, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.37890625, + "step": 1374, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.999441146850586 + }, + { + "episode": 33024, + "epoch": 0.06595487172112977, + "loss/policy_avg": 0.040261879563331604, + "lr": 2.604582055214724e-06, + "objective/entropy": 105.38060760498047, + "objective/kl": 2.6088054180145264, + "objective/non_score_reward": -0.13044026494026184, + "objective/rlhf_reward": -0.7826415598392487, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.287054061889648, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4716796875, + "step": 1375, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981200695037842 + }, + { + "episode": 33048, + "epoch": 0.06600280404069454, + "loss/policy_avg": -0.0019086739048361778, + "lr": 2.6042944785276074e-06, + "objective/entropy": 74.65461730957031, + "objective/kl": 6.227200508117676, + "objective/non_score_reward": -0.31136006116867065, + "objective/rlhf_reward": 1.1318397894501686, + "objective/scores": 0.5, + "policy/approxkl_avg": 4.067878723144531, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4287109375, + "step": 1376, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.999839425086975 + }, + { + "episode": 33072, + "epoch": 0.06605073636025931, + "loss/policy_avg": 0.03161351755261421, + "lr": 2.604006901840491e-06, + "objective/entropy": 97.2342529296875, + "objective/kl": 4.1853156089782715, + "objective/non_score_reward": -0.20926575362682343, + "objective/rlhf_reward": -1.2555944621562958, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.165683746337891, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4404296875, + "step": 1377, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991481304168701 + }, + { + "episode": 33096, + "epoch": 0.06609866867982409, + "loss/policy_avg": 0.08545918017625809, + "lr": 2.6037193251533742e-06, + "objective/entropy": 98.6531982421875, + "objective/kl": 9.163344383239746, + "objective/non_score_reward": -0.45816725492477417, + "objective/rlhf_reward": -2.7490032613277435, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.264213562011719, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.427734375, + "step": 1378, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9977924823760986 + }, + { + "episode": 33120, + "epoch": 0.06614660099938886, + "loss/policy_avg": 0.015985311940312386, + "lr": 2.6034317484662577e-06, + "objective/entropy": 108.47671508789062, + "objective/kl": 5.595698356628418, + "objective/non_score_reward": -0.2797848880290985, + "objective/rlhf_reward": -1.678709302097559, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.962214946746826, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4755859375, + "step": 1379, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986369609832764 + }, + { + "episode": 33144, + "epoch": 0.06619453331895364, + "loss/policy_avg": 0.02728349156677723, + "lr": 2.603144171779141e-06, + "objective/entropy": 118.55717468261719, + "objective/kl": 4.15341854095459, + "objective/non_score_reward": -0.2076709270477295, + "objective/rlhf_reward": 1.3380338308574649, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.3887948989868164, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.5234375, + "step": 1380, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976143836975098 + }, + { + "episode": 33168, + "epoch": 0.0662424656385184, + "loss/policy_avg": 0.00016862107440829277, + "lr": 2.6028565950920245e-06, + "objective/entropy": 129.0310516357422, + "objective/kl": 1.1698154211044312, + "objective/non_score_reward": -0.05849076807498932, + "objective/rlhf_reward": 1.9701722256440881, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.9473705291748047, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.55859375, + "step": 1381, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002918243408203 + }, + { + "episode": 33192, + "epoch": 0.06629039795808318, + "loss/policy_avg": 0.029599687084555626, + "lr": 2.602569018404908e-06, + "objective/entropy": 150.34585571289062, + "objective/kl": 5.149758338928223, + "objective/non_score_reward": -0.2574878931045532, + "objective/rlhf_reward": -1.544927440583706, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.393131732940674, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.611328125, + "step": 1382, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0041825771331787 + }, + { + "episode": 33216, + "epoch": 0.06633833027764796, + "loss/policy_avg": -0.04218612611293793, + "lr": 2.6022814417177918e-06, + "objective/entropy": 90.6946029663086, + "objective/kl": 6.374307632446289, + "objective/non_score_reward": -0.3187154233455658, + "objective/rlhf_reward": -1.9122924953699112, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.7078490257263184, + "policy/clipfrac_avg": 1.8333333730697632, + "policy/entropy_avg": 0.404296875, + "step": 1383, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015878677368164 + }, + { + "episode": 33240, + "epoch": 0.06638626259721274, + "loss/policy_avg": 0.4187643527984619, + "lr": 2.601993865030675e-06, + "objective/entropy": 96.91001892089844, + "objective/kl": 7.268664836883545, + "objective/non_score_reward": -0.36343324184417725, + "objective/rlhf_reward": 0.14051732528696093, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 4.223344326019287, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4375, + "step": 1384, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995002746582031 + }, + { + "episode": 33264, + "epoch": 0.06643419491677752, + "loss/policy_avg": 0.09388434141874313, + "lr": 2.6017062883435586e-06, + "objective/entropy": 78.95474243164062, + "objective/kl": 4.720288276672363, + "objective/non_score_reward": -0.2360144555568695, + "objective/rlhf_reward": -1.4160866253077984, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4682259559631348, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.37109375, + "step": 1385, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9992122650146484 + }, + { + "episode": 33288, + "epoch": 0.06648212723634228, + "loss/policy_avg": 0.02297035977244377, + "lr": 2.6014187116564416e-06, + "objective/entropy": 94.39227294921875, + "objective/kl": 5.389701843261719, + "objective/non_score_reward": -0.26948511600494385, + "objective/rlhf_reward": 4.383089154958725, + "objective/scores": 1.0, + "policy/approxkl_avg": 4.663118362426758, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4248046875, + "step": 1386, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976584911346436 + }, + { + "episode": 33312, + "epoch": 0.06653005955590706, + "loss/policy_avg": 0.06768133491277695, + "lr": 2.601131134969325e-06, + "objective/entropy": 144.0038604736328, + "objective/kl": 5.392185211181641, + "objective/non_score_reward": -0.269609272480011, + "objective/rlhf_reward": 0.18852439870846605, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.348577499389648, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.64453125, + "step": 1387, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0003457069396973 + }, + { + "episode": 33336, + "epoch": 0.06657799187547184, + "loss/policy_avg": 0.12620383501052856, + "lr": 2.6008435582822085e-06, + "objective/entropy": 86.67352294921875, + "objective/kl": 2.297011375427246, + "objective/non_score_reward": -0.11485057324171066, + "objective/rlhf_reward": 5.310896623879671, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.6959307193756104, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3857421875, + "step": 1388, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0028018951416016 + }, + { + "episode": 33360, + "epoch": 0.06662592419503661, + "loss/policy_avg": 0.1073627844452858, + "lr": 2.600555981595092e-06, + "objective/entropy": 115.2448959350586, + "objective/kl": 4.616235256195068, + "objective/non_score_reward": -0.23081175982952118, + "objective/rlhf_reward": 0.5079187687924709, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 6.586090564727783, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.525390625, + "step": 1389, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000321388244629 + }, + { + "episode": 33384, + "epoch": 0.06667385651460138, + "loss/policy_avg": 0.04096933454275131, + "lr": 2.6002684049079753e-06, + "objective/entropy": 75.98558044433594, + "objective/kl": 5.880553245544434, + "objective/non_score_reward": -0.2940276861190796, + "objective/rlhf_reward": 0.04201381256592607, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.4115543365478516, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.365234375, + "step": 1390, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9986741542816162 + }, + { + "episode": 33408, + "epoch": 0.06672178883416616, + "loss/policy_avg": 0.12316790223121643, + "lr": 2.5999808282208588e-06, + "objective/entropy": 125.7945556640625, + "objective/kl": 5.502284049987793, + "objective/non_score_reward": -0.27511417865753174, + "objective/rlhf_reward": -1.6506849825382233, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8648788928985596, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.544921875, + "step": 1391, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983718395233154 + }, + { + "episode": 33432, + "epoch": 0.06676972115373093, + "loss/policy_avg": 0.21656648814678192, + "lr": 2.599693251533742e-06, + "objective/entropy": 65.46668243408203, + "objective/kl": 4.128106594085693, + "objective/non_score_reward": -0.20640531182289124, + "objective/rlhf_reward": -1.2384318709373474, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.1938377618789673, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.30078125, + "step": 1392, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0015149116516113 + }, + { + "episode": 33456, + "epoch": 0.06681765347329571, + "loss/policy_avg": 0.036553479731082916, + "lr": 2.599405674846626e-06, + "objective/entropy": 111.22920227050781, + "objective/kl": 3.7709527015686035, + "objective/non_score_reward": -0.18854761123657227, + "objective/rlhf_reward": 1.868714228272438, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.5418050289154053, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4970703125, + "step": 1393, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0006370544433594 + }, + { + "episode": 33480, + "epoch": 0.06686558579286048, + "loss/policy_avg": 0.0527520552277565, + "lr": 2.5991180981595095e-06, + "objective/entropy": 74.17001342773438, + "objective/kl": 5.074398994445801, + "objective/non_score_reward": -0.2537199556827545, + "objective/rlhf_reward": -1.5223196595907211, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.191809892654419, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3251953125, + "step": 1394, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013375282287598 + }, + { + "episode": 33504, + "epoch": 0.06691351811242525, + "loss/policy_avg": 0.00519383093342185, + "lr": 2.598830521472393e-06, + "objective/entropy": 134.46929931640625, + "objective/kl": 5.153677940368652, + "objective/non_score_reward": -0.2576839327812195, + "objective/rlhf_reward": 0.5911396824230087, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.871560573577881, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.56640625, + "step": 1395, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0039682388305664 + }, + { + "episode": 33528, + "epoch": 0.06696145043199003, + "loss/policy_avg": 0.1022026464343071, + "lr": 2.5985429447852763e-06, + "objective/entropy": 93.356201171875, + "objective/kl": 5.56543493270874, + "objective/non_score_reward": -0.27827176451683044, + "objective/rlhf_reward": 2.1159480237347297, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.42728590965271, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4140625, + "step": 1396, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987272024154663 + }, + { + "episode": 33552, + "epoch": 0.06700938275155481, + "loss/policy_avg": 0.49827635288238525, + "lr": 2.5982553680981597e-06, + "objective/entropy": 83.48590087890625, + "objective/kl": 5.482119560241699, + "objective/non_score_reward": -0.2741059958934784, + "objective/rlhf_reward": 0.3553640395402907, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 3.6320877075195312, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.392578125, + "step": 1397, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002046585083008 + }, + { + "episode": 33576, + "epoch": 0.06705731507111957, + "loss/policy_avg": 0.02976522035896778, + "lr": 2.597967791411043e-06, + "objective/entropy": 105.22001647949219, + "objective/kl": 6.218513488769531, + "objective/non_score_reward": -0.31092569231987, + "objective/rlhf_reward": -1.865554079413414, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.0038633346557617, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4716796875, + "step": 1398, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9989030361175537 + }, + { + "episode": 33600, + "epoch": 0.06710524739068435, + "loss/policy_avg": 0.05654367804527283, + "lr": 2.5976802147239266e-06, + "objective/entropy": 100.26164245605469, + "objective/kl": 6.3029985427856445, + "objective/non_score_reward": -0.3151499032974243, + "objective/rlhf_reward": -1.890899509191513, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.539277076721191, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.44921875, + "step": 1399, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.996795654296875 + }, + { + "episode": 33624, + "epoch": 0.06715317971024913, + "loss/policy_avg": -0.0034245839342474937, + "lr": 2.59739263803681e-06, + "objective/entropy": 109.46212005615234, + "objective/kl": 5.052888870239258, + "objective/non_score_reward": -0.25264444947242737, + "objective/rlhf_reward": 4.4841334372758865, + "objective/scores": 1.0, + "policy/approxkl_avg": 3.765136241912842, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.482421875, + "step": 1400, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9994128942489624 + }, + { + "episode": 33648, + "epoch": 0.06720111202981391, + "loss/policy_avg": 0.07872577756643295, + "lr": 2.597105061349693e-06, + "objective/entropy": 82.34791564941406, + "objective/kl": 4.619797706604004, + "objective/non_score_reward": -0.23098987340927124, + "objective/rlhf_reward": -1.3859393820166588, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.9672412872314453, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3701171875, + "step": 1401, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972496032714844 + }, + { + "episode": 33672, + "epoch": 0.06724904434937867, + "loss/policy_avg": 0.00438174232840538, + "lr": 2.5968174846625764e-06, + "objective/entropy": 88.85955810546875, + "objective/kl": 7.364049434661865, + "objective/non_score_reward": -0.3682025372982025, + "objective/rlhf_reward": -0.4748259976609863, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.37911319732666, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4130859375, + "step": 1402, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981516599655151 + }, + { + "episode": 33696, + "epoch": 0.06729697666894345, + "loss/policy_avg": 0.05090124160051346, + "lr": 2.59652990797546e-06, + "objective/entropy": 86.2366943359375, + "objective/kl": 6.119088172912598, + "objective/non_score_reward": -0.3059543967247009, + "objective/rlhf_reward": -1.8357264176011086, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.021100044250488, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.375, + "step": 1403, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998653531074524 + }, + { + "episode": 33720, + "epoch": 0.06734490898850823, + "loss/policy_avg": 0.037426065653562546, + "lr": 2.5962423312883437e-06, + "objective/entropy": 104.0465316772461, + "objective/kl": 5.706657409667969, + "objective/non_score_reward": -0.2853328585624695, + "objective/rlhf_reward": -1.711997076869011, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.406907320022583, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4892578125, + "step": 1404, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.002781867980957 + }, + { + "episode": 33744, + "epoch": 0.067392841308073, + "loss/policy_avg": 0.02978472411632538, + "lr": 2.595954754601227e-06, + "objective/entropy": 98.96080017089844, + "objective/kl": 6.598153591156006, + "objective/non_score_reward": -0.32990768551826477, + "objective/rlhf_reward": -1.9794460833072662, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.886173248291016, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4501953125, + "step": 1405, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9976110458374023 + }, + { + "episode": 33768, + "epoch": 0.06744077362763777, + "loss/policy_avg": 0.05682896822690964, + "lr": 2.5956671779141106e-06, + "objective/entropy": 72.5447998046875, + "objective/kl": 4.776078224182129, + "objective/non_score_reward": -0.2388039082288742, + "objective/rlhf_reward": 2.3527551316601447, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 3.4145843982696533, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.322265625, + "step": 1406, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9978222846984863 + }, + { + "episode": 33792, + "epoch": 0.06748870594720255, + "loss/policy_avg": 0.012007246725261211, + "lr": 2.595379601226994e-06, + "objective/entropy": 99.11837005615234, + "objective/kl": 4.855345726013184, + "objective/non_score_reward": -0.2427673041820526, + "objective/rlhf_reward": -1.456603690981865, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.49260139465332, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.439453125, + "step": 1407, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9972301721572876 + }, + { + "episode": 33816, + "epoch": 0.06753663826676733, + "loss/policy_avg": 0.05474459007382393, + "lr": 2.5950920245398774e-06, + "objective/entropy": 123.82810974121094, + "objective/kl": 0.9452492594718933, + "objective/non_score_reward": -0.047262467443943024, + "objective/rlhf_reward": 2.0375420722712043, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 1.4785712957382202, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.490234375, + "step": 1408, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004955768585205 + }, + { + "episode": 33840, + "epoch": 0.0675845705863321, + "loss/policy_avg": -0.01786312833428383, + "lr": 2.594804447852761e-06, + "objective/entropy": 67.93241882324219, + "objective/kl": 3.934445381164551, + "objective/non_score_reward": -0.19672226905822754, + "objective/rlhf_reward": 1.8196664871647954, + "objective/scores": 0.5, + "policy/approxkl_avg": 0.8841530084609985, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.306640625, + "step": 1409, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0022339820861816 + }, + { + "episode": 33864, + "epoch": 0.06763250290589687, + "loss/policy_avg": -0.04287567734718323, + "lr": 2.5945168711656443e-06, + "objective/entropy": 78.1019058227539, + "objective/kl": 4.276393890380859, + "objective/non_score_reward": -0.21381966769695282, + "objective/rlhf_reward": -1.2829179838299751, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.342831611633301, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3603515625, + "step": 1410, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000868558883667 + }, + { + "episode": 33888, + "epoch": 0.06768043522546165, + "loss/policy_avg": 0.031157249584794044, + "lr": 2.5942292944785277e-06, + "objective/entropy": 123.28905487060547, + "objective/kl": 4.0282182693481445, + "objective/non_score_reward": -0.2014109343290329, + "objective/rlhf_reward": 0.5259234636918388, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 4.993950843811035, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.53515625, + "step": 1411, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9981389045715332 + }, + { + "episode": 33912, + "epoch": 0.06772836754502642, + "loss/policy_avg": 0.05513560771942139, + "lr": 2.593941717791411e-06, + "objective/entropy": 108.43184661865234, + "objective/kl": 5.230996608734131, + "objective/non_score_reward": -0.26154983043670654, + "objective/rlhf_reward": -1.5692988634109497, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.0958635807037354, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4912109375, + "step": 1412, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0033774375915527 + }, + { + "episode": 33936, + "epoch": 0.0677762998645912, + "loss/policy_avg": 0.04232683777809143, + "lr": 2.5936541411042946e-06, + "objective/entropy": 100.48271942138672, + "objective/kl": 3.6520838737487793, + "objective/non_score_reward": -0.18260419368743896, + "objective/rlhf_reward": -1.095625177025795, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.4248311519622803, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.466796875, + "step": 1413, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9986686706542969 + }, + { + "episode": 33960, + "epoch": 0.06782423218415597, + "loss/policy_avg": 0.17961996793746948, + "lr": 2.593366564417178e-06, + "objective/entropy": 91.21055603027344, + "objective/kl": 9.853336334228516, + "objective/non_score_reward": -0.4926668405532837, + "objective/rlhf_reward": -2.956001043319702, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.469003677368164, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4404296875, + "step": 1414, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.994482398033142 + }, + { + "episode": 33984, + "epoch": 0.06787216450372074, + "loss/policy_avg": 0.007178165018558502, + "lr": 2.5930789877300614e-06, + "objective/entropy": 80.70498657226562, + "objective/kl": 3.929243803024292, + "objective/non_score_reward": -0.19646219909191132, + "objective/rlhf_reward": 0.6274068092347417, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.006504535675049, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4306640625, + "step": 1415, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.0010766983032227 + }, + { + "episode": 34008, + "epoch": 0.06792009682328552, + "loss/policy_avg": 0.14008145034313202, + "lr": 2.592791411042945e-06, + "objective/entropy": 119.53941345214844, + "objective/kl": 3.6904120445251465, + "objective/non_score_reward": -0.18452058732509613, + "objective/rlhf_reward": 0.7856657162746992, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.566697120666504, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.525390625, + "step": 1416, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9982774257659912 + }, + { + "episode": 34032, + "epoch": 0.0679680291428503, + "loss/policy_avg": -0.024379784241318703, + "lr": 2.5925038343558283e-06, + "objective/entropy": 84.75426483154297, + "objective/kl": 4.753249168395996, + "objective/non_score_reward": -0.23766246438026428, + "objective/rlhf_reward": -1.425974752753973, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.8887667655944824, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3984375, + "step": 1417, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002364158630371 + }, + { + "episode": 34056, + "epoch": 0.06801596146241506, + "loss/policy_avg": 0.022863183170557022, + "lr": 2.5922162576687117e-06, + "objective/entropy": 116.04048156738281, + "objective/kl": 4.952768802642822, + "objective/non_score_reward": -0.24763844907283783, + "objective/rlhf_reward": -1.4858306497335434, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.6976138353347778, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4912109375, + "step": 1418, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.00075626373291 + }, + { + "episode": 34080, + "epoch": 0.06806389378197984, + "loss/policy_avg": 0.02748348005115986, + "lr": 2.591928680981595e-06, + "objective/entropy": 97.93502044677734, + "objective/kl": 4.2337751388549805, + "objective/non_score_reward": -0.21168875694274902, + "objective/rlhf_reward": -1.2701325416564941, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.788329839706421, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.41796875, + "step": 1419, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000791072845459 + }, + { + "episode": 34104, + "epoch": 0.06811182610154462, + "loss/policy_avg": 0.02132336050271988, + "lr": 2.5916411042944785e-06, + "objective/entropy": 82.29376220703125, + "objective/kl": 4.056059837341309, + "objective/non_score_reward": -0.20280301570892334, + "objective/rlhf_reward": 0.6759712186148967, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.567381501197815, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.400390625, + "step": 1420, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010528564453125 + }, + { + "episode": 34128, + "epoch": 0.0681597584211094, + "loss/policy_avg": 0.03804056718945503, + "lr": 2.591353527607362e-06, + "objective/entropy": 131.31771850585938, + "objective/kl": 2.740090847015381, + "objective/non_score_reward": -0.137004554271698, + "objective/rlhf_reward": -0.8220272995531559, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5226576328277588, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.611328125, + "step": 1421, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9999125003814697 + }, + { + "episode": 34152, + "epoch": 0.06820769074067416, + "loss/policy_avg": 0.11993257701396942, + "lr": 2.5910659509202454e-06, + "objective/entropy": 93.90188598632812, + "objective/kl": 5.575595855712891, + "objective/non_score_reward": -0.2787798047065735, + "objective/rlhf_reward": 0.6484380002666477, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 6.058849334716797, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3994140625, + "step": 1422, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9952415227890015 + }, + { + "episode": 34176, + "epoch": 0.06825562306023894, + "loss/policy_avg": 0.07149292528629303, + "lr": 2.590778374233129e-06, + "objective/entropy": 92.423095703125, + "objective/kl": 6.4447736740112305, + "objective/non_score_reward": -0.3222387135028839, + "objective/rlhf_reward": -0.0406429606982861, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 4.085061550140381, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4130859375, + "step": 1423, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9990143775939941 + }, + { + "episode": 34200, + "epoch": 0.06830355537980372, + "loss/policy_avg": 0.003470185212790966, + "lr": 2.5904907975460122e-06, + "objective/entropy": 117.93185424804688, + "objective/kl": 6.002601623535156, + "objective/non_score_reward": -0.30013009905815125, + "objective/rlhf_reward": -1.8007804453372955, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.518758535385132, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.484375, + "step": 1424, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0005431175231934 + }, + { + "episode": 34224, + "epoch": 0.0683514876993685, + "loss/policy_avg": 0.033859334886074066, + "lr": 2.5902032208588957e-06, + "objective/entropy": 92.97808837890625, + "objective/kl": 4.991058826446533, + "objective/non_score_reward": -0.24955295026302338, + "objective/rlhf_reward": -1.4973176456987858, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.230112552642822, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4921875, + "step": 1425, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998000144958496 + }, + { + "episode": 34248, + "epoch": 0.06839942001893326, + "loss/policy_avg": 0.01976745016872883, + "lr": 2.589915644171779e-06, + "objective/entropy": 110.7061767578125, + "objective/kl": 3.275402784347534, + "objective/non_score_reward": -0.16377012431621552, + "objective/rlhf_reward": -0.9826207309961319, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.673046112060547, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.484375, + "step": 1426, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988024234771729 + }, + { + "episode": 34272, + "epoch": 0.06844735233849804, + "loss/policy_avg": 0.018042225390672684, + "lr": 2.589628067484663e-06, + "objective/entropy": 101.22933197021484, + "objective/kl": 4.211112976074219, + "objective/non_score_reward": -0.21055565774440765, + "objective/rlhf_reward": 1.736666053533554, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.485417127609253, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.416015625, + "step": 1427, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9996126890182495 + }, + { + "episode": 34296, + "epoch": 0.06849528465806282, + "loss/policy_avg": 0.02018706314265728, + "lr": 2.5893404907975464e-06, + "objective/entropy": 88.17574310302734, + "objective/kl": 2.8210787773132324, + "objective/non_score_reward": -0.1410539448261261, + "objective/rlhf_reward": -0.8463236838579178, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.829869270324707, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3759765625, + "step": 1428, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0013885498046875 + }, + { + "episode": 34320, + "epoch": 0.0685432169776276, + "loss/policy_avg": 0.040521323680877686, + "lr": 2.58905291411043e-06, + "objective/entropy": 110.65584564208984, + "objective/kl": 4.178350448608398, + "objective/non_score_reward": -0.2089175283908844, + "objective/rlhf_reward": 1.0676117475677493, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.665154218673706, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.48828125, + "step": 1429, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9988064765930176 + }, + { + "episode": 34344, + "epoch": 0.06859114929719236, + "loss/policy_avg": 0.0591043159365654, + "lr": 2.588765337423313e-06, + "objective/entropy": 113.03009033203125, + "objective/kl": 5.0230255126953125, + "objective/non_score_reward": -0.25115126371383667, + "objective/rlhf_reward": 0.2992723618985448, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.7814478874206543, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.619140625, + "step": 1430, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001081943511963 + }, + { + "episode": 34368, + "epoch": 0.06863908161675714, + "loss/policy_avg": 0.2257043868303299, + "lr": 2.5884777607361962e-06, + "objective/entropy": 119.893798828125, + "objective/kl": 3.5670413970947266, + "objective/non_score_reward": -0.17835205793380737, + "objective/rlhf_reward": -1.070112313143909, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8471159934997559, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.513671875, + "step": 1431, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025649070739746 + }, + { + "episode": 34392, + "epoch": 0.06868701393632191, + "loss/policy_avg": 0.039100125432014465, + "lr": 2.5881901840490797e-06, + "objective/entropy": 145.53411865234375, + "objective/kl": 4.78700065612793, + "objective/non_score_reward": -0.23935005068778992, + "objective/rlhf_reward": 0.3700796847583089, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.614152431488037, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.6484375, + "step": 1432, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970035552978516 + }, + { + "episode": 34416, + "epoch": 0.06873494625588669, + "loss/policy_avg": 0.06348354369401932, + "lr": 2.587902607361963e-06, + "objective/entropy": 100.03976440429688, + "objective/kl": 7.370501518249512, + "objective/non_score_reward": -0.36852502822875977, + "objective/rlhf_reward": -0.21115029603242885, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 6.264249801635742, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.435546875, + "step": 1433, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967397451400757 + }, + { + "episode": 34440, + "epoch": 0.06878287857545146, + "loss/policy_avg": 0.008168239146471024, + "lr": 2.5876150306748465e-06, + "objective/entropy": 94.73725891113281, + "objective/kl": 8.84080696105957, + "objective/non_score_reward": -0.4420403838157654, + "objective/rlhf_reward": -0.9178533151849426, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.3391332626342773, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.40625, + "step": 1434, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001249313354492 + }, + { + "episode": 34464, + "epoch": 0.06883081089501623, + "loss/policy_avg": 0.09465286135673523, + "lr": 2.58732745398773e-06, + "objective/entropy": 107.34022521972656, + "objective/kl": 3.9168624877929688, + "objective/non_score_reward": -0.19584310054779053, + "objective/rlhf_reward": -1.1750586479902267, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.2820329666137695, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4755859375, + "step": 1435, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9998905658721924 + }, + { + "episode": 34488, + "epoch": 0.06887874321458101, + "loss/policy_avg": 0.015077384188771248, + "lr": 2.5870398773006134e-06, + "objective/entropy": 88.65779876708984, + "objective/kl": 2.9657485485076904, + "objective/non_score_reward": -0.14828743040561676, + "objective/rlhf_reward": -0.8897245898842812, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.224133491516113, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4072265625, + "step": 1436, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9984710216522217 + }, + { + "episode": 34512, + "epoch": 0.06892667553414579, + "loss/policy_avg": 0.04555100202560425, + "lr": 2.5867523006134968e-06, + "objective/entropy": 62.55036163330078, + "objective/kl": 6.949080944061279, + "objective/non_score_reward": -0.3474540412425995, + "objective/rlhf_reward": 0.4993350935341808, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.868528366088867, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.365234375, + "step": 1437, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 2, + "val/ratio": 1.9979299306869507 + }, + { + "episode": 34536, + "epoch": 0.06897460785371055, + "loss/policy_avg": 0.15953052043914795, + "lr": 2.5864647239263806e-06, + "objective/entropy": 91.1173324584961, + "objective/kl": 4.441897392272949, + "objective/non_score_reward": -0.22209487855434418, + "objective/rlhf_reward": 1.251490125543067, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.2262086868286133, + "policy/clipfrac_avg": 0.5, + "policy/entropy_avg": 0.4189453125, + "step": 1438, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997864961624146 + }, + { + "episode": 34560, + "epoch": 0.06902254017327533, + "loss/policy_avg": 0.20050762593746185, + "lr": 2.586177147239264e-06, + "objective/entropy": 78.59123992919922, + "objective/kl": 5.59735107421875, + "objective/non_score_reward": -0.27986758947372437, + "objective/rlhf_reward": -1.6792053207755089, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7465691566467285, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3740234375, + "step": 1439, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0016374588012695 + }, + { + "episode": 34584, + "epoch": 0.06907047249284011, + "loss/policy_avg": 0.10686106979846954, + "lr": 2.5858895705521475e-06, + "objective/entropy": 88.5713119506836, + "objective/kl": 4.960433006286621, + "objective/non_score_reward": -0.24802164733409882, + "objective/rlhf_reward": -1.4881298895925283, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.146754741668701, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3935546875, + "step": 1440, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975929260253906 + }, + { + "episode": 34608, + "epoch": 0.06911840481240489, + "loss/policy_avg": 0.08237063884735107, + "lr": 2.585601993865031e-06, + "objective/entropy": 94.00505828857422, + "objective/kl": 5.207503318786621, + "objective/non_score_reward": -0.26037514209747314, + "objective/rlhf_reward": -1.5622509196400642, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3334505558013916, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4365234375, + "step": 1441, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008816719055176 + }, + { + "episode": 34632, + "epoch": 0.06916633713196967, + "loss/policy_avg": 0.08189180493354797, + "lr": 2.5853144171779143e-06, + "objective/entropy": 121.50291442871094, + "objective/kl": 3.6441688537597656, + "objective/non_score_reward": -0.18220844864845276, + "objective/rlhf_reward": 1.2278661515165332, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 2.617565631866455, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.52734375, + "step": 1442, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0018184185028076 + }, + { + "episode": 34656, + "epoch": 0.06921426945153443, + "loss/policy_avg": 0.15175575017929077, + "lr": 2.5850268404907978e-06, + "objective/entropy": 88.00129699707031, + "objective/kl": 3.458299160003662, + "objective/non_score_reward": -0.17291495203971863, + "objective/rlhf_reward": 1.5465696324767562, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.462064504623413, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.390625, + "step": 1443, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010623931884766 + }, + { + "episode": 34680, + "epoch": 0.06926220177109921, + "loss/policy_avg": 0.06946873664855957, + "lr": 2.584739263803681e-06, + "objective/entropy": 101.74876403808594, + "objective/kl": 6.03009557723999, + "objective/non_score_reward": -0.30150479078292847, + "objective/rlhf_reward": -1.809028722345829, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.3775649070739746, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4658203125, + "step": 1444, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.998415470123291 + }, + { + "episode": 34704, + "epoch": 0.06931013409066399, + "loss/policy_avg": 0.02795383892953396, + "lr": 2.584451687116564e-06, + "objective/entropy": 104.29426574707031, + "objective/kl": 3.72941255569458, + "objective/non_score_reward": -0.186470627784729, + "objective/rlhf_reward": 1.8811762928962708, + "objective/scores": 0.5, + "policy/approxkl_avg": 2.5245509147644043, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4912109375, + "step": 1445, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0021824836730957 + }, + { + "episode": 34728, + "epoch": 0.06935806641022876, + "loss/policy_avg": 0.07699143886566162, + "lr": 2.5841641104294476e-06, + "objective/entropy": 114.48321533203125, + "objective/kl": 4.6104841232299805, + "objective/non_score_reward": -0.2305242270231247, + "objective/rlhf_reward": 4.616854771971703, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.96440052986145, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.482421875, + "step": 1446, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000107765197754 + }, + { + "episode": 34752, + "epoch": 0.06940599872979353, + "loss/policy_avg": 0.04816842079162598, + "lr": 2.583876533742331e-06, + "objective/entropy": 91.90196990966797, + "objective/kl": 1.7650156021118164, + "objective/non_score_reward": -0.0882507786154747, + "objective/rlhf_reward": 1.2048842824891888, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 5.915712356567383, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.42578125, + "step": 1447, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963626861572266 + }, + { + "episode": 34776, + "epoch": 0.0694539310493583, + "loss/policy_avg": 0.13012641668319702, + "lr": 2.583588957055215e-06, + "objective/entropy": 96.5672836303711, + "objective/kl": 4.34357213973999, + "objective/non_score_reward": -0.21717862784862518, + "objective/rlhf_reward": -1.3030716739594936, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.030134677886963, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4140625, + "step": 1448, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.004404067993164 + }, + { + "episode": 34800, + "epoch": 0.06950186336892308, + "loss/policy_avg": 0.1493099182844162, + "lr": 2.5833013803680983e-06, + "objective/entropy": 108.1253662109375, + "objective/kl": 4.150123119354248, + "objective/non_score_reward": -0.20750615000724792, + "objective/rlhf_reward": -1.2450368255376816, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.166501045227051, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4580078125, + "step": 1449, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9987763166427612 + }, + { + "episode": 34824, + "epoch": 0.06954979568848786, + "loss/policy_avg": -0.0521458238363266, + "lr": 2.5830138036809817e-06, + "objective/entropy": 100.36505126953125, + "objective/kl": 6.778353691101074, + "objective/non_score_reward": -0.33891770243644714, + "objective/rlhf_reward": -2.0335060954093933, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.9995768070220947, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4228515625, + "step": 1450, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000201940536499 + }, + { + "episode": 34848, + "epoch": 0.06959772800805263, + "loss/policy_avg": 0.0291233342140913, + "lr": 2.582726226993865e-06, + "objective/entropy": 100.9482650756836, + "objective/kl": 5.356234550476074, + "objective/non_score_reward": -0.26781171560287476, + "objective/rlhf_reward": 0.19930968036663865, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.006589651107788, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4541015625, + "step": 1451, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000549793243408 + }, + { + "episode": 34872, + "epoch": 0.0696456603276174, + "loss/policy_avg": 0.00544985756278038, + "lr": 2.5824386503067486e-06, + "objective/entropy": 135.50045776367188, + "objective/kl": 6.2594709396362305, + "objective/non_score_reward": -0.3129735589027405, + "objective/rlhf_reward": 0.7062182036401722, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 5.101127624511719, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.533203125, + "step": 1452, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.999699354171753 + }, + { + "episode": 34896, + "epoch": 0.06969359264718218, + "loss/policy_avg": 0.0538046695291996, + "lr": 2.582151073619632e-06, + "objective/entropy": 76.74607849121094, + "objective/kl": 3.604984998703003, + "objective/non_score_reward": -0.18024925887584686, + "objective/rlhf_reward": -1.0814954787492752, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.4103165864944458, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3349609375, + "step": 1453, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0010018348693848 + }, + { + "episode": 34920, + "epoch": 0.06974152496674696, + "loss/policy_avg": 0.044999971985816956, + "lr": 2.5818634969325154e-06, + "objective/entropy": 80.400146484375, + "objective/kl": 3.046128034591675, + "objective/non_score_reward": -0.15230640769004822, + "objective/rlhf_reward": -0.9138383604586124, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.992006778717041, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.50390625, + "step": 1454, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9974803924560547 + }, + { + "episode": 34944, + "epoch": 0.06978945728631172, + "loss/policy_avg": -0.014148377813398838, + "lr": 2.581575920245399e-06, + "objective/entropy": 80.82212829589844, + "objective/kl": 2.480158805847168, + "objective/non_score_reward": -0.1240079402923584, + "objective/rlhf_reward": 1.1487416264108028, + "objective/scores": 0.31546487678572877, + "policy/approxkl_avg": 1.9756698608398438, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.47265625, + "step": 1455, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 2.001702308654785 + }, + { + "episode": 34968, + "epoch": 0.0698373896058765, + "loss/policy_avg": 0.12020416557788849, + "lr": 2.5812883435582823e-06, + "objective/entropy": 129.82339477539062, + "objective/kl": 7.687321662902832, + "objective/non_score_reward": -0.3843660354614258, + "objective/rlhf_reward": -0.30619631707668316, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 6.732280731201172, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.54296875, + "step": 1456, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9967379570007324 + }, + { + "episode": 34992, + "epoch": 0.06988532192544128, + "loss/policy_avg": -0.012752020731568336, + "lr": 2.5810007668711657e-06, + "objective/entropy": 90.40740966796875, + "objective/kl": 5.238534450531006, + "objective/non_score_reward": -0.26192671060562134, + "objective/rlhf_reward": -1.571560189127922, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.1571903228759766, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3671875, + "step": 1457, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019726753234863 + }, + { + "episode": 35016, + "epoch": 0.06993325424500606, + "loss/policy_avg": -0.02433757111430168, + "lr": 2.580713190184049e-06, + "objective/entropy": 92.50370788574219, + "objective/kl": 4.905426025390625, + "objective/non_score_reward": -0.24527132511138916, + "objective/rlhf_reward": -1.471627851948142, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.8658087253570557, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4091796875, + "step": 1458, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012452602386475 + }, + { + "episode": 35040, + "epoch": 0.06998118656457082, + "loss/policy_avg": 0.3119747042655945, + "lr": 2.5804256134969326e-06, + "objective/entropy": 93.28794860839844, + "objective/kl": 4.584203720092773, + "objective/non_score_reward": -0.22921019792556763, + "objective/rlhf_reward": -1.3752612248063087, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.3055853843688965, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4189453125, + "step": 1459, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9992003440856934 + }, + { + "episode": 35064, + "epoch": 0.0700291188841356, + "loss/policy_avg": 0.0667242780327797, + "lr": 2.580138036809816e-06, + "objective/entropy": 73.03019714355469, + "objective/kl": 5.641839981079102, + "objective/non_score_reward": -0.28209197521209717, + "objective/rlhf_reward": -1.6925519555807114, + "objective/scores": 0.0, + "policy/approxkl_avg": 3.758484363555908, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.357421875, + "step": 1460, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998046875 + }, + { + "episode": 35088, + "epoch": 0.07007705120370038, + "loss/policy_avg": 0.08546097576618195, + "lr": 2.5798504601226994e-06, + "objective/entropy": 93.6557846069336, + "objective/kl": 7.460359573364258, + "objective/non_score_reward": -0.3730179965496063, + "objective/rlhf_reward": 0.7618920356035233, + "objective/scores": 0.5, + "policy/approxkl_avg": 3.767063856124878, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4052734375, + "step": 1461, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985496997833252 + }, + { + "episode": 35112, + "epoch": 0.07012498352326516, + "loss/policy_avg": -0.0004957183264195919, + "lr": 2.579562883435583e-06, + "objective/entropy": 87.648681640625, + "objective/kl": 3.942709445953369, + "objective/non_score_reward": -0.19713549315929413, + "objective/rlhf_reward": 1.817187175154686, + "objective/scores": 0.5, + "policy/approxkl_avg": 1.091950535774231, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3876953125, + "step": 1462, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0035452842712402 + }, + { + "episode": 35136, + "epoch": 0.07017291584282992, + "loss/policy_avg": 0.08719050884246826, + "lr": 2.5792753067484663e-06, + "objective/entropy": 90.3799057006836, + "objective/kl": 5.846271514892578, + "objective/non_score_reward": -0.2923136055469513, + "objective/rlhf_reward": 4.246118456125259, + "objective/scores": 1.0, + "policy/approxkl_avg": 2.5793933868408203, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.421875, + "step": 1463, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9985158443450928 + }, + { + "episode": 35160, + "epoch": 0.0702208481623947, + "loss/policy_avg": 0.018922708928585052, + "lr": 2.5789877300613497e-06, + "objective/entropy": 112.2987289428711, + "objective/kl": 4.803832054138184, + "objective/non_score_reward": -0.24019160866737366, + "objective/rlhf_reward": 0.3650303890348706, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.4719666242599487, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.53515625, + "step": 1464, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0012106895446777 + }, + { + "episode": 35184, + "epoch": 0.07026878048195948, + "loss/policy_avg": 0.07035300880670547, + "lr": 2.578700153374233e-06, + "objective/entropy": 77.64151000976562, + "objective/kl": 7.483972549438477, + "objective/non_score_reward": -0.37419864535331726, + "objective/rlhf_reward": 0.33886750612277716, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 2.3458797931671143, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.361328125, + "step": 1465, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9995365142822266 + }, + { + "episode": 35208, + "epoch": 0.07031671280152425, + "loss/policy_avg": 0.04355402663350105, + "lr": 2.5784125766871166e-06, + "objective/entropy": 92.38238525390625, + "objective/kl": 3.5206542015075684, + "objective/non_score_reward": -0.17603272199630737, + "objective/rlhf_reward": 1.0810467981208696, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 3.1842398643493652, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.4384765625, + "step": 1466, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997302293777466 + }, + { + "episode": 35232, + "epoch": 0.07036464512108902, + "loss/policy_avg": 0.0052341013215482235, + "lr": 2.578125e-06, + "objective/entropy": 81.72137451171875, + "objective/kl": 4.4113335609436035, + "objective/non_score_reward": -0.22056666016578674, + "objective/rlhf_reward": -1.3233999386429787, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.7340047359466553, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4072265625, + "step": 1467, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0024006366729736 + }, + { + "episode": 35256, + "epoch": 0.0704125774406538, + "loss/policy_avg": 0.04329672083258629, + "lr": 2.5778374233128834e-06, + "objective/entropy": 72.3553695678711, + "objective/kl": 4.986154556274414, + "objective/non_score_reward": -0.24930773675441742, + "objective/rlhf_reward": -1.4958464056253433, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.941498756408691, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.4365234375, + "step": 1468, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9984759092330933 + }, + { + "episode": 35280, + "epoch": 0.07046050976021857, + "loss/policy_avg": 0.0818299949169159, + "lr": 2.577549846625767e-06, + "objective/entropy": 81.2385482788086, + "objective/kl": 5.54136848449707, + "objective/non_score_reward": -0.27706843614578247, + "objective/rlhf_reward": -1.6624105423688889, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.3334729671478271, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3740234375, + "step": 1469, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0025243759155273 + }, + { + "episode": 35304, + "epoch": 0.07050844207978335, + "loss/policy_avg": 0.035745032131671906, + "lr": 2.5772622699386503e-06, + "objective/entropy": 112.57798767089844, + "objective/kl": 6.1058878898620605, + "objective/non_score_reward": -0.3052944242954254, + "objective/rlhf_reward": -1.831766426563263, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.235240459442139, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4697265625, + "step": 1470, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997791051864624 + }, + { + "episode": 35328, + "epoch": 0.07055637439934812, + "loss/policy_avg": 0.031935155391693115, + "lr": 2.5769746932515337e-06, + "objective/entropy": 88.1206283569336, + "objective/kl": 7.830800533294678, + "objective/non_score_reward": -0.3915400207042694, + "objective/rlhf_reward": 0.23481929499525755, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 4.310527801513672, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3818359375, + "step": 1471, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.997739553451538 + }, + { + "episode": 35352, + "epoch": 0.0706043067189129, + "loss/policy_avg": 0.05912924185395241, + "lr": 2.5766871165644175e-06, + "objective/entropy": 99.36148071289062, + "objective/kl": 6.190970420837402, + "objective/non_score_reward": -0.3095484972000122, + "objective/rlhf_reward": 1.9282874786240272, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 2.7309203147888184, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.396484375, + "step": 1472, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0007736682891846 + }, + { + "episode": 35376, + "epoch": 0.07065223903847767, + "loss/policy_avg": -0.026025230064988136, + "lr": 2.576399539877301e-06, + "objective/entropy": 76.73826599121094, + "objective/kl": 6.497640132904053, + "objective/non_score_reward": -0.32488200068473816, + "objective/rlhf_reward": -1.9492920711636543, + "objective/scores": 0.0, + "policy/approxkl_avg": 5.296842098236084, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.345703125, + "step": 1473, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9983714818954468 + }, + { + "episode": 35400, + "epoch": 0.07070017135804245, + "loss/policy_avg": 0.08867330849170685, + "lr": 2.5761119631901844e-06, + "objective/entropy": 98.23155212402344, + "objective/kl": 6.0526533126831055, + "objective/non_score_reward": -0.3026326596736908, + "objective/rlhf_reward": 4.184204146265984, + "objective/scores": 1.0, + "policy/approxkl_avg": 5.690794944763184, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.435546875, + "step": 1474, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963651895523071 + }, + { + "episode": 35424, + "epoch": 0.07074810367760721, + "loss/policy_avg": 0.22029921412467957, + "lr": 2.5758243865030674e-06, + "objective/entropy": 83.31275939941406, + "objective/kl": 5.558103084564209, + "objective/non_score_reward": -0.2779051661491394, + "objective/rlhf_reward": 0.6536859657217029, + "objective/scores": 0.38685280723454163, + "policy/approxkl_avg": 3.4956789016723633, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3701171875, + "step": 1475, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0058860778808594 + }, + { + "episode": 35448, + "epoch": 0.07079603599717199, + "loss/policy_avg": -0.012867603451013565, + "lr": 2.575536809815951e-06, + "objective/entropy": 89.74413299560547, + "objective/kl": 7.228492736816406, + "objective/non_score_reward": -0.36142462491989136, + "objective/rlhf_reward": -2.168547570705414, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.151303291320801, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.421875, + "step": 1476, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9963982105255127 + }, + { + "episode": 35472, + "epoch": 0.07084396831673677, + "loss/policy_avg": -0.004630501382052898, + "lr": 2.5752492331288342e-06, + "objective/entropy": 134.28500366210938, + "objective/kl": 3.264935255050659, + "objective/non_score_reward": -0.163246750831604, + "objective/rlhf_reward": 0.8266994317413602, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 3.162837505340576, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.59375, + "step": 1477, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0000972747802734 + }, + { + "episode": 35496, + "epoch": 0.07089190063630155, + "loss/policy_avg": -0.049938589334487915, + "lr": 2.5749616564417177e-06, + "objective/entropy": 136.537353515625, + "objective/kl": 4.043224334716797, + "objective/non_score_reward": -0.20216122269630432, + "objective/rlhf_reward": -1.21296726167202, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.9343724250793457, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.607421875, + "step": 1478, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0019335746765137 + }, + { + "episode": 35520, + "epoch": 0.07093983295586631, + "loss/policy_avg": 0.008371952921152115, + "lr": 2.574674079754601e-06, + "objective/entropy": 96.08819580078125, + "objective/kl": 4.113903045654297, + "objective/non_score_reward": -0.20569518208503723, + "objective/rlhf_reward": 0.7658290117979049, + "objective/scores": 0.3333333333333333, + "policy/approxkl_avg": 2.1805949211120605, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4443359375, + "step": 1479, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002115249633789 + }, + { + "episode": 35544, + "epoch": 0.07098776527543109, + "loss/policy_avg": 0.05770412087440491, + "lr": 2.5743865030674845e-06, + "objective/entropy": 98.0128402709961, + "objective/kl": 4.582643032073975, + "objective/non_score_reward": -0.22913216054439545, + "objective/rlhf_reward": 0.4313870628715787, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 4.317579746246338, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.41796875, + "step": 1480, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970561265945435 + }, + { + "episode": 35568, + "epoch": 0.07103569759499587, + "loss/policy_avg": 0.20269447565078735, + "lr": 2.574098926380368e-06, + "objective/entropy": 65.17657470703125, + "objective/kl": 4.57020378112793, + "objective/non_score_reward": -0.22851018607616425, + "objective/rlhf_reward": 0.7661820061911476, + "objective/scores": 0.3562071871080222, + "policy/approxkl_avg": 2.2412402629852295, + "policy/clipfrac_avg": 0.3333333432674408, + "policy/entropy_avg": 0.30078125, + "step": 1481, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9991968870162964 + }, + { + "episode": 35592, + "epoch": 0.07108362991456064, + "loss/policy_avg": 0.05820987746119499, + "lr": 2.573811349693252e-06, + "objective/entropy": 80.15453338623047, + "objective/kl": 4.228695869445801, + "objective/non_score_reward": -0.21143481135368347, + "objective/rlhf_reward": 1.3154505809010955, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 3.056039571762085, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.3447265625, + "step": 1482, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9977290630340576 + }, + { + "episode": 35616, + "epoch": 0.07113156223412541, + "loss/policy_avg": 0.19329991936683655, + "lr": 2.5735237730061352e-06, + "objective/entropy": 73.03053283691406, + "objective/kl": 7.226901531219482, + "objective/non_score_reward": -0.3613450527191162, + "objective/rlhf_reward": -2.1680703572928905, + "objective/scores": 0.0, + "policy/approxkl_avg": 7.2663750648498535, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.3759765625, + "step": 1483, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.9964094161987305 + }, + { + "episode": 35640, + "epoch": 0.07117949455369019, + "loss/policy_avg": -0.02825089357793331, + "lr": 2.5732361963190187e-06, + "objective/entropy": 69.99711608886719, + "objective/kl": 7.081105709075928, + "objective/non_score_reward": -0.3540552854537964, + "objective/rlhf_reward": -2.124331682920456, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.8583786487579346, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.3779296875, + "step": 1484, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0011887550354004 + }, + { + "episode": 35664, + "epoch": 0.07122742687325496, + "loss/policy_avg": -4.710402572527528e-05, + "lr": 2.572948619631902e-06, + "objective/entropy": 106.95525360107422, + "objective/kl": 6.134894847869873, + "objective/non_score_reward": -0.3067447543144226, + "objective/rlhf_reward": -1.840468406677246, + "objective/scores": 0.0, + "policy/approxkl_avg": 6.429330348968506, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4580078125, + "step": 1485, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9975485801696777 + }, + { + "episode": 35688, + "epoch": 0.07127535919281974, + "loss/policy_avg": -0.040668293833732605, + "lr": 2.5726610429447855e-06, + "objective/entropy": 100.8899154663086, + "objective/kl": 3.2353568077087402, + "objective/non_score_reward": -0.16176782548427582, + "objective/rlhf_reward": 0.7637819826499306, + "objective/scores": 0.2890648263178879, + "policy/approxkl_avg": 2.7025251388549805, + "policy/clipfrac_avg": 1.6666667461395264, + "policy/entropy_avg": 0.4609375, + "step": 1486, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.000654458999634 + }, + { + "episode": 35712, + "epoch": 0.0713232915123845, + "loss/policy_avg": -0.0408848412334919, + "lr": 2.572373466257669e-06, + "objective/entropy": 106.33609008789062, + "objective/kl": 6.500934600830078, + "objective/non_score_reward": -0.32504674792289734, + "objective/rlhf_reward": -1.950280375778675, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.846691608428955, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.4765625, + "step": 1487, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001965045928955 + }, + { + "episode": 35736, + "epoch": 0.07137122383194928, + "loss/policy_avg": 0.04746153578162193, + "lr": 2.5720858895705524e-06, + "objective/entropy": 78.06982421875, + "objective/kl": 5.640975475311279, + "objective/non_score_reward": -0.2820487916469574, + "objective/rlhf_reward": -1.6922926008701324, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.380772590637207, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3759765625, + "step": 1488, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9979841709136963 + }, + { + "episode": 35760, + "epoch": 0.07141915615151406, + "loss/policy_avg": 0.0943746566772461, + "lr": 2.5717983128834358e-06, + "objective/entropy": 111.97176361083984, + "objective/kl": 3.172848701477051, + "objective/non_score_reward": -0.1586424559354782, + "objective/rlhf_reward": -0.9518546797335148, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.092461585998535, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.517578125, + "step": 1489, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9997506141662598 + }, + { + "episode": 35784, + "epoch": 0.07146708847107884, + "loss/policy_avg": 0.0012816772796213627, + "lr": 2.5715107361963188e-06, + "objective/entropy": 76.99417114257812, + "objective/kl": 5.074615001678467, + "objective/non_score_reward": -0.25373074412345886, + "objective/rlhf_reward": -1.5223845094442368, + "objective/scores": 0.0, + "policy/approxkl_avg": 4.472695350646973, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3642578125, + "step": 1490, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9970474243164062 + }, + { + "episode": 35808, + "epoch": 0.0715150207906436, + "loss/policy_avg": 0.055141255259513855, + "lr": 2.571223159509202e-06, + "objective/entropy": 103.7957763671875, + "objective/kl": 4.886859893798828, + "objective/non_score_reward": -0.24434298276901245, + "objective/rlhf_reward": -1.4660578817129135, + "objective/scores": 0.0, + "policy/approxkl_avg": 1.5025055408477783, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.4658203125, + "step": 1491, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0008034706115723 + }, + { + "episode": 35832, + "epoch": 0.07156295311020838, + "loss/policy_avg": 0.1610299050807953, + "lr": 2.570935582822086e-06, + "objective/entropy": 99.49496459960938, + "objective/kl": 6.07961368560791, + "objective/non_score_reward": -0.30398067831993103, + "objective/rlhf_reward": 1.9616944664103202, + "objective/scores": 0.6309297535714575, + "policy/approxkl_avg": 7.436861038208008, + "policy/clipfrac_avg": 1.1666667461395264, + "policy/entropy_avg": 0.4560546875, + "step": 1492, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.9964115619659424 + }, + { + "episode": 35856, + "epoch": 0.07161088542977316, + "loss/policy_avg": -0.005624236539006233, + "lr": 2.5706480061349695e-06, + "objective/entropy": 81.7576904296875, + "objective/kl": 3.20346999168396, + "objective/non_score_reward": -0.16017350554466248, + "objective/rlhf_reward": 0.8451389816941056, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 2.4733023643493652, + "policy/clipfrac_avg": 1.5, + "policy/entropy_avg": 0.3427734375, + "step": 1493, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002610445022583 + }, + { + "episode": 35880, + "epoch": 0.07165881774933794, + "loss/policy_avg": 0.09190116822719574, + "lr": 2.570360429447853e-06, + "objective/entropy": 69.15216064453125, + "objective/kl": 6.456817626953125, + "objective/non_score_reward": -0.32284092903137207, + "objective/rlhf_reward": 0.647013938164899, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 5.550074577331543, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.3798828125, + "step": 1494, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 1, + "val/ratio": 1.997521996498108 + }, + { + "episode": 35904, + "epoch": 0.0717067500689027, + "loss/policy_avg": 0.01016529742628336, + "lr": 2.5700728527607363e-06, + "objective/entropy": 84.81048583984375, + "objective/kl": 4.4437079429626465, + "objective/non_score_reward": -0.2221854031085968, + "objective/rlhf_reward": 0.47306762238753175, + "objective/scores": 0.3010299956639812, + "policy/approxkl_avg": 1.7537269592285156, + "policy/clipfrac_avg": 1.0, + "policy/entropy_avg": 0.388671875, + "step": 1495, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.001992702484131 + }, + { + "episode": 35928, + "epoch": 0.07175468238846748, + "loss/policy_avg": 0.16143585741519928, + "lr": 2.5697852760736198e-06, + "objective/entropy": 90.2025146484375, + "objective/kl": 6.482316017150879, + "objective/non_score_reward": -0.3241158127784729, + "objective/rlhf_reward": -1.9446948617696762, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.3309388160705566, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.41796875, + "step": 1496, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.002239227294922 + }, + { + "episode": 35952, + "epoch": 0.07180261470803226, + "loss/policy_avg": 0.062151260673999786, + "lr": 2.569497699386503e-06, + "objective/entropy": 73.18388366699219, + "objective/kl": 5.0991010665893555, + "objective/non_score_reward": -0.2549550533294678, + "objective/rlhf_reward": -1.5297303795814514, + "objective/scores": 0.0, + "policy/approxkl_avg": 2.893345832824707, + "policy/clipfrac_avg": 0.6666666865348816, + "policy/entropy_avg": 0.3466796875, + "step": 1497, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.998718023300171 + }, + { + "episode": 35976, + "epoch": 0.07185054702759704, + "loss/policy_avg": 0.08840575814247131, + "lr": 2.5692101226993866e-06, + "objective/entropy": 103.36686706542969, + "objective/kl": 5.20394229888916, + "objective/non_score_reward": -0.26019713282585144, + "objective/rlhf_reward": 1.022876618540475, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 8.441944122314453, + "policy/clipfrac_avg": 0.8333333730697632, + "policy/entropy_avg": 0.427734375, + "step": 1498, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 1.994280457496643 + }, + { + "episode": 36000, + "epoch": 0.07189847934716181, + "loss/policy_avg": 0.041327521204948425, + "lr": 2.56892254601227e-06, + "objective/entropy": 101.67473602294922, + "objective/kl": 4.087104797363281, + "objective/non_score_reward": -0.20435523986816406, + "objective/rlhf_reward": 1.3579278347255679, + "objective/scores": 0.43067655807339306, + "policy/approxkl_avg": 1.0482717752456665, + "policy/clipfrac_avg": 1.3333333730697632, + "policy/entropy_avg": 0.4267578125, + "step": 1499, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 0, + "val/ratio": 2.0030810832977295 + } + ], + "logging_steps": 500, + "max_steps": 5216, + "num_input_tokens_seen": 0, + "num_train_epochs": 3.0, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": true, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}