diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1000/trainer_state.json" +++ /dev/null @@ -1,18034 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "episode": 16000, - "epoch": 0.28759391738864726, - "eval_steps": 500, - "global_step": 1000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "episode": 16, - "epoch": 0.00028759391738864725, - "loss/policy_avg": 0.04147649183869362, - "lr": 1e-05, - "objective/entropy": 119.65733337402344, - "objective/kl": 15.623376846313477, - "objective/non_score_reward": -1.5623377561569214, - "objective/rlhf_reward": -3.849351099133491, - "objective/scores": 0.6, - "policy/approxkl_avg": 473.7090759277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7531497478485107, - "step": 0, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990334510803223 - }, - { - "episode": 32, - "epoch": 0.0005751878347772945, - "loss/policy_avg": 0.09634321182966232, - "lr": 9.999360940695298e-06, - "objective/entropy": -24.297130584716797, - "objective/kl": 11.720248222351074, - "objective/non_score_reward": -1.1720247268676758, - "objective/rlhf_reward": -3.2880991645157334, - "objective/scores": 0.35, - "policy/approxkl_avg": 233.3876953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6364185214042664, - "step": 1, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988316297531128 - }, - { - "episode": 48, - "epoch": 0.0008627817521659417, - "loss/policy_avg": 0.5879926681518555, - "lr": 9.998721881390595e-06, - "objective/entropy": -123.47531127929688, - "objective/kl": 7.935818672180176, - "objective/non_score_reward": -0.7935818433761597, - "objective/rlhf_reward": -0.25060838157055054, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 142.57273864746094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6163707971572876, - "step": 2, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999882459640503 - }, - { - "episode": 64, - "epoch": 0.001150375669554589, - "loss/policy_avg": 0.380592405796051, - "lr": 9.99808282208589e-06, - "objective/entropy": -117.48745727539062, - "objective/kl": 10.153940200805664, - "objective/non_score_reward": -1.0153939723968506, - "objective/rlhf_reward": -2.682973676411015, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 190.00497436523438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5329767465591431, - "step": 3, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998424053192139 - }, - { - "episode": 80, - "epoch": 0.001437969586943236, - "loss/policy_avg": 0.14582836627960205, - "lr": 9.997443762781187e-06, - "objective/entropy": -217.63848876953125, - "objective/kl": 10.502876281738281, - "objective/non_score_reward": -1.0502876043319702, - "objective/rlhf_reward": -2.777318381418554, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 221.2613067626953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6339143514633179, - "step": 4, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9996079206466675 - }, - { - "episode": 96, - "epoch": 0.0017255635043318834, - "loss/policy_avg": 0.12740007042884827, - "lr": 9.996804703476484e-06, - "objective/entropy": 398.1901550292969, - "objective/kl": 14.20137882232666, - "objective/non_score_reward": -1.420137882232666, - "objective/rlhf_reward": -3.28055148422718, - "objective/scores": 0.6, - "policy/approxkl_avg": 349.208740234375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 1.005652904510498, - "step": 5, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985274076461792 - }, - { - "episode": 112, - "epoch": 0.0020131574217205307, - "loss/policy_avg": 0.1509546935558319, - "lr": 9.99616564417178e-06, - "objective/entropy": -124.58861541748047, - "objective/kl": 8.397514343261719, - "objective/non_score_reward": -0.8397514224052429, - "objective/rlhf_reward": -1.9590056151151658, - "objective/scores": 0.35, - "policy/approxkl_avg": 87.99980163574219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7696092128753662, - "step": 6, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961457252502441 - }, - { - "episode": 128, - "epoch": 0.002300751339109178, - "loss/policy_avg": 0.07236729562282562, - "lr": 9.995526584867077e-06, - "objective/entropy": -62.749176025390625, - "objective/kl": 10.19581413269043, - "objective/non_score_reward": -1.0195814371109009, - "objective/rlhf_reward": -2.3449923555056253, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 151.23446655273438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7507286071777344, - "step": 7, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975416660308838 - }, - { - "episode": 144, - "epoch": 0.002588345256497825, - "loss/policy_avg": 0.1384029984474182, - "lr": 9.994887525562374e-06, - "objective/entropy": -143.49945068359375, - "objective/kl": 12.088400840759277, - "objective/non_score_reward": -1.2088401317596436, - "objective/rlhf_reward": -3.435360452532768, - "objective/scores": 0.35, - "policy/approxkl_avg": 150.72146606445312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7477531433105469, - "step": 8, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000879764556885 - }, - { - "episode": 160, - "epoch": 0.002875939173886472, - "loss/policy_avg": -0.009389623999595642, - "lr": 9.99424846625767e-06, - "objective/entropy": -8.538755416870117, - "objective/kl": 4.930829048156738, - "objective/non_score_reward": -0.4930829405784607, - "objective/rlhf_reward": -0.6306961386496122, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 21.575889587402344, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4435485005378723, - "step": 9, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00315523147583 - }, - { - "episode": 176, - "epoch": 0.0031635330912751195, - "loss/policy_avg": 0.09865772724151611, - "lr": 9.993609406952966e-06, - "objective/entropy": -17.656417846679688, - "objective/kl": 7.901223659515381, - "objective/non_score_reward": -0.790122389793396, - "objective/rlhf_reward": -1.213078211026128, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 61.98566436767578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7360714673995972, - "step": 10, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977571964263916 - }, - { - "episode": 192, - "epoch": 0.0034511270086637668, - "loss/policy_avg": -0.005021991208195686, - "lr": 9.992970347648263e-06, - "objective/entropy": -36.69260787963867, - "objective/kl": 10.859649658203125, - "objective/non_score_reward": -1.0859650373458862, - "objective/rlhf_reward": -1.9438601382076737, - "objective/scores": 0.6, - "policy/approxkl_avg": 145.91165161132812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4879041314125061, - "step": 11, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994498491287231 - }, - { - "episode": 208, - "epoch": 0.003738720926052414, - "loss/policy_avg": 0.35356682538986206, - "lr": 9.992331288343558e-06, - "objective/entropy": -69.72517395019531, - "objective/kl": 10.624967575073242, - "objective/non_score_reward": -1.0624967813491821, - "objective/rlhf_reward": -2.6937278797298223, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 142.52261352539062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6256821751594543, - "step": 12, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9967740774154663 - }, - { - "episode": 224, - "epoch": 0.004026314843441061, - "loss/policy_avg": 0.24467170238494873, - "lr": 9.991692229038855e-06, - "objective/entropy": -115.99034881591797, - "objective/kl": 11.337324142456055, - "objective/non_score_reward": -1.1337324380874634, - "objective/rlhf_reward": -2.8730703942185505, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 95.43186950683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5122163891792297, - "step": 13, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000202178955078 - }, - { - "episode": 240, - "epoch": 0.004313908760829708, - "loss/policy_avg": 0.36638143658638, - "lr": 9.991053169734152e-06, - "objective/entropy": 90.19092559814453, - "objective/kl": 8.482120513916016, - "objective/non_score_reward": -0.8482120633125305, - "objective/rlhf_reward": -1.5680194973674526, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 103.84627532958984, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45586448907852173, - "step": 14, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0012996196746826 - }, - { - "episode": 256, - "epoch": 0.004601502678218356, - "loss/policy_avg": 0.3564397394657135, - "lr": 9.990414110429449e-06, - "objective/entropy": 62.88275146484375, - "objective/kl": 8.093853950500488, - "objective/non_score_reward": -0.8093854188919067, - "objective/rlhf_reward": -1.7565889535502193, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 129.63275146484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5616360902786255, - "step": 15, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000332832336426 - }, - { - "episode": 272, - "epoch": 0.004889096595607003, - "loss/policy_avg": 0.731740415096283, - "lr": 9.989775051124744e-06, - "objective/entropy": 175.25027465820312, - "objective/kl": 13.653030395507812, - "objective/non_score_reward": -1.3653030395507812, - "objective/rlhf_reward": -4.037380088766185, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 197.69329833984375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6656568050384521, - "step": 16, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004138946533203 - }, - { - "episode": 288, - "epoch": 0.00517669051299565, - "loss/policy_avg": 0.0038209843914955854, - "lr": 9.989135991820041e-06, - "objective/entropy": 166.37741088867188, - "objective/kl": 11.93104362487793, - "objective/non_score_reward": -1.1931045055389404, - "objective/rlhf_reward": -3.3724178135395046, - "objective/scores": 0.35, - "policy/approxkl_avg": 123.38684844970703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4346945881843567, - "step": 17, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002516269683838 - }, - { - "episode": 304, - "epoch": 0.0054642844303842975, - "loss/policy_avg": 0.5328235626220703, - "lr": 9.988496932515338e-06, - "objective/entropy": -59.579795837402344, - "objective/kl": 14.574970245361328, - "objective/non_score_reward": -1.457497000694275, - "objective/rlhf_reward": -4.273728727307871, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 107.66255187988281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6419472098350525, - "step": 18, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002565383911133 - }, - { - "episode": 320, - "epoch": 0.005751878347772944, - "loss/policy_avg": 0.1068505123257637, - "lr": 9.987857873210635e-06, - "objective/entropy": 25.82529067993164, - "objective/kl": 7.757124900817871, - "objective/non_score_reward": -0.7757124900817871, - "objective/rlhf_reward": -1.702849841117859, - "objective/scores": 0.35, - "policy/approxkl_avg": 35.83104705810547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.37679600715637207, - "step": 19, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0017640590667725 - }, - { - "episode": 336, - "epoch": 0.006039472265161592, - "loss/policy_avg": 0.9153174757957458, - "lr": 9.987218813905932e-06, - "objective/entropy": 123.23423767089844, - "objective/kl": 15.62867546081543, - "objective/non_score_reward": -1.5628674030303955, - "objective/rlhf_reward": -4.770517232830882, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 175.58567810058594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6862951517105103, - "step": 20, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9951945543289185 - }, - { - "episode": 352, - "epoch": 0.006327066182550239, - "loss/policy_avg": 0.13535380363464355, - "lr": 9.986579754601228e-06, - "objective/entropy": 106.94303894042969, - "objective/kl": 14.264102935791016, - "objective/non_score_reward": -1.42641019821167, - "objective/rlhf_reward": -2.781921450735304, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 203.86151123046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4334957003593445, - "step": 21, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984745979309082 - }, - { - "episode": 368, - "epoch": 0.006614660099938887, - "loss/policy_avg": 0.08913514018058777, - "lr": 9.985940695296524e-06, - "objective/entropy": 86.8988037109375, - "objective/kl": 14.969903945922852, - "objective/non_score_reward": -1.4969902038574219, - "objective/rlhf_reward": -4.564128805597392, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 204.34201049804688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6533622741699219, - "step": 22, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993085861206055 - }, - { - "episode": 384, - "epoch": 0.0069022540173275335, - "loss/policy_avg": 0.4681934416294098, - "lr": 9.98530163599182e-06, - "objective/entropy": -86.89934539794922, - "objective/kl": 17.868688583374023, - "objective/non_score_reward": -1.7868685722351074, - "objective/rlhf_reward": -5.821961793929262, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 180.03530883789062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6652738451957703, - "step": 23, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9969769716262817 - }, - { - "episode": 400, - "epoch": 0.00718984793471618, - "loss/policy_avg": 0.05787897855043411, - "lr": 9.984662576687117e-06, - "objective/entropy": 217.01751708984375, - "objective/kl": 7.942338466644287, - "objective/non_score_reward": -0.7942339181900024, - "objective/rlhf_reward": -1.7769354641437531, - "objective/scores": 0.35, - "policy/approxkl_avg": 14.617660522460938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7284016609191895, - "step": 24, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0007760524749756 - }, - { - "episode": 416, - "epoch": 0.007477441852104828, - "loss/policy_avg": 0.17751406133174896, - "lr": 9.984023517382414e-06, - "objective/entropy": 79.38223266601562, - "objective/kl": 13.876078605651855, - "objective/non_score_reward": -1.3876079320907593, - "objective/rlhf_reward": -4.191181921695156, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 129.87246704101562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9167462587356567, - "step": 25, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985355138778687 - }, - { - "episode": 432, - "epoch": 0.007765035769493475, - "loss/policy_avg": 0.529009222984314, - "lr": 9.983384458077711e-06, - "objective/entropy": -31.18558120727539, - "objective/kl": 14.786969184875488, - "objective/non_score_reward": -1.4786969423294067, - "objective/rlhf_reward": -4.5361854816354334, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 125.92539978027344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.43768489360809326, - "step": 26, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984157085418701 - }, - { - "episode": 448, - "epoch": 0.008052629686882123, - "loss/policy_avg": 0.3665599822998047, - "lr": 9.982745398773006e-06, - "objective/entropy": 23.827144622802734, - "objective/kl": 13.60982894897461, - "objective/non_score_reward": -1.360982894897461, - "objective/rlhf_reward": -5.443931698799133, - "objective/scores": 0.0, - "policy/approxkl_avg": 127.14844512939453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5104779601097107, - "step": 27, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987092018127441 - }, - { - "episode": 464, - "epoch": 0.00834022360427077, - "loss/policy_avg": 0.32786238193511963, - "lr": 9.982106339468303e-06, - "objective/entropy": 128.9566650390625, - "objective/kl": 11.556554794311523, - "objective/non_score_reward": -1.1556555032730103, - "objective/rlhf_reward": -3.297109279662294, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 59.29738998413086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4545682668685913, - "step": 28, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0006556510925293 - }, - { - "episode": 480, - "epoch": 0.008627817521659416, - "loss/policy_avg": 0.2694750428199768, - "lr": 9.9814672801636e-06, - "objective/entropy": 78.4908447265625, - "objective/kl": 9.683059692382812, - "objective/non_score_reward": -0.9683058857917786, - "objective/rlhf_reward": -2.5139735278829765, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 102.18389892578125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.659131646156311, - "step": 29, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980244636535645 - }, - { - "episode": 496, - "epoch": 0.008915411439048063, - "loss/policy_avg": -0.2861338257789612, - "lr": 9.980828220858897e-06, - "objective/entropy": -90.27975463867188, - "objective/kl": 7.361126899719238, - "objective/non_score_reward": -0.7361127138137817, - "objective/rlhf_reward": -1.1196221962300053, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 76.95925903320312, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5377018451690674, - "step": 30, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.006430149078369 - }, - { - "episode": 512, - "epoch": 0.009203005356436712, - "loss/policy_avg": 0.1336214542388916, - "lr": 9.980189161554194e-06, - "objective/entropy": 153.1845703125, - "objective/kl": 12.326415061950684, - "objective/non_score_reward": -1.232641577720642, - "objective/rlhf_reward": -3.5713163701042365, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 140.37075805664062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7196662425994873, - "step": 31, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995383024215698 - }, - { - "episode": 528, - "epoch": 0.009490599273825359, - "loss/policy_avg": -0.03590531647205353, - "lr": 9.97955010224949e-06, - "objective/entropy": -60.39399719238281, - "objective/kl": 7.551569938659668, - "objective/non_score_reward": -0.7551569938659668, - "objective/rlhf_reward": -1.6613781240925025, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 36.98230743408203, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.617447018623352, - "step": 32, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0055394172668457 - }, - { - "episode": 544, - "epoch": 0.009778193191214006, - "loss/policy_avg": 0.23507678508758545, - "lr": 9.978911042944786e-06, - "objective/entropy": -62.405269622802734, - "objective/kl": 12.254663467407227, - "objective/non_score_reward": -1.2254663705825806, - "objective/rlhf_reward": -3.386093669923481, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 28.730735778808594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5197240114212036, - "step": 33, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973214864730835 - }, - { - "episode": 560, - "epoch": 0.010065787108602653, - "loss/policy_avg": 0.16142824292182922, - "lr": 9.978271983640083e-06, - "objective/entropy": 63.909202575683594, - "objective/kl": 11.40770149230957, - "objective/non_score_reward": -1.1407701969146729, - "objective/rlhf_reward": -1.6393620713960855, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 103.13188171386719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6455787420272827, - "step": 34, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994690418243408 - }, - { - "episode": 576, - "epoch": 0.0103533810259913, - "loss/policy_avg": 0.10174459218978882, - "lr": 9.977632924335378e-06, - "objective/entropy": -30.112831115722656, - "objective/kl": 17.954376220703125, - "objective/non_score_reward": -1.7954376935958862, - "objective/rlhf_reward": -5.059044542089973, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 330.9220886230469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.519372820854187, - "step": 35, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978516101837158 - }, - { - "episode": 592, - "epoch": 0.010640974943379948, - "loss/policy_avg": 0.47705915570259094, - "lr": 9.976993865030675e-06, - "objective/entropy": 302.72314453125, - "objective/kl": 19.512754440307617, - "objective/non_score_reward": -1.9512755870819092, - "objective/rlhf_reward": -3.4051021099090573, - "objective/scores": 1.1, - "policy/approxkl_avg": 104.70938110351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8252858519554138, - "step": 36, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983307123184204 - }, - { - "episode": 608, - "epoch": 0.010928568860768595, - "loss/policy_avg": 0.3472205400466919, - "lr": 9.976354805725972e-06, - "objective/entropy": -59.4378662109375, - "objective/kl": 10.388540267944336, - "objective/non_score_reward": -1.0388540029525757, - "objective/rlhf_reward": -2.3305873229828586, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.55358123779297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6380844712257385, - "step": 37, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981659650802612 - }, - { - "episode": 624, - "epoch": 0.011216162778157242, - "loss/policy_avg": 0.44485026597976685, - "lr": 9.975715746421269e-06, - "objective/entropy": 76.74449157714844, - "objective/kl": 10.349222183227539, - "objective/non_score_reward": -1.0349223613739014, - "objective/rlhf_reward": -4.139689266681671, - "objective/scores": 0.0, - "policy/approxkl_avg": 78.09274291992188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6423808336257935, - "step": 38, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9971048831939697 - }, - { - "episode": 640, - "epoch": 0.011503756695545889, - "loss/policy_avg": 0.37319111824035645, - "lr": 9.975076687116566e-06, - "objective/entropy": -67.30467224121094, - "objective/kl": 19.358768463134766, - "objective/non_score_reward": -1.9358769655227661, - "objective/rlhf_reward": -6.227735841067966, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 161.26229858398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6262432336807251, - "step": 39, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987332820892334 - }, - { - "episode": 656, - "epoch": 0.011791350612934537, - "loss/policy_avg": 0.389024943113327, - "lr": 9.97443762781186e-06, - "objective/entropy": 210.994384765625, - "objective/kl": 11.99485969543457, - "objective/non_score_reward": -1.1994858980178833, - "objective/rlhf_reward": -4.797943651676178, - "objective/scores": 0.0, - "policy/approxkl_avg": 79.62628173828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7361236810684204, - "step": 40, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987305402755737 - }, - { - "episode": 672, - "epoch": 0.012078944530323184, - "loss/policy_avg": 0.4818825125694275, - "lr": 9.973798568507158e-06, - "objective/entropy": 280.91552734375, - "objective/kl": 17.216154098510742, - "objective/non_score_reward": -1.7216153144836426, - "objective/rlhf_reward": -5.435863117785797, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 82.88700866699219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7085442543029785, - "step": 41, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000746488571167 - }, - { - "episode": 688, - "epoch": 0.012366538447711831, - "loss/policy_avg": 0.7192404270172119, - "lr": 9.973159509202454e-06, - "objective/entropy": 89.66543579101562, - "objective/kl": 12.255132675170898, - "objective/non_score_reward": -1.2255134582519531, - "objective/rlhf_reward": -0.5020534753799435, - "objective/scores": 1.1, - "policy/approxkl_avg": 79.93511199951172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5533976554870605, - "step": 42, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999737739562988 - }, - { - "episode": 704, - "epoch": 0.012654132365100478, - "loss/policy_avg": 0.0401420071721077, - "lr": 9.972520449897751e-06, - "objective/entropy": 175.6131591796875, - "objective/kl": 12.72716999053955, - "objective/non_score_reward": -1.272716999053955, - "objective/rlhf_reward": -2.167148996831152, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 138.8005828857422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8866395354270935, - "step": 43, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959650039672852 - }, - { - "episode": 720, - "epoch": 0.012941726282489125, - "loss/policy_avg": 0.5428536534309387, - "lr": 9.971881390593048e-06, - "objective/entropy": 122.98509216308594, - "objective/kl": 14.87851619720459, - "objective/non_score_reward": -1.487851619720459, - "objective/rlhf_reward": -4.435634577068027, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 57.47890853881836, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7034124135971069, - "step": 44, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998563528060913 - }, - { - "episode": 736, - "epoch": 0.013229320199877773, - "loss/policy_avg": 1.027585744857788, - "lr": 9.971242331288345e-06, - "objective/entropy": 119.49530792236328, - "objective/kl": 18.71068572998047, - "objective/non_score_reward": -1.8710683584213257, - "objective/rlhf_reward": -5.659444983276437, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 190.9130859375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9129630923271179, - "step": 45, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993374347686768 - }, - { - "episode": 752, - "epoch": 0.01351691411726642, - "loss/policy_avg": -0.013658525422215462, - "lr": 9.97060327198364e-06, - "objective/entropy": 10.491897583007812, - "objective/kl": 13.526758193969727, - "objective/non_score_reward": -1.3526759147644043, - "objective/rlhf_reward": -1.0107036590576168, - "objective/scores": 1.1, - "policy/approxkl_avg": 102.07904815673828, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.482522189617157, - "step": 46, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015859603881836 - }, - { - "episode": 768, - "epoch": 0.013804508034655067, - "loss/policy_avg": 0.18925166130065918, - "lr": 9.969964212678937e-06, - "objective/entropy": -118.86809539794922, - "objective/kl": 10.978793144226074, - "objective/non_score_reward": -1.097879409790039, - "objective/rlhf_reward": -3.0498822390133435, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 104.2835693359375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6507794857025146, - "step": 47, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996565341949463 - }, - { - "episode": 784, - "epoch": 0.014092101952043714, - "loss/policy_avg": 0.5690521597862244, - "lr": 9.969325153374234e-06, - "objective/entropy": -175.16403198242188, - "objective/kl": 19.28797149658203, - "objective/non_score_reward": -1.9287970066070557, - "objective/rlhf_reward": -6.158928765860155, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 288.65631103515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6812887787818909, - "step": 48, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999005794525146 - }, - { - "episode": 800, - "epoch": 0.01437969586943236, - "loss/policy_avg": 0.5041743516921997, - "lr": 9.968686094069531e-06, - "objective/entropy": 318.1710205078125, - "objective/kl": 17.975252151489258, - "objective/non_score_reward": -1.79752516746521, - "objective/rlhf_reward": -5.84846519520822, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 390.5566101074219, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8034517765045166, - "step": 49, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9982198476791382 - }, - { - "episode": 816, - "epoch": 0.01466728978682101, - "loss/policy_avg": 0.21048909425735474, - "lr": 9.968047034764828e-06, - "objective/entropy": 16.597354888916016, - "objective/kl": 22.140174865722656, - "objective/non_score_reward": -2.214017629623413, - "objective/rlhf_reward": -6.733364107386146, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 327.35992431640625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4595518708229065, - "step": 50, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974133968353271 - }, - { - "episode": 832, - "epoch": 0.014954883704209656, - "loss/policy_avg": 0.6745895147323608, - "lr": 9.967407975460123e-06, - "objective/entropy": 26.577850341796875, - "objective/kl": 15.099103927612305, - "objective/non_score_reward": -1.5099103450775146, - "objective/rlhf_reward": -4.435521040026265, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 52.37441635131836, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5440022945404053, - "step": 51, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997567057609558 - }, - { - "episode": 848, - "epoch": 0.015242477621598303, - "loss/policy_avg": 1.5183483362197876, - "lr": 9.96676891615542e-06, - "objective/entropy": -133.34732055664062, - "objective/kl": 15.838411331176758, - "objective/non_score_reward": -1.58384108543396, - "objective/rlhf_reward": -4.9937288074785755, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 181.23886108398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.491477906703949, - "step": 52, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999549388885498 - }, - { - "episode": 864, - "epoch": 0.01553007153898695, - "loss/policy_avg": 1.0412685871124268, - "lr": 9.966129856850717e-06, - "objective/entropy": -12.032562255859375, - "objective/kl": 13.811055183410645, - "objective/non_score_reward": -1.3811054229736328, - "objective/rlhf_reward": -4.008649730476078, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 197.14422607421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5553240776062012, - "step": 53, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998724341392517 - }, - { - "episode": 880, - "epoch": 0.0158176654563756, - "loss/policy_avg": 0.11221161484718323, - "lr": 9.965490797546014e-06, - "objective/entropy": 239.8121795654297, - "objective/kl": 14.03902816772461, - "objective/non_score_reward": -1.4039026498794556, - "objective/rlhf_reward": -3.790781910690378, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 54.992515563964844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8224223852157593, - "step": 54, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998995304107666 - }, - { - "episode": 896, - "epoch": 0.016105259373764245, - "loss/policy_avg": 0.27755433320999146, - "lr": 9.96485173824131e-06, - "objective/entropy": 311.885009765625, - "objective/kl": 21.855777740478516, - "objective/non_score_reward": -2.185577630996704, - "objective/rlhf_reward": -8.742310643196106, - "objective/scores": 0.0, - "policy/approxkl_avg": 196.02963256835938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7535547018051147, - "step": 55, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981474876403809 - }, - { - "episode": 912, - "epoch": 0.016392853291152892, - "loss/policy_avg": 0.23765933513641357, - "lr": 9.964212678936606e-06, - "objective/entropy": -135.26939392089844, - "objective/kl": 17.994558334350586, - "objective/non_score_reward": -1.7994558811187744, - "objective/rlhf_reward": -5.464490548769633, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 143.15103149414062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7061681747436523, - "step": 56, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001481771469116 - }, - { - "episode": 928, - "epoch": 0.01668044720854154, - "loss/policy_avg": -0.09936670958995819, - "lr": 9.963573619631903e-06, - "objective/entropy": 273.41107177734375, - "objective/kl": 17.296648025512695, - "objective/non_score_reward": -1.7296650409698486, - "objective/rlhf_reward": -5.559410088990612, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 170.04476928710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.664265513420105, - "step": 57, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996645212173462 - }, - { - "episode": 944, - "epoch": 0.016968041125930186, - "loss/policy_avg": -0.40471351146698, - "lr": 9.9629345603272e-06, - "objective/entropy": 91.30682373046875, - "objective/kl": 9.7944974899292, - "objective/non_score_reward": -0.9794497489929199, - "objective/rlhf_reward": -2.361539780107096, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 46.471343994140625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6433554887771606, - "step": 58, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0122342109680176 - }, - { - "episode": 960, - "epoch": 0.017255635043318833, - "loss/policy_avg": 0.758541464805603, - "lr": 9.962295501022495e-06, - "objective/entropy": -28.099227905273438, - "objective/kl": 15.942657470703125, - "objective/non_score_reward": -1.5942658185958862, - "objective/rlhf_reward": -4.2543568483748775, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 199.81590270996094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6792592406272888, - "step": 59, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988605976104736 - }, - { - "episode": 976, - "epoch": 0.01754322896070748, - "loss/policy_avg": 0.11890214681625366, - "lr": 9.961656441717792e-06, - "objective/entropy": 48.058135986328125, - "objective/kl": 16.837148666381836, - "objective/non_score_reward": -1.6837148666381836, - "objective/rlhf_reward": -5.130739483896809, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 123.93780517578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8319634199142456, - "step": 60, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987045526504517 - }, - { - "episode": 992, - "epoch": 0.017830822878096127, - "loss/policy_avg": -0.3681066036224365, - "lr": 9.961017382413088e-06, - "objective/entropy": -12.798896789550781, - "objective/kl": 13.068469047546387, - "objective/non_score_reward": -1.3068468570709229, - "objective/rlhf_reward": -3.8857517450148156, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 29.726402282714844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.425646036863327, - "step": 61, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.004940986633301 - }, - { - "episode": 1008, - "epoch": 0.018118416795484777, - "loss/policy_avg": 0.6650391221046448, - "lr": 9.960378323108385e-06, - "objective/entropy": 172.82774353027344, - "objective/kl": 19.25320053100586, - "objective/non_score_reward": -1.9253199100494385, - "objective/rlhf_reward": -5.753868768887456, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 61.55193328857422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.707613468170166, - "step": 62, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9996411800384521 - }, - { - "episode": 1024, - "epoch": 0.018406010712873424, - "loss/policy_avg": 0.006029143929481506, - "lr": 9.959739263803682e-06, - "objective/entropy": 84.45201110839844, - "objective/kl": 9.024871826171875, - "objective/non_score_reward": -0.9024871587753296, - "objective/rlhf_reward": -2.1593505545571894, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 54.238502502441406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7926748991012573, - "step": 63, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0010342597961426 - }, - { - "episode": 1040, - "epoch": 0.01869360463026207, - "loss/policy_avg": 0.43513649702072144, - "lr": 9.959100204498979e-06, - "objective/entropy": 271.2078857421875, - "objective/kl": 19.053577423095703, - "objective/non_score_reward": -1.905357837677002, - "objective/rlhf_reward": -6.2959189749061295, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 146.22186279296875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 1.0022022724151611, - "step": 64, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983612298965454 - }, - { - "episode": 1056, - "epoch": 0.018981198547650718, - "loss/policy_avg": 0.207576721906662, - "lr": 9.958461145194274e-06, - "objective/entropy": 61.16169738769531, - "objective/kl": 12.455079078674316, - "objective/non_score_reward": -1.2455079555511475, - "objective/rlhf_reward": -3.1572031929817905, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 90.93212890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8675985336303711, - "step": 65, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000087022781372 - }, - { - "episode": 1072, - "epoch": 0.019268792465039365, - "loss/policy_avg": 0.036766890436410904, - "lr": 9.957822085889571e-06, - "objective/entropy": -109.95204162597656, - "objective/kl": 18.774991989135742, - "objective/non_score_reward": -1.8774993419647217, - "objective/rlhf_reward": -5.776663676897684, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 143.13140869140625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6979721188545227, - "step": 66, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986827373504639 - }, - { - "episode": 1088, - "epoch": 0.01955638638242801, - "loss/policy_avg": 0.2812209725379944, - "lr": 9.957183026584868e-06, - "objective/entropy": 347.89093017578125, - "objective/kl": 18.375164031982422, - "objective/non_score_reward": -1.8375165462493896, - "objective/rlhf_reward": -5.688206379831421, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 156.17637634277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8508192300796509, - "step": 67, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0011837482452393 - }, - { - "episode": 1104, - "epoch": 0.019843980299816658, - "loss/policy_avg": 0.4886673092842102, - "lr": 9.956543967280165e-06, - "objective/entropy": 386.39532470703125, - "objective/kl": 21.181537628173828, - "objective/non_score_reward": -2.1181535720825195, - "objective/rlhf_reward": -8.472614765167236, - "objective/scores": 0.0, - "policy/approxkl_avg": 175.6392822265625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9587714672088623, - "step": 68, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9981954097747803 - }, - { - "episode": 1120, - "epoch": 0.020131574217205305, - "loss/policy_avg": 0.20124448835849762, - "lr": 9.955904907975462e-06, - "objective/entropy": -67.81639099121094, - "objective/kl": 18.70073127746582, - "objective/non_score_reward": -1.8700731992721558, - "objective/rlhf_reward": -6.05646069784936, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 92.5486831665039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6175429224967957, - "step": 69, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00050950050354 - }, - { - "episode": 1136, - "epoch": 0.020419168134593952, - "loss/policy_avg": -0.021352097392082214, - "lr": 9.955265848670757e-06, - "objective/entropy": -17.604766845703125, - "objective/kl": 21.45330810546875, - "objective/non_score_reward": -2.1453306674957275, - "objective/rlhf_reward": -6.919463162839996, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 400.26580810546875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9343768358230591, - "step": 70, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998741149902344 - }, - { - "episode": 1152, - "epoch": 0.0207067620519826, - "loss/policy_avg": 1.1225731372833252, - "lr": 9.954626789366054e-06, - "objective/entropy": 25.78099822998047, - "objective/kl": 14.004438400268555, - "objective/non_score_reward": -1.4004437923431396, - "objective/rlhf_reward": -3.776946599754404, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 111.43013000488281, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5702972412109375, - "step": 71, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973070621490479 - }, - { - "episode": 1168, - "epoch": 0.02099435596937125, - "loss/policy_avg": 0.25385117530822754, - "lr": 9.95398773006135e-06, - "objective/entropy": -119.72091674804688, - "objective/kl": 15.869585037231445, - "objective/non_score_reward": -1.586958408355713, - "objective/rlhf_reward": -4.400422762112553, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 68.1309814453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6043493151664734, - "step": 72, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9978251457214355 - }, - { - "episode": 1184, - "epoch": 0.021281949886759896, - "loss/policy_avg": 1.0585708618164062, - "lr": 9.953348670756648e-06, - "objective/entropy": 175.019775390625, - "objective/kl": 22.812929153442383, - "objective/non_score_reward": -2.2812929153442383, - "objective/rlhf_reward": -7.7996585703193375, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 238.55490112304688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.764386773109436, - "step": 73, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998070478439331 - }, - { - "episode": 1200, - "epoch": 0.021569543804148543, - "loss/policy_avg": 0.3202959895133972, - "lr": 9.952709611451944e-06, - "objective/entropy": -55.03008270263672, - "objective/kl": 20.011316299438477, - "objective/non_score_reward": -2.001131772994995, - "objective/rlhf_reward": -5.60452709197998, - "objective/scores": 0.6, - "policy/approxkl_avg": 218.61663818359375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7646293640136719, - "step": 74, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000119686126709 - }, - { - "episode": 1216, - "epoch": 0.02185713772153719, - "loss/policy_avg": 0.07566210627555847, - "lr": 9.952070552147241e-06, - "objective/entropy": -71.89826965332031, - "objective/kl": 18.70985984802246, - "objective/non_score_reward": -1.8709862232208252, - "objective/rlhf_reward": -5.536533663945134, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 150.85037231445312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.622381865978241, - "step": 75, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000347137451172 - }, - { - "episode": 1232, - "epoch": 0.022144731638925837, - "loss/policy_avg": 1.0919684171676636, - "lr": 9.951431492842536e-06, - "objective/entropy": 52.233760833740234, - "objective/kl": 16.872692108154297, - "objective/non_score_reward": -1.6872694492340088, - "objective/rlhf_reward": -5.1449576354661755, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 106.15821838378906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5982179641723633, - "step": 76, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982967376708984 - }, - { - "episode": 1248, - "epoch": 0.022432325556314484, - "loss/policy_avg": 0.08637362718582153, - "lr": 9.950792433537833e-06, - "objective/entropy": -41.915985107421875, - "objective/kl": 13.70026969909668, - "objective/non_score_reward": -1.37002694606781, - "objective/rlhf_reward": -3.81824833673297, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 65.06179809570312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7901297807693481, - "step": 77, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979660511016846 - }, - { - "episode": 1264, - "epoch": 0.02271991947370313, - "loss/policy_avg": 0.25904232263565063, - "lr": 9.950153374233129e-06, - "objective/entropy": 80.24528503417969, - "objective/kl": 11.508593559265137, - "objective/non_score_reward": -1.1508593559265137, - "objective/rlhf_reward": -3.1224849848107095, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 80.6361312866211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5768579244613647, - "step": 78, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997675895690918 - }, - { - "episode": 1280, - "epoch": 0.023007513391091777, - "loss/policy_avg": 1.0851349830627441, - "lr": 9.949514314928425e-06, - "objective/entropy": 179.42474365234375, - "objective/kl": 17.690536499023438, - "objective/non_score_reward": -1.7690538167953491, - "objective/rlhf_reward": -5.128804068045552, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 140.53465270996094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5955301523208618, - "step": 79, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972807168960571 - }, - { - "episode": 1296, - "epoch": 0.023295107308480424, - "loss/policy_avg": 0.10646107792854309, - "lr": 9.948875255623722e-06, - "objective/entropy": 192.84939575195312, - "objective/kl": 20.0378360748291, - "objective/non_score_reward": -2.003783702850342, - "objective/rlhf_reward": -8.015134632587433, - "objective/scores": 0.0, - "policy/approxkl_avg": 228.60391235351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5153179168701172, - "step": 80, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999485969543457 - }, - { - "episode": 1312, - "epoch": 0.023582701225869074, - "loss/policy_avg": 0.4094586968421936, - "lr": 9.94823619631902e-06, - "objective/entropy": -145.0159912109375, - "objective/kl": 16.018333435058594, - "objective/non_score_reward": -1.6018333435058594, - "objective/rlhf_reward": -6.407333076000214, - "objective/scores": 0.0, - "policy/approxkl_avg": 113.58265686035156, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6427879333496094, - "step": 81, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979770183563232 - }, - { - "episode": 1328, - "epoch": 0.02387029514325772, - "loss/policy_avg": 0.5852205753326416, - "lr": 9.947597137014316e-06, - "objective/entropy": -188.32510375976562, - "objective/kl": 16.71861457824707, - "objective/non_score_reward": -1.6718615293502808, - "objective/rlhf_reward": -5.328196370337887, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 47.16347122192383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4781952202320099, - "step": 82, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979345798492432 - }, - { - "episode": 1344, - "epoch": 0.024157889060646368, - "loss/policy_avg": 0.7273481488227844, - "lr": 9.946958077709611e-06, - "objective/entropy": -261.5775146484375, - "objective/kl": 17.21273422241211, - "objective/non_score_reward": -1.721273422241211, - "objective/rlhf_reward": -5.280973825518208, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 91.5807113647461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7938202619552612, - "step": 83, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998490333557129 - }, - { - "episode": 1360, - "epoch": 0.024445482978035015, - "loss/policy_avg": 0.6384750604629517, - "lr": 9.946319018404908e-06, - "objective/entropy": -149.5916290283203, - "objective/kl": 21.390371322631836, - "objective/non_score_reward": -2.1390371322631836, - "objective/rlhf_reward": -7.230635855227632, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 152.96697998046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.733663022518158, - "step": 84, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9965767860412598 - }, - { - "episode": 1376, - "epoch": 0.024733076895423662, - "loss/policy_avg": 1.4870085716247559, - "lr": 9.945679959100205e-06, - "objective/entropy": 279.02581787109375, - "objective/kl": 13.598295211791992, - "objective/non_score_reward": -1.3598296642303467, - "objective/rlhf_reward": -4.015486557682125, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 75.3885269165039, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9894934296607971, - "step": 85, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0009098052978516 - }, - { - "episode": 1392, - "epoch": 0.02502067081281231, - "loss/policy_avg": 0.34267449378967285, - "lr": 9.945040899795502e-06, - "objective/entropy": -9.687551498413086, - "objective/kl": 17.537944793701172, - "objective/non_score_reward": -1.7537946701049805, - "objective/rlhf_reward": -4.091459666134092, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 86.10018920898438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7380132675170898, - "step": 86, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9966371059417725 - }, - { - "episode": 1408, - "epoch": 0.025308264730200956, - "loss/policy_avg": 0.662402868270874, - "lr": 9.944401840490799e-06, - "objective/entropy": 182.38612365722656, - "objective/kl": 17.891094207763672, - "objective/non_score_reward": -1.789109230041504, - "objective/rlhf_reward": -5.705838660807952, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 27.472164154052734, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6948930025100708, - "step": 87, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972063302993774 - }, - { - "episode": 1424, - "epoch": 0.025595858647589603, - "loss/policy_avg": 2.4642419815063477, - "lr": 9.943762781186096e-06, - "objective/entropy": 9.746139526367188, - "objective/kl": 17.692127227783203, - "objective/non_score_reward": -1.7692127227783203, - "objective/rlhf_reward": -5.735215237646727, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 221.2765350341797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6335443258285522, - "step": 88, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997534990310669 - }, - { - "episode": 1440, - "epoch": 0.02588345256497825, - "loss/policy_avg": 0.4348924160003662, - "lr": 9.94312372188139e-06, - "objective/entropy": 134.22723388671875, - "objective/kl": 27.34999656677246, - "objective/non_score_reward": -2.734999895095825, - "objective/rlhf_reward": -9.278140073240387, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 201.6007080078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8156861662864685, - "step": 89, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973480701446533 - }, - { - "episode": 1456, - "epoch": 0.026171046482366896, - "loss/policy_avg": 0.1291811764240265, - "lr": 9.942484662576688e-06, - "objective/entropy": 112.80955505371094, - "objective/kl": 15.703033447265625, - "objective/non_score_reward": -1.5703033208847046, - "objective/rlhf_reward": -4.333801994996007, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 155.86367797851562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5453826189041138, - "step": 90, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969561100006104 - }, - { - "episode": 1472, - "epoch": 0.026458640399755547, - "loss/policy_avg": 0.5085259675979614, - "lr": 9.941845603271985e-06, - "objective/entropy": 178.85531616210938, - "objective/kl": 17.638357162475586, - "objective/non_score_reward": -1.7638356685638428, - "objective/rlhf_reward": -5.574390235360026, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 101.44283294677734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42108362913131714, - "step": 91, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9999537467956543 - }, - { - "episode": 1488, - "epoch": 0.026746234317144194, - "loss/policy_avg": 0.6299684643745422, - "lr": 9.941206543967281e-06, - "objective/entropy": -44.030662536621094, - "objective/kl": 20.842021942138672, - "objective/non_score_reward": -2.0842020511627197, - "objective/rlhf_reward": -6.977558457587643, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 154.17724609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8085545301437378, - "step": 92, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997336506843567 - }, - { - "episode": 1504, - "epoch": 0.02703382823453284, - "loss/policy_avg": 0.334034264087677, - "lr": 9.940567484662578e-06, - "objective/entropy": 137.08668518066406, - "objective/kl": 12.822792053222656, - "objective/non_score_reward": -1.2822792530059814, - "objective/rlhf_reward": -5.129117101430893, - "objective/scores": 0.0, - "policy/approxkl_avg": 28.62427520751953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5771763920783997, - "step": 93, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001514196395874 - }, - { - "episode": 1520, - "epoch": 0.027321422151921487, - "loss/policy_avg": 0.5848271250724792, - "lr": 9.939928425357874e-06, - "objective/entropy": -91.07750701904297, - "objective/kl": 12.661925315856934, - "objective/non_score_reward": -1.2661924362182617, - "objective/rlhf_reward": -3.239941294464182, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 20.221342086791992, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7387726306915283, - "step": 94, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965746402740479 - }, - { - "episode": 1536, - "epoch": 0.027609016069310134, - "loss/policy_avg": 0.45774608850479126, - "lr": 9.93928936605317e-06, - "objective/entropy": 61.91606903076172, - "objective/kl": 13.879372596740723, - "objective/non_score_reward": -1.387937307357788, - "objective/rlhf_reward": -4.151749169826507, - "objective/scores": 0.35, - "policy/approxkl_avg": 77.81513977050781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4593799114227295, - "step": 95, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001408576965332 - }, - { - "episode": 1552, - "epoch": 0.02789660998669878, - "loss/policy_avg": 0.15000438690185547, - "lr": 9.938650306748467e-06, - "objective/entropy": 105.67562866210938, - "objective/kl": 19.344045639038086, - "objective/non_score_reward": -1.9344044923782349, - "objective/rlhf_reward": -6.287019650550231, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 139.15414428710938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7023290991783142, - "step": 96, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000124454498291 - }, - { - "episode": 1568, - "epoch": 0.028184203904087428, - "loss/policy_avg": 0.15950141847133636, - "lr": 9.938011247443764e-06, - "objective/entropy": 163.98699951171875, - "objective/kl": 27.47066307067871, - "objective/non_score_reward": -2.7470664978027344, - "objective/rlhf_reward": -8.865559878126655, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 212.09678649902344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8216714859008789, - "step": 97, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999010562896729 - }, - { - "episode": 1584, - "epoch": 0.028471797821476075, - "loss/policy_avg": 0.08940532058477402, - "lr": 9.937372188139061e-06, - "objective/entropy": -107.25084686279297, - "objective/kl": 20.170251846313477, - "objective/non_score_reward": -2.0170252323150635, - "objective/rlhf_reward": -5.144382034183714, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 45.363121032714844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7300325632095337, - "step": 98, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9954917430877686 - }, - { - "episode": 1600, - "epoch": 0.02875939173886472, - "loss/policy_avg": 0.1482250690460205, - "lr": 9.936733128834358e-06, - "objective/entropy": 225.79022216796875, - "objective/kl": 18.687984466552734, - "objective/non_score_reward": -1.8687984943389893, - "objective/rlhf_reward": -6.024595717997894, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 224.1140594482422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.834899365901947, - "step": 99, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9977468252182007 - }, - { - "episode": 1616, - "epoch": 0.029046985656253372, - "loss/policy_avg": 0.1928468644618988, - "lr": 9.936094069529653e-06, - "objective/entropy": 8.741950988769531, - "objective/kl": 15.258420944213867, - "objective/non_score_reward": -1.5258420705795288, - "objective/rlhf_reward": -4.652770261378631, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 31.214942932128906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5689660310745239, - "step": 100, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975221157073975 - }, - { - "episode": 1632, - "epoch": 0.02933457957364202, - "loss/policy_avg": -0.0598021075129509, - "lr": 9.93545501022495e-06, - "objective/entropy": -18.486255645751953, - "objective/kl": 25.29681396484375, - "objective/non_score_reward": -2.5296812057495117, - "objective/rlhf_reward": -7.718725478649139, - "objective/scores": 0.6, - "policy/approxkl_avg": 111.83526611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5892493724822998, - "step": 101, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977436065673828 - }, - { - "episode": 1648, - "epoch": 0.029622173491030666, - "loss/policy_avg": 0.47538769245147705, - "lr": 9.934815950920245e-06, - "objective/entropy": -110.20201873779297, - "objective/kl": 22.853384017944336, - "objective/non_score_reward": -2.2853384017944336, - "objective/rlhf_reward": -7.690755407424316, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 73.58186340332031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7267776727676392, - "step": 102, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981677532196045 - }, - { - "episode": 1664, - "epoch": 0.029909767408419313, - "loss/policy_avg": -0.00016094697639346123, - "lr": 9.934176891615542e-06, - "objective/entropy": 144.46910095214844, - "objective/kl": 16.481285095214844, - "objective/non_score_reward": -1.6481282711029053, - "objective/rlhf_reward": -5.26700029882781, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 115.55538177490234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8190140724182129, - "step": 103, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999662160873413 - }, - { - "episode": 1680, - "epoch": 0.03019736132580796, - "loss/policy_avg": 0.3124885559082031, - "lr": 9.933537832310839e-06, - "objective/entropy": 82.22334289550781, - "objective/kl": 23.603931427001953, - "objective/non_score_reward": -2.3603932857513428, - "objective/rlhf_reward": -8.04157326221466, - "objective/scores": 0.35, - "policy/approxkl_avg": 249.64776611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6342385411262512, - "step": 104, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999876022338867 - }, - { - "episode": 1696, - "epoch": 0.030484955243196606, - "loss/policy_avg": 0.5430713891983032, - "lr": 9.932898773006136e-06, - "objective/entropy": 120.59968566894531, - "objective/kl": 16.078868865966797, - "objective/non_score_reward": -1.6078869104385376, - "objective/rlhf_reward": -4.87528809806402, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 28.921520233154297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7989763021469116, - "step": 105, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9993932247161865 - }, - { - "episode": 1712, - "epoch": 0.030772549160585253, - "loss/policy_avg": 0.40486371517181396, - "lr": 9.932259713701433e-06, - "objective/entropy": -83.70709228515625, - "objective/kl": 21.060504913330078, - "objective/non_score_reward": -2.106050491333008, - "objective/rlhf_reward": -6.908430480750736, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 57.908729553222656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.8018752932548523, - "step": 106, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995449781417847 - }, - { - "episode": 1728, - "epoch": 0.0310601430779739, - "loss/policy_avg": 0.4627416133880615, - "lr": 9.931620654396728e-06, - "objective/entropy": -27.708335876464844, - "objective/kl": 19.676761627197266, - "objective/non_score_reward": -1.9676761627197266, - "objective/rlhf_reward": -7.870704412460327, - "objective/scores": 0.0, - "policy/approxkl_avg": 89.03375244140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7063722014427185, - "step": 107, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978171586990356 - }, - { - "episode": 1744, - "epoch": 0.03134773699536255, - "loss/policy_avg": 0.24644207954406738, - "lr": 9.930981595092025e-06, - "objective/entropy": 84.95053100585938, - "objective/kl": 17.334156036376953, - "objective/non_score_reward": -1.7334158420562744, - "objective/rlhf_reward": -5.417891824039158, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 59.91339111328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5948917269706726, - "step": 108, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983779191970825 - }, - { - "episode": 1760, - "epoch": 0.0316353309127512, - "loss/policy_avg": 0.10573781281709671, - "lr": 9.930342535787322e-06, - "objective/entropy": 37.63609313964844, - "objective/kl": 19.209318161010742, - "objective/non_score_reward": -1.9209318161010742, - "objective/rlhf_reward": -5.56102097250608, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 49.033843994140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.550654411315918, - "step": 109, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999387264251709 - }, - { - "episode": 1776, - "epoch": 0.031922924830139844, - "loss/policy_avg": 1.9212778806686401, - "lr": 9.929703476482619e-06, - "objective/entropy": -80.58729553222656, - "objective/kl": 21.281909942626953, - "objective/non_score_reward": -2.1281909942626953, - "objective/rlhf_reward": -6.779431001345316, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 400.0589599609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7861940860748291, - "step": 110, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973957538604736 - }, - { - "episode": 1792, - "epoch": 0.03221051874752849, - "loss/policy_avg": 0.275944322347641, - "lr": 9.929064417177915e-06, - "objective/entropy": -208.78277587890625, - "objective/kl": 19.070934295654297, - "objective/non_score_reward": -1.9070935249328613, - "objective/rlhf_reward": -3.228373861312866, - "objective/scores": 1.1, - "policy/approxkl_avg": 49.866607666015625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6659146547317505, - "step": 111, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001814842224121 - }, - { - "episode": 1808, - "epoch": 0.03249811266491714, - "loss/policy_avg": 0.01928192749619484, - "lr": 9.928425357873212e-06, - "objective/entropy": -55.08910369873047, - "objective/kl": 13.248590469360352, - "objective/non_score_reward": -1.3248591423034668, - "objective/rlhf_reward": -3.940187000964565, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 120.32939147949219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7349205017089844, - "step": 112, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983385801315308 - }, - { - "episode": 1824, - "epoch": 0.032785706582305785, - "loss/policy_avg": 0.46047085523605347, - "lr": 9.927786298568507e-06, - "objective/entropy": 62.72016906738281, - "objective/kl": 23.932682037353516, - "objective/non_score_reward": -2.39326810836792, - "objective/rlhf_reward": -5.173073148727417, - "objective/scores": 1.1, - "policy/approxkl_avg": 117.28857421875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7454954385757446, - "step": 113, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001185894012451 - }, - { - "episode": 1840, - "epoch": 0.03307330049969443, - "loss/policy_avg": 0.45565682649612427, - "lr": 9.927147239263804e-06, - "objective/entropy": 91.39878845214844, - "objective/kl": 23.617773056030273, - "objective/non_score_reward": -2.3617773056030273, - "objective/rlhf_reward": -8.121596846610231, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 150.77520751953125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6033438444137573, - "step": 114, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999167919158936 - }, - { - "episode": 1856, - "epoch": 0.03336089441708308, - "loss/policy_avg": 0.7295475602149963, - "lr": 9.926508179959101e-06, - "objective/entropy": 207.79177856445312, - "objective/kl": 27.30187225341797, - "objective/non_score_reward": -2.73018741607666, - "objective/rlhf_reward": -9.187415854136148, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 190.16458129882812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.674660861492157, - "step": 115, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992830753326416 - }, - { - "episode": 1872, - "epoch": 0.033648488334471725, - "loss/policy_avg": 0.6712960600852966, - "lr": 9.925869120654398e-06, - "objective/entropy": -117.77911376953125, - "objective/kl": 24.611560821533203, - "objective/non_score_reward": -2.461156129837036, - "objective/rlhf_reward": -8.485374891494198, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 180.31971740722656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8039132356643677, - "step": 116, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9965860843658447 - }, - { - "episode": 1888, - "epoch": 0.03393608225186037, - "loss/policy_avg": 0.34298884868621826, - "lr": 9.925230061349695e-06, - "objective/entropy": -77.61781311035156, - "objective/kl": 19.15323829650879, - "objective/non_score_reward": -1.9153238534927368, - "objective/rlhf_reward": -5.2612955331802365, - "objective/scores": 0.6, - "policy/approxkl_avg": 146.52737426757812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5704625248908997, - "step": 117, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994972944259644 - }, - { - "episode": 1904, - "epoch": 0.03422367616924902, - "loss/policy_avg": 0.038467422127723694, - "lr": 9.92459100204499e-06, - "objective/entropy": 49.699798583984375, - "objective/kl": 14.500207901000977, - "objective/non_score_reward": -1.4500207901000977, - "objective/rlhf_reward": -4.3762512101727395, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 53.40303421020508, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7844617366790771, - "step": 118, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998944878578186 - }, - { - "episode": 1920, - "epoch": 0.034511270086637666, - "loss/policy_avg": 0.21085724234580994, - "lr": 9.923951942740287e-06, - "objective/entropy": 171.3436279296875, - "objective/kl": 26.085678100585938, - "objective/non_score_reward": -2.608567714691162, - "objective/rlhf_reward": -10.434271335601807, - "objective/scores": 0.0, - "policy/approxkl_avg": 62.63068389892578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7893345355987549, - "step": 119, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0008397102355957 - }, - { - "episode": 1936, - "epoch": 0.03479886400402631, - "loss/policy_avg": 2.211275100708008, - "lr": 9.923312883435584e-06, - "objective/entropy": -61.354347229003906, - "objective/kl": 20.615779876708984, - "objective/non_score_reward": -2.0615780353546143, - "objective/rlhf_reward": -6.82247974415597, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 51.41122055053711, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6323862075805664, - "step": 120, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0031816959381104 - }, - { - "episode": 1952, - "epoch": 0.03508645792141496, - "loss/policy_avg": 0.09407002478837967, - "lr": 9.92267382413088e-06, - "objective/entropy": 61.494049072265625, - "objective/kl": 16.42398452758789, - "objective/non_score_reward": -1.6423983573913574, - "objective/rlhf_reward": -5.053821527751621, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 42.54444122314453, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6436998844146729, - "step": 121, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003204345703125 - }, - { - "episode": 1968, - "epoch": 0.03537405183880361, - "loss/policy_avg": 4.1326775550842285, - "lr": 9.922034764826178e-06, - "objective/entropy": 182.5974884033203, - "objective/kl": 23.137367248535156, - "objective/non_score_reward": -2.313736915588379, - "objective/rlhf_reward": -7.895697408650799, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 43.000953674316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4477632939815521, - "step": 122, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996920824050903 - }, - { - "episode": 1984, - "epoch": 0.03566164575619225, - "loss/policy_avg": 0.6063717603683472, - "lr": 9.921395705521473e-06, - "objective/entropy": 374.73065185546875, - "objective/kl": 19.553401947021484, - "objective/non_score_reward": -1.9553401470184326, - "objective/rlhf_reward": -6.4213602304458615, - "objective/scores": 0.35, - "policy/approxkl_avg": 66.70773315429688, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8944922685623169, - "step": 123, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997408151626587 - }, - { - "episode": 2000, - "epoch": 0.03594923967358091, - "loss/policy_avg": -0.24504688382148743, - "lr": 9.92075664621677e-06, - "objective/entropy": 61.65036392211914, - "objective/kl": 24.3512020111084, - "objective/non_score_reward": -2.435120105743408, - "objective/rlhf_reward": -8.39884465029779, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 52.655914306640625, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.7538488507270813, - "step": 124, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.018387794494629 - }, - { - "episode": 2016, - "epoch": 0.036236833590969554, - "loss/policy_avg": 0.5044976472854614, - "lr": 9.920117586912067e-06, - "objective/entropy": 252.68862915039062, - "objective/kl": 31.767620086669922, - "objective/non_score_reward": -3.176762104034424, - "objective/rlhf_reward": -11.34779831144659, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 143.51535034179688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6717317700386047, - "step": 125, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9978291988372803 - }, - { - "episode": 2032, - "epoch": 0.0365244275083582, - "loss/policy_avg": 0.5120067596435547, - "lr": 9.919478527607362e-06, - "objective/entropy": -164.51651000976562, - "objective/kl": 27.349618911743164, - "objective/non_score_reward": -2.734961986541748, - "objective/rlhf_reward": -9.539847826957702, - "objective/scores": 0.35, - "policy/approxkl_avg": 40.27062225341797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7697643041610718, - "step": 126, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998326301574707 - }, - { - "episode": 2048, - "epoch": 0.03681202142574685, - "loss/policy_avg": 0.6887588500976562, - "lr": 9.918839468302659e-06, - "objective/entropy": 69.27521514892578, - "objective/kl": 22.67069435119629, - "objective/non_score_reward": -2.2670693397521973, - "objective/rlhf_reward": -7.6896756077683985, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 162.61952209472656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6613045930862427, - "step": 127, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999122977256775 - }, - { - "episode": 2064, - "epoch": 0.037099615343135495, - "loss/policy_avg": 0.3447116017341614, - "lr": 9.918200408997956e-06, - "objective/entropy": 212.85906982421875, - "objective/kl": 26.792640686035156, - "objective/non_score_reward": -2.6792640686035156, - "objective/rlhf_reward": -8.983723298708597, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 97.52276611328125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9090495109558105, - "step": 128, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001721382141113 - }, - { - "episode": 2080, - "epoch": 0.03738720926052414, - "loss/policy_avg": 0.48409304022789, - "lr": 9.917561349693252e-06, - "objective/entropy": 190.03709411621094, - "objective/kl": 18.16942596435547, - "objective/non_score_reward": -1.8169424533843994, - "objective/rlhf_reward": -5.145064058081184, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 45.51705551147461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.46248874068260193, - "step": 129, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9967193603515625 - }, - { - "episode": 2096, - "epoch": 0.03767480317791279, - "loss/policy_avg": 0.5664651989936829, - "lr": 9.91692229038855e-06, - "objective/entropy": 192.7556610107422, - "objective/kl": 20.14044952392578, - "objective/non_score_reward": -2.014044761657715, - "objective/rlhf_reward": -6.231350894245217, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 29.526737213134766, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8081471920013428, - "step": 130, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9979431629180908 - }, - { - "episode": 2112, - "epoch": 0.037962397095301435, - "loss/policy_avg": 0.41585665941238403, - "lr": 9.916283231083844e-06, - "objective/entropy": -49.55967712402344, - "objective/kl": 34.61396789550781, - "objective/non_score_reward": -3.4613966941833496, - "objective/rlhf_reward": -12.466984846679072, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 176.34915161132812, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7800864577293396, - "step": 131, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994279146194458 - }, - { - "episode": 2128, - "epoch": 0.03824999101269008, - "loss/policy_avg": 0.11256570369005203, - "lr": 9.915644171779141e-06, - "objective/entropy": 156.05429077148438, - "objective/kl": 29.602603912353516, - "objective/non_score_reward": -2.9602606296539307, - "objective/rlhf_reward": -7.441042518615722, - "objective/scores": 1.1, - "policy/approxkl_avg": 77.59361267089844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8300012350082397, - "step": 132, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990795850753784 - }, - { - "episode": 2144, - "epoch": 0.03853758493007873, - "loss/policy_avg": 0.5523125529289246, - "lr": 9.915005112474438e-06, - "objective/entropy": -9.20687484741211, - "objective/kl": 18.338748931884766, - "objective/non_score_reward": -1.8338749408721924, - "objective/rlhf_reward": -5.510670955452035, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.425865173339844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6365004777908325, - "step": 133, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000248908996582 - }, - { - "episode": 2160, - "epoch": 0.038825178847467376, - "loss/policy_avg": -0.049435317516326904, - "lr": 9.914366053169735e-06, - "objective/entropy": -86.67240142822266, - "objective/kl": 31.985502243041992, - "objective/non_score_reward": -3.198550224304199, - "objective/rlhf_reward": -11.237941711154535, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 199.85411071777344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7536939382553101, - "step": 134, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000448703765869 - }, - { - "episode": 2176, - "epoch": 0.03911277276485602, - "loss/policy_avg": 2.6105287075042725, - "lr": 9.913726993865032e-06, - "objective/entropy": -58.581077575683594, - "objective/kl": 16.238283157348633, - "objective/non_score_reward": -1.6238282918930054, - "objective/rlhf_reward": -6.495313286781311, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.216068267822266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5845435857772827, - "step": 135, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.019744396209717 - }, - { - "episode": 2192, - "epoch": 0.03940036668224467, - "loss/policy_avg": 0.24281054735183716, - "lr": 9.913087934560329e-06, - "objective/entropy": -94.54998779296875, - "objective/kl": 24.751571655273438, - "objective/non_score_reward": -2.4751572608947754, - "objective/rlhf_reward": -8.384857022556004, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 124.94600677490234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6612954139709473, - "step": 136, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9992079734802246 - }, - { - "episode": 2208, - "epoch": 0.039687960599633317, - "loss/policy_avg": 0.3692808449268341, - "lr": 9.912448875255624e-06, - "objective/entropy": 131.55032348632812, - "objective/kl": 33.53261184692383, - "objective/non_score_reward": -3.3532609939575195, - "objective/rlhf_reward": -10.489325438381407, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.712066650390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6237879395484924, - "step": 137, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970109462738037 - }, - { - "episode": 2224, - "epoch": 0.03997555451702196, - "loss/policy_avg": 0.7208542823791504, - "lr": 9.911809815950921e-06, - "objective/entropy": -9.526227951049805, - "objective/kl": 23.76919937133789, - "objective/non_score_reward": -2.376919984817505, - "objective/rlhf_reward": -9.507680296897888, - "objective/scores": 0.0, - "policy/approxkl_avg": 324.29388427734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5352585315704346, - "step": 138, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988393783569336 - }, - { - "episode": 2240, - "epoch": 0.04026314843441061, - "loss/policy_avg": 0.24535533785820007, - "lr": 9.911170756646218e-06, - "objective/entropy": -117.0184555053711, - "objective/kl": 12.411272048950195, - "objective/non_score_reward": -1.2411272525787354, - "objective/rlhf_reward": -3.5139104529336542, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 86.15127563476562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6268381476402283, - "step": 139, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967679977416992 - }, - { - "episode": 2256, - "epoch": 0.04055074235179926, - "loss/policy_avg": 1.7714985609054565, - "lr": 9.910531697341515e-06, - "objective/entropy": -78.9849853515625, - "objective/kl": 19.77252197265625, - "objective/non_score_reward": -1.9772523641586304, - "objective/rlhf_reward": -7.9090094566345215, - "objective/scores": 0.0, - "policy/approxkl_avg": 51.44104766845703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48759615421295166, - "step": 140, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998913288116455 - }, - { - "episode": 2272, - "epoch": 0.040838336269187904, - "loss/policy_avg": 0.42394816875457764, - "lr": 9.909892638036812e-06, - "objective/entropy": -52.96235656738281, - "objective/kl": 23.680797576904297, - "objective/non_score_reward": -2.368079662322998, - "objective/rlhf_reward": -7.647490258487771, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 46.429683685302734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7907830476760864, - "step": 141, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9969936609268188 - }, - { - "episode": 2288, - "epoch": 0.04112593018657655, - "loss/policy_avg": 0.42378103733062744, - "lr": 9.909253578732107e-06, - "objective/entropy": -55.83934783935547, - "objective/kl": 19.108665466308594, - "objective/non_score_reward": -1.910866618156433, - "objective/rlhf_reward": -4.7197478159677715, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 87.44278717041016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7084353566169739, - "step": 142, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.99775230884552 - }, - { - "episode": 2304, - "epoch": 0.0414135241039652, - "loss/policy_avg": 0.8798868656158447, - "lr": 9.908614519427404e-06, - "objective/entropy": 397.6534729003906, - "objective/kl": 35.663856506347656, - "objective/non_score_reward": -3.5663862228393555, - "objective/rlhf_reward": -12.318133900837836, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 403.22802734375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9414247274398804, - "step": 143, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968068599700928 - }, - { - "episode": 2320, - "epoch": 0.04170111802135385, - "loss/policy_avg": 0.8013919591903687, - "lr": 9.9079754601227e-06, - "objective/entropy": 92.8883285522461, - "objective/kl": 23.82331657409668, - "objective/non_score_reward": -2.382331609725952, - "objective/rlhf_reward": -8.170076781247538, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 73.349609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6135187149047852, - "step": 144, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0004608631134033 - }, - { - "episode": 2336, - "epoch": 0.0419887119387425, - "loss/policy_avg": 2.381652593612671, - "lr": 9.907336400817996e-06, - "objective/entropy": -49.03077697753906, - "objective/kl": 20.151020050048828, - "objective/non_score_reward": -2.0151021480560303, - "objective/rlhf_reward": -6.5041489293247015, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 37.75957489013672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7855414152145386, - "step": 145, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0004804134368896 - }, - { - "episode": 2352, - "epoch": 0.042276305856131145, - "loss/policy_avg": 10.612133026123047, - "lr": 9.906697341513293e-06, - "objective/entropy": 31.498912811279297, - "objective/kl": 22.87250518798828, - "objective/non_score_reward": -2.2872507572174072, - "objective/rlhf_reward": -7.487143372715103, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 67.64070892333984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6964052319526672, - "step": 146, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002523422241211 - }, - { - "episode": 2368, - "epoch": 0.04256389977351979, - "loss/policy_avg": 1.218620777130127, - "lr": 9.90605828220859e-06, - "objective/entropy": -186.7276153564453, - "objective/kl": 17.879009246826172, - "objective/non_score_reward": -1.7879009246826172, - "objective/rlhf_reward": -2.751603758335113, - "objective/scores": 1.1, - "policy/approxkl_avg": 62.54943084716797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9248688220977783, - "step": 147, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999516248703003 - }, - { - "episode": 2384, - "epoch": 0.04285149369090844, - "loss/policy_avg": 0.08095124363899231, - "lr": 9.905419222903886e-06, - "objective/entropy": -79.130859375, - "objective/kl": 17.005231857299805, - "objective/non_score_reward": -1.7005233764648438, - "objective/rlhf_reward": -5.378261108596889, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 14.491683959960938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5301992893218994, - "step": 148, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981908798217773 - }, - { - "episode": 2400, - "epoch": 0.043139087608297086, - "loss/policy_avg": 0.9632350206375122, - "lr": 9.904780163599183e-06, - "objective/entropy": 78.35763549804688, - "objective/kl": 20.501068115234375, - "objective/non_score_reward": -2.050107002258301, - "objective/rlhf_reward": -3.8004277706146237, - "objective/scores": 1.1, - "policy/approxkl_avg": 42.78419494628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6539649963378906, - "step": 149, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986085891723633 - }, - { - "episode": 2416, - "epoch": 0.04342668152568573, - "loss/policy_avg": 0.2573207914829254, - "lr": 9.904141104294478e-06, - "objective/entropy": 139.81314086914062, - "objective/kl": 15.294164657592773, - "objective/non_score_reward": -1.5294163227081299, - "objective/rlhf_reward": -4.739063062754971, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 44.518585205078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9615079760551453, - "step": 150, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004477500915527 - }, - { - "episode": 2432, - "epoch": 0.04371427544307438, - "loss/policy_avg": 0.1424349546432495, - "lr": 9.903502044989775e-06, - "objective/entropy": -200.99420166015625, - "objective/kl": 23.035701751708984, - "objective/non_score_reward": -2.30357027053833, - "objective/rlhf_reward": -7.835678675261837, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 147.28530883789062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5872718691825867, - "step": 151, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000128746032715 - }, - { - "episode": 2448, - "epoch": 0.044001869360463026, - "loss/policy_avg": 0.2018139809370041, - "lr": 9.902862985685072e-06, - "objective/entropy": 265.19927978515625, - "objective/kl": 22.482166290283203, - "objective/non_score_reward": -2.2482166290283203, - "objective/rlhf_reward": -7.65123014739099, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 45.75090408325195, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6457411050796509, - "step": 152, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9972870349884033 - }, - { - "episode": 2464, - "epoch": 0.04428946327785167, - "loss/policy_avg": 1.6731089353561401, - "lr": 9.902223926380369e-06, - "objective/entropy": 88.4649658203125, - "objective/kl": 23.56682586669922, - "objective/non_score_reward": -2.356682300567627, - "objective/rlhf_reward": -7.479318688588078, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 46.715396881103516, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5394150614738464, - "step": 153, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9959262609481812 - }, - { - "episode": 2480, - "epoch": 0.04457705719524032, - "loss/policy_avg": 0.28844547271728516, - "lr": 9.901584867075666e-06, - "objective/entropy": 153.28744506835938, - "objective/kl": 25.101318359375, - "objective/non_score_reward": -2.5101318359375, - "objective/rlhf_reward": -8.484268038478449, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 65.64702606201172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48713332414627075, - "step": 154, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9966403245925903 - }, - { - "episode": 2496, - "epoch": 0.04486465111262897, - "loss/policy_avg": 0.5096696615219116, - "lr": 9.900945807770961e-06, - "objective/entropy": 155.93728637695312, - "objective/kl": 19.505783081054688, - "objective/non_score_reward": -1.9505780935287476, - "objective/rlhf_reward": -6.3517142339662165, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 54.11931610107422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5123778581619263, - "step": 155, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984791278839111 - }, - { - "episode": 2512, - "epoch": 0.045152245030017614, - "loss/policy_avg": 0.06939780712127686, - "lr": 9.900306748466258e-06, - "objective/entropy": 18.287960052490234, - "objective/kl": 17.8365421295166, - "objective/non_score_reward": -1.7836542129516602, - "objective/rlhf_reward": -5.792981436758667, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 29.395599365234375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5711311101913452, - "step": 156, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997730255126953 - }, - { - "episode": 2528, - "epoch": 0.04543983894740626, - "loss/policy_avg": 0.6630462408065796, - "lr": 9.899667689161555e-06, - "objective/entropy": 63.43556213378906, - "objective/kl": 22.572792053222656, - "objective/non_score_reward": -2.2572789192199707, - "objective/rlhf_reward": -7.687480738669066, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 46.135581970214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5037804841995239, - "step": 157, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9967372417449951 - }, - { - "episode": 2544, - "epoch": 0.04572743286479491, - "loss/policy_avg": 0.9545019865036011, - "lr": 9.899028629856852e-06, - "objective/entropy": 78.440673828125, - "objective/kl": 24.54024314880371, - "objective/non_score_reward": -2.454024314880371, - "objective/rlhf_reward": -8.43749509104858, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 15.233887672424316, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5358680486679077, - "step": 158, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0002801418304443 - }, - { - "episode": 2560, - "epoch": 0.046015026782183555, - "loss/policy_avg": 0.7431780099868774, - "lr": 9.898389570552149e-06, - "objective/entropy": 140.66659545898438, - "objective/kl": 15.743444442749023, - "objective/non_score_reward": -1.574344515800476, - "objective/rlhf_reward": -4.938128196929378, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 3.185384511947632, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.35770517587661743, - "step": 159, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9985697269439697 - }, - { - "episode": 2576, - "epoch": 0.0463026206995722, - "loss/policy_avg": 0.5926499366760254, - "lr": 9.897750511247446e-06, - "objective/entropy": 142.6728973388672, - "objective/kl": 26.324443817138672, - "objective/non_score_reward": -2.6324446201324463, - "objective/rlhf_reward": -10.529778361320496, - "objective/scores": 0.0, - "policy/approxkl_avg": 163.95303344726562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6692796945571899, - "step": 160, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0011281967163086 - }, - { - "episode": 2592, - "epoch": 0.04659021461696085, - "loss/policy_avg": 0.5165088772773743, - "lr": 9.89711145194274e-06, - "objective/entropy": -49.34636688232422, - "objective/kl": 23.45389175415039, - "objective/non_score_reward": -2.3453893661499023, - "objective/rlhf_reward": -7.865785443576511, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 21.34342384338379, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6546250581741333, - "step": 161, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961342811584473 - }, - { - "episode": 2608, - "epoch": 0.046877808534349495, - "loss/policy_avg": 0.06444612145423889, - "lr": 9.896472392638038e-06, - "objective/entropy": 3.992961883544922, - "objective/kl": 22.590843200683594, - "objective/non_score_reward": -2.259084463119507, - "objective/rlhf_reward": -7.636337852478027, - "objective/scores": 0.35, - "policy/approxkl_avg": 78.1329345703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6085139513015747, - "step": 162, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9983890056610107 - }, - { - "episode": 2624, - "epoch": 0.04716540245173815, - "loss/policy_avg": -0.5813350081443787, - "lr": 9.895833333333334e-06, - "objective/entropy": 129.43629455566406, - "objective/kl": 18.305377960205078, - "objective/non_score_reward": -1.8305377960205078, - "objective/rlhf_reward": -5.841198506768107, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 35.31721496582031, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.5479708313941956, - "step": 163, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.006561756134033 - }, - { - "episode": 2640, - "epoch": 0.047452996369126796, - "loss/policy_avg": 0.771713376045227, - "lr": 9.895194274028631e-06, - "objective/entropy": -7.5702056884765625, - "objective/kl": 26.34789276123047, - "objective/non_score_reward": -2.634789228439331, - "objective/rlhf_reward": -9.023385488780672, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 189.376708984375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7600178122520447, - "step": 164, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9973194599151611 - }, - { - "episode": 2656, - "epoch": 0.04774059028651544, - "loss/policy_avg": 0.3846855163574219, - "lr": 9.894555214723928e-06, - "objective/entropy": 208.37188720703125, - "objective/kl": 24.167360305786133, - "objective/non_score_reward": -2.416736125946045, - "objective/rlhf_reward": -8.307694399093075, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 50.201210021972656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8115335702896118, - "step": 165, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998011589050293 - }, - { - "episode": 2672, - "epoch": 0.04802818420390409, - "loss/policy_avg": 0.1327829658985138, - "lr": 9.893916155419225e-06, - "objective/entropy": 137.91152954101562, - "objective/kl": 16.795686721801758, - "objective/non_score_reward": -1.6795687675476074, - "objective/rlhf_reward": -5.202503287585911, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 83.59907531738281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6950229406356812, - "step": 166, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975529909133911 - }, - { - "episode": 2688, - "epoch": 0.048315778121292736, - "loss/policy_avg": 0.1259589046239853, - "lr": 9.89327709611452e-06, - "objective/entropy": -62.611698150634766, - "objective/kl": 23.73765754699707, - "objective/non_score_reward": -2.373765707015991, - "objective/rlhf_reward": -8.095063066482544, - "objective/scores": 0.35, - "policy/approxkl_avg": 30.360702514648438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4680527448654175, - "step": 167, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0010440349578857 - }, - { - "episode": 2704, - "epoch": 0.04860337203868138, - "loss/policy_avg": 1.4625483751296997, - "lr": 9.892638036809815e-06, - "objective/entropy": 191.9486083984375, - "objective/kl": 29.09479331970215, - "objective/non_score_reward": -2.9094796180725098, - "objective/rlhf_reward": -9.976058488309967, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 96.90817260742188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4573523998260498, - "step": 168, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999511957168579 - }, - { - "episode": 2720, - "epoch": 0.04889096595607003, - "loss/policy_avg": 0.39898326992988586, - "lr": 9.891998977505112e-06, - "objective/entropy": 76.32606506347656, - "objective/kl": 25.170101165771484, - "objective/non_score_reward": -2.51701021194458, - "objective/rlhf_reward": -8.58708846848762, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 109.47344970703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7625200748443604, - "step": 169, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9968773126602173 - }, - { - "episode": 2736, - "epoch": 0.04917855987345868, - "loss/policy_avg": 1.2032477855682373, - "lr": 9.89135991820041e-06, - "objective/entropy": -85.850830078125, - "objective/kl": 23.681184768676758, - "objective/non_score_reward": -2.3681185245513916, - "objective/rlhf_reward": -7.916214673724726, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.655494689941406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7028899788856506, - "step": 170, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0086746215820312 - }, - { - "episode": 2752, - "epoch": 0.049466153790847324, - "loss/policy_avg": 0.2989116311073303, - "lr": 9.890720858895706e-06, - "objective/entropy": -244.5385284423828, - "objective/kl": 25.30226707458496, - "objective/non_score_reward": -2.530226707458496, - "objective/rlhf_reward": -8.74230466136108, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 80.04335021972656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7547532320022583, - "step": 171, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984104633331299 - }, - { - "episode": 2768, - "epoch": 0.04975374770823597, - "loss/policy_avg": 0.15676680207252502, - "lr": 9.890081799591003e-06, - "objective/entropy": -46.70368576049805, - "objective/kl": 23.916561126708984, - "objective/non_score_reward": -2.3916563987731934, - "objective/rlhf_reward": -8.142793853481379, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 98.60531616210938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4980742335319519, - "step": 172, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000648021697998 - }, - { - "episode": 2784, - "epoch": 0.05004134162562462, - "loss/policy_avg": 2.26867413520813, - "lr": 9.8894427402863e-06, - "objective/entropy": -103.40653991699219, - "objective/kl": 13.933716773986816, - "objective/non_score_reward": -1.39337158203125, - "objective/rlhf_reward": -3.969366837207394, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 48.939292907714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5038570165634155, - "step": 173, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992566108703613 - }, - { - "episode": 2800, - "epoch": 0.050328935543013265, - "loss/policy_avg": 2.38254714012146, - "lr": 9.888803680981595e-06, - "objective/entropy": 186.43983459472656, - "objective/kl": 22.118637084960938, - "objective/non_score_reward": -2.2118635177612305, - "objective/rlhf_reward": -7.5219414568244645, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 46.10081481933594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5202944278717041, - "step": 174, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9985566139221191 - }, - { - "episode": 2816, - "epoch": 0.05061652946040191, - "loss/policy_avg": 0.7353519201278687, - "lr": 9.888164621676892e-06, - "objective/entropy": 40.153011322021484, - "objective/kl": 24.53411865234375, - "objective/non_score_reward": -2.453411817550659, - "objective/rlhf_reward": -8.413647150993347, - "objective/scores": 0.35, - "policy/approxkl_avg": 73.58890533447266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5293036699295044, - "step": 175, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9974313974380493 - }, - { - "episode": 2832, - "epoch": 0.05090412337779056, - "loss/policy_avg": 0.09056591242551804, - "lr": 9.887525562372189e-06, - "objective/entropy": 271.1833190917969, - "objective/kl": 18.394960403442383, - "objective/non_score_reward": -1.83949613571167, - "objective/rlhf_reward": -5.8770319251374, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 106.73664855957031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.683214545249939, - "step": 176, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000105619430542 - }, - { - "episode": 2848, - "epoch": 0.051191717295179205, - "loss/policy_avg": 1.4375742673873901, - "lr": 9.886886503067486e-06, - "objective/entropy": 21.05878448486328, - "objective/kl": 19.652141571044922, - "objective/non_score_reward": -1.9652140140533447, - "objective/rlhf_reward": -5.913445363717015, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 19.185291290283203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5441437363624573, - "step": 177, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982903003692627 - }, - { - "episode": 2864, - "epoch": 0.05147931121256785, - "loss/policy_avg": 0.9233704209327698, - "lr": 9.886247443762783e-06, - "objective/entropy": 79.99378204345703, - "objective/kl": 30.930316925048828, - "objective/non_score_reward": -3.093031644821167, - "objective/rlhf_reward": -10.42471582718366, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 71.49031829833984, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4192318320274353, - "step": 178, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994785785675049 - }, - { - "episode": 2880, - "epoch": 0.0517669051299565, - "loss/policy_avg": 0.8243035078048706, - "lr": 9.88560838445808e-06, - "objective/entropy": -79.08426666259766, - "objective/kl": 21.69200897216797, - "objective/non_score_reward": -2.169200897216797, - "objective/rlhf_reward": -7.072683844629841, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 111.395263671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5561103820800781, - "step": 179, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990267753601074 - }, - { - "episode": 2896, - "epoch": 0.052054499047345146, - "loss/policy_avg": 0.20029950141906738, - "lr": 9.884969325153375e-06, - "objective/entropy": 29.943138122558594, - "objective/kl": 23.813167572021484, - "objective/non_score_reward": -2.381316661834717, - "objective/rlhf_reward": -9.525267362594604, - "objective/scores": 0.0, - "policy/approxkl_avg": 42.68115997314453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6918896436691284, - "step": 180, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9963979721069336 - }, - { - "episode": 2912, - "epoch": 0.05234209296473379, - "loss/policy_avg": 0.014301195740699768, - "lr": 9.884330265848671e-06, - "objective/entropy": 164.71829223632812, - "objective/kl": 24.91703224182129, - "objective/non_score_reward": -2.4917030334472656, - "objective/rlhf_reward": -7.84410613991407, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 178.1289520263672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.43327000737190247, - "step": 181, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9986697435379028 - }, - { - "episode": 2928, - "epoch": 0.052629686882122446, - "loss/policy_avg": 0.9095668792724609, - "lr": 9.883691206543968e-06, - "objective/entropy": 107.3670883178711, - "objective/kl": 29.20984649658203, - "objective/non_score_reward": -2.920984983444214, - "objective/rlhf_reward": -10.079820070330221, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 77.46437072753906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6489860415458679, - "step": 182, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9962208271026611 - }, - { - "episode": 2944, - "epoch": 0.05291728079951109, - "loss/policy_avg": 0.2735748589038849, - "lr": 9.883052147239265e-06, - "objective/entropy": 22.112346649169922, - "objective/kl": 20.8614444732666, - "objective/non_score_reward": -2.08614444732666, - "objective/rlhf_reward": -6.682718520582306, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 121.54977416992188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5333126187324524, - "step": 183, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991374015808105 - }, - { - "episode": 2960, - "epoch": 0.05320487471689974, - "loss/policy_avg": 0.32464680075645447, - "lr": 9.882413087934562e-06, - "objective/entropy": 188.4505615234375, - "objective/kl": 20.14493179321289, - "objective/non_score_reward": -2.014493227005005, - "objective/rlhf_reward": -6.542201125415501, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 49.658721923828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7087539434432983, - "step": 184, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0017189979553223 - }, - { - "episode": 2976, - "epoch": 0.05349246863428839, - "loss/policy_avg": 0.6234300136566162, - "lr": 9.881774028629857e-06, - "objective/entropy": -28.301137924194336, - "objective/kl": 22.407699584960938, - "objective/non_score_reward": -2.240769863128662, - "objective/rlhf_reward": -7.406820564475611, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.418500900268555, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48261120915412903, - "step": 185, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975056648254395 - }, - { - "episode": 2992, - "epoch": 0.053780062551677034, - "loss/policy_avg": 0.1302778124809265, - "lr": 9.881134969325154e-06, - "objective/entropy": 163.4349365234375, - "objective/kl": 31.70010757446289, - "objective/non_score_reward": -3.170010566711426, - "objective/rlhf_reward": -10.280042505264282, - "objective/scores": 0.6, - "policy/approxkl_avg": 74.95491790771484, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.584516167640686, - "step": 186, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998785138130188 - }, - { - "episode": 3008, - "epoch": 0.05406765646906568, - "loss/policy_avg": 0.22809255123138428, - "lr": 9.880495910020451e-06, - "objective/entropy": 209.2935333251953, - "objective/kl": 20.395681381225586, - "objective/non_score_reward": -2.0395681858062744, - "objective/rlhf_reward": -6.816636970549254, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 63.878265380859375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6551488637924194, - "step": 187, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995040893554688 - }, - { - "episode": 3024, - "epoch": 0.05435525038645433, - "loss/policy_avg": 0.03126790001988411, - "lr": 9.879856850715748e-06, - "objective/entropy": 4.618324279785156, - "objective/kl": 24.382261276245117, - "objective/non_score_reward": -2.4382262229919434, - "objective/rlhf_reward": -8.237132870944675, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 6.476547718048096, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5421229600906372, - "step": 188, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999804496765137 - }, - { - "episode": 3040, - "epoch": 0.054642844303842975, - "loss/policy_avg": 0.3424449563026428, - "lr": 9.879217791411043e-06, - "objective/entropy": -136.56385803222656, - "objective/kl": 33.151222229003906, - "objective/non_score_reward": -3.315122127532959, - "objective/rlhf_reward": -11.901238882277887, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 49.403472900390625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5513160228729248, - "step": 189, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999922752380371 - }, - { - "episode": 3056, - "epoch": 0.05493043822123162, - "loss/policy_avg": 0.8403773307800293, - "lr": 9.87857873210634e-06, - "objective/entropy": 322.4634704589844, - "objective/kl": 26.295257568359375, - "objective/non_score_reward": -2.629525661468506, - "objective/rlhf_reward": -9.192590031653566, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 115.08587646484375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7622972130775452, - "step": 190, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968273639678955 - }, - { - "episode": 3072, - "epoch": 0.05521803213862027, - "loss/policy_avg": 0.19895562529563904, - "lr": 9.877939672801637e-06, - "objective/entropy": -226.75164794921875, - "objective/kl": 16.52016830444336, - "objective/non_score_reward": -1.6520167589187622, - "objective/rlhf_reward": -4.78323804882438, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 43.135128021240234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48229825496673584, - "step": 191, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003657341003418 - }, - { - "episode": 3088, - "epoch": 0.055505626056008915, - "loss/policy_avg": 0.132611945271492, - "lr": 9.877300613496934e-06, - "objective/entropy": 138.95777893066406, - "objective/kl": 28.13532257080078, - "objective/non_score_reward": -2.8135323524475098, - "objective/rlhf_reward": -9.830297310550776, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.711568832397461, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6496654748916626, - "step": 192, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994869232177734 - }, - { - "episode": 3104, - "epoch": 0.05579321997339756, - "loss/policy_avg": 0.13647544384002686, - "lr": 9.876661554192229e-06, - "objective/entropy": 228.907958984375, - "objective/kl": 20.958343505859375, - "objective/non_score_reward": -2.095834255218506, - "objective/rlhf_reward": -6.435925911145146, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 77.2052230834961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5815694332122803, - "step": 193, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998384714126587 - }, - { - "episode": 3120, - "epoch": 0.05608081389078621, - "loss/policy_avg": 0.3623042702674866, - "lr": 9.876022494887526e-06, - "objective/entropy": 101.04753112792969, - "objective/kl": 28.680049896240234, - "objective/non_score_reward": -2.8680050373077393, - "objective/rlhf_reward": -10.021421532245025, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 129.59266662597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5827709436416626, - "step": 194, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990119934082031 - }, - { - "episode": 3136, - "epoch": 0.056368407808174856, - "loss/policy_avg": -0.07424932718276978, - "lr": 9.875383435582823e-06, - "objective/entropy": 245.4013671875, - "objective/kl": 20.346391677856445, - "objective/non_score_reward": -2.034639358520508, - "objective/rlhf_reward": -6.01585108257917, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 31.92734718322754, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6253474950790405, - "step": 195, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984948635101318 - }, - { - "episode": 3152, - "epoch": 0.0566560017255635, - "loss/policy_avg": 0.1401011198759079, - "lr": 9.87474437627812e-06, - "objective/entropy": 375.89263916015625, - "objective/kl": 21.685848236083984, - "objective/non_score_reward": -2.1685848236083984, - "objective/rlhf_reward": -6.274339175224304, - "objective/scores": 0.6, - "policy/approxkl_avg": 131.50494384765625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.9226024746894836, - "step": 196, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0007243156433105 - }, - { - "episode": 3168, - "epoch": 0.05694359564295215, - "loss/policy_avg": -0.030730588361620903, - "lr": 9.874105316973416e-06, - "objective/entropy": 140.85540771484375, - "objective/kl": 20.57616424560547, - "objective/non_score_reward": -2.0576162338256836, - "objective/rlhf_reward": -6.779867152781829, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 33.19469451904297, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7271202206611633, - "step": 197, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000385284423828 - }, - { - "episode": 3184, - "epoch": 0.057231189560340796, - "loss/policy_avg": 2.7618093490600586, - "lr": 9.873466257668712e-06, - "objective/entropy": 179.97198486328125, - "objective/kl": 28.560035705566406, - "objective/non_score_reward": -2.856003761291504, - "objective/rlhf_reward": -9.599186535152505, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 32.35374450683594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.653136134147644, - "step": 198, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997452974319458 - }, - { - "episode": 3200, - "epoch": 0.05751878347772944, - "loss/policy_avg": 0.6520799398422241, - "lr": 9.872827198364009e-06, - "objective/entropy": 110.88057708740234, - "objective/kl": 31.026592254638672, - "objective/non_score_reward": -3.1026594638824463, - "objective/rlhf_reward": -11.06900267890039, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 33.437408447265625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5983192324638367, - "step": 199, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984240531921387 - }, - { - "episode": 3216, - "epoch": 0.05780637739511809, - "loss/policy_avg": 0.19128543138504028, - "lr": 9.872188139059305e-06, - "objective/entropy": 234.22332763671875, - "objective/kl": 33.926361083984375, - "objective/non_score_reward": -3.3926358222961426, - "objective/rlhf_reward": -12.119944910617217, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 190.18153381347656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9237314462661743, - "step": 200, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989691972732544 - }, - { - "episode": 3232, - "epoch": 0.058093971312506744, - "loss/policy_avg": 0.04767340421676636, - "lr": 9.871549079754602e-06, - "objective/entropy": -7.490440368652344, - "objective/kl": 16.179231643676758, - "objective/non_score_reward": -1.6179232597351074, - "objective/rlhf_reward": -4.348986568228279, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 3.6666202545166016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5502414703369141, - "step": 201, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998970031738281 - }, - { - "episode": 3248, - "epoch": 0.05838156522989539, - "loss/policy_avg": 0.009956400841474533, - "lr": 9.8709100204499e-06, - "objective/entropy": 184.87599182128906, - "objective/kl": 30.518714904785156, - "objective/non_score_reward": -3.0518715381622314, - "objective/rlhf_reward": -10.865850737600951, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 144.38037109375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46582168340682983, - "step": 202, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0004916191101074 - }, - { - "episode": 3264, - "epoch": 0.05866915914728404, - "loss/policy_avg": 0.8650859594345093, - "lr": 9.870270961145196e-06, - "objective/entropy": 60.665279388427734, - "objective/kl": 30.722930908203125, - "objective/non_score_reward": -3.0722928047180176, - "objective/rlhf_reward": -10.80821907799995, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 197.62728881835938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.45192649960517883, - "step": 203, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9978991746902466 - }, - { - "episode": 3280, - "epoch": 0.058956753064672685, - "loss/policy_avg": 0.7753949165344238, - "lr": 9.869631901840491e-06, - "objective/entropy": 224.53439331054688, - "objective/kl": 35.50615692138672, - "objective/non_score_reward": -3.5506153106689453, - "objective/rlhf_reward": -9.80246195793152, - "objective/scores": 1.1, - "policy/approxkl_avg": 207.07131958007812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.625594973564148, - "step": 204, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.003218650817871 - }, - { - "episode": 3296, - "epoch": 0.05924434698206133, - "loss/policy_avg": 0.12218689173460007, - "lr": 9.868992842535788e-06, - "objective/entropy": 160.02056884765625, - "objective/kl": 20.542434692382812, - "objective/non_score_reward": -2.054243326187134, - "objective/rlhf_reward": -8.216973185539246, - "objective/scores": 0.0, - "policy/approxkl_avg": 27.70839500427246, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5086226463317871, - "step": 205, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990867376327515 - }, - { - "episode": 3312, - "epoch": 0.05953194089944998, - "loss/policy_avg": 0.16328184306621552, - "lr": 9.868353783231085e-06, - "objective/entropy": -178.42849731445312, - "objective/kl": 12.709222793579102, - "objective/non_score_reward": -1.270922303199768, - "objective/rlhf_reward": -0.6836892724037167, - "objective/scores": 1.1, - "policy/approxkl_avg": 175.85543823242188, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6555081605911255, - "step": 206, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003648281097412 - }, - { - "episode": 3328, - "epoch": 0.059819534816838625, - "loss/policy_avg": 0.3191947340965271, - "lr": 9.867714723926382e-06, - "objective/entropy": 113.40653991699219, - "objective/kl": 23.92019271850586, - "objective/non_score_reward": -2.392019271850586, - "objective/rlhf_reward": -8.011817782130793, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 120.01425170898438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6426678895950317, - "step": 207, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998692274093628 - }, - { - "episode": 3344, - "epoch": 0.06010712873422727, - "loss/policy_avg": 0.8106866478919983, - "lr": 9.867075664621679e-06, - "objective/entropy": -161.32217407226562, - "objective/kl": 20.696407318115234, - "objective/non_score_reward": -2.069640874862671, - "objective/rlhf_reward": -6.899961569396359, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 60.63603973388672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.424863338470459, - "step": 208, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001417636871338 - }, - { - "episode": 3360, - "epoch": 0.06039472265161592, - "loss/policy_avg": 0.5404326915740967, - "lr": 9.866436605316974e-06, - "objective/entropy": 136.03414916992188, - "objective/kl": 24.140501022338867, - "objective/non_score_reward": -2.414050340652466, - "objective/rlhf_reward": -5.256201243400573, - "objective/scores": 1.1, - "policy/approxkl_avg": 77.51190948486328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6469956040382385, - "step": 209, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992618560791016 - }, - { - "episode": 3376, - "epoch": 0.060682316569004566, - "loss/policy_avg": -0.004204496741294861, - "lr": 9.86579754601227e-06, - "objective/entropy": -205.11416625976562, - "objective/kl": 22.115215301513672, - "objective/non_score_reward": -2.211521625518799, - "objective/rlhf_reward": -7.3954881235078425, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 130.94525146484375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7604430913925171, - "step": 210, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9986162185668945 - }, - { - "episode": 3392, - "epoch": 0.06096991048639321, - "loss/policy_avg": 0.10069486498832703, - "lr": 9.865158486707568e-06, - "objective/entropy": 38.40431213378906, - "objective/kl": 21.107707977294922, - "objective/non_score_reward": -2.1107707023620605, - "objective/rlhf_reward": -6.962130549366831, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 48.98419189453125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.43749940395355225, - "step": 211, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9993665218353271 - }, - { - "episode": 3408, - "epoch": 0.06125750440378186, - "loss/policy_avg": 0.013450137339532375, - "lr": 9.864519427402863e-06, - "objective/entropy": 97.07965087890625, - "objective/kl": 26.950225830078125, - "objective/non_score_reward": -2.6950225830078125, - "objective/rlhf_reward": -8.955262298854898, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 44.33604431152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5165296792984009, - "step": 212, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978928565979004 - }, - { - "episode": 3424, - "epoch": 0.061545098321170506, - "loss/policy_avg": 0.4735873341560364, - "lr": 9.86388036809816e-06, - "objective/entropy": -1.7870521545410156, - "objective/kl": 27.062910079956055, - "objective/non_score_reward": -2.7062911987304688, - "objective/rlhf_reward": -9.465914988253992, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 72.81141662597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6805305480957031, - "step": 213, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998960018157959 - }, - { - "episode": 3440, - "epoch": 0.06183269223855915, - "loss/policy_avg": 0.09523998200893402, - "lr": 9.863241308793457e-06, - "objective/entropy": 32.18935012817383, - "objective/kl": 9.85006046295166, - "objective/non_score_reward": -0.9850060939788818, - "objective/rlhf_reward": -2.5161924554901995, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 3.0238242149353027, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3841710090637207, - "step": 214, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986772537231445 - }, - { - "episode": 3456, - "epoch": 0.0621202861559478, - "loss/policy_avg": 0.720879316329956, - "lr": 9.862602249488753e-06, - "objective/entropy": 276.2146301269531, - "objective/kl": 28.97698974609375, - "objective/non_score_reward": -2.8976993560791016, - "objective/rlhf_reward": -9.928937201917755, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 191.53884887695312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7353510856628418, - "step": 215, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9999027252197266 - }, - { - "episode": 3472, - "epoch": 0.06240788007333645, - "loss/policy_avg": 0.5507330894470215, - "lr": 9.86196319018405e-06, - "objective/entropy": 250.835693359375, - "objective/kl": 29.98652458190918, - "objective/non_score_reward": -2.998652219772339, - "objective/rlhf_reward": -10.332749491155731, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 85.02761840820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6071260571479797, - "step": 216, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968819618225098 - }, - { - "episode": 3488, - "epoch": 0.0626954739907251, - "loss/policy_avg": 0.9385891556739807, - "lr": 9.861324130879346e-06, - "objective/entropy": 82.53084564208984, - "objective/kl": 26.54790687561035, - "objective/non_score_reward": -2.6547906398773193, - "objective/rlhf_reward": -8.496456088797125, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.03960609436035, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4867916703224182, - "step": 217, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992834329605103 - }, - { - "episode": 3504, - "epoch": 0.06298306790811374, - "loss/policy_avg": 0.3534790575504303, - "lr": 9.860685071574642e-06, - "objective/entropy": 230.29193115234375, - "objective/kl": 21.73017120361328, - "objective/non_score_reward": -2.1730172634124756, - "objective/rlhf_reward": -6.867240424427102, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 81.97232055664062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.558746337890625, - "step": 218, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9974663257598877 - }, - { - "episode": 3520, - "epoch": 0.0632706618255024, - "loss/policy_avg": -0.15977555513381958, - "lr": 9.86004601226994e-06, - "objective/entropy": 113.71033477783203, - "objective/kl": 17.67473030090332, - "objective/non_score_reward": -1.7674732208251953, - "objective/rlhf_reward": -4.947186651007209, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 94.58512115478516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5372532606124878, - "step": 219, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001951217651367 - }, - { - "episode": 3536, - "epoch": 0.06355825574289103, - "loss/policy_avg": 0.9559342265129089, - "lr": 9.859406952965236e-06, - "objective/entropy": 173.58860778808594, - "objective/kl": 33.72608947753906, - "objective/non_score_reward": -3.3726086616516113, - "objective/rlhf_reward": -11.367727937475713, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 6.491452217102051, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6636508703231812, - "step": 220, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9996795654296875 - }, - { - "episode": 3552, - "epoch": 0.06384584966027969, - "loss/policy_avg": -0.854604184627533, - "lr": 9.858767893660533e-06, - "objective/entropy": -67.233154296875, - "objective/kl": 13.420427322387695, - "objective/non_score_reward": -1.3420426845550537, - "objective/rlhf_reward": -2.4444515898239345, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 72.4083251953125, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.8086908459663391, - "step": 221, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0127267837524414 - }, - { - "episode": 3568, - "epoch": 0.06413344357766833, - "loss/policy_avg": 0.5410902500152588, - "lr": 9.858128834355828e-06, - "objective/entropy": 175.63470458984375, - "objective/kl": 35.907081604003906, - "objective/non_score_reward": -3.5907082557678223, - "objective/rlhf_reward": -13.037319932013673, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 99.65482330322266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6278276443481445, - "step": 222, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999873399734497 - }, - { - "episode": 3584, - "epoch": 0.06442103749505698, - "loss/policy_avg": 0.3871188163757324, - "lr": 9.857489775051125e-06, - "objective/entropy": -161.75840759277344, - "objective/kl": 18.288314819335938, - "objective/non_score_reward": -1.8288315534591675, - "objective/rlhf_reward": -5.799554431232151, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 110.67633056640625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6733036041259766, - "step": 223, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991552829742432 - }, - { - "episode": 3600, - "epoch": 0.06470863141244562, - "loss/policy_avg": -0.573300838470459, - "lr": 9.856850715746422e-06, - "objective/entropy": 6.650520324707031, - "objective/kl": 26.58426284790039, - "objective/non_score_reward": -2.658426284790039, - "objective/rlhf_reward": -7.709986720920774, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 44.56170654296875, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.38651180267333984, - "step": 224, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0037808418273926 - }, - { - "episode": 3616, - "epoch": 0.06499622532983428, - "loss/policy_avg": 0.5340808629989624, - "lr": 9.856211656441719e-06, - "objective/entropy": 59.36520004272461, - "objective/kl": 28.841266632080078, - "objective/non_score_reward": -2.884126901626587, - "objective/rlhf_reward": -9.874647860944854, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 53.19476318359375, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6743514537811279, - "step": 225, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9957829713821411 - }, - { - "episode": 3632, - "epoch": 0.06528381924722292, - "loss/policy_avg": 0.5914766192436218, - "lr": 9.855572597137016e-06, - "objective/entropy": 228.81517028808594, - "objective/kl": 30.393442153930664, - "objective/non_score_reward": -3.039344310760498, - "objective/rlhf_reward": -10.209965775685246, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 85.6346435546875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5778177976608276, - "step": 226, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9993802309036255 - }, - { - "episode": 3648, - "epoch": 0.06557141316461157, - "loss/policy_avg": -0.05053609609603882, - "lr": 9.854933537832313e-06, - "objective/entropy": 13.725364685058594, - "objective/kl": 25.695791244506836, - "objective/non_score_reward": -2.5695791244506836, - "objective/rlhf_reward": -8.330905745701726, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 135.800048828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3210442066192627, - "step": 227, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997122287750244 - }, - { - "episode": 3664, - "epoch": 0.06585900708200021, - "loss/policy_avg": 0.4539129137992859, - "lr": 9.854294478527608e-06, - "objective/entropy": 93.42439270019531, - "objective/kl": 30.396175384521484, - "objective/non_score_reward": -3.0396177768707275, - "objective/rlhf_reward": -10.779868462172846, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 55.461158752441406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6131182909011841, - "step": 228, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9976425170898438 - }, - { - "episode": 3680, - "epoch": 0.06614660099938886, - "loss/policy_avg": 0.228049173951149, - "lr": 9.853655419222905e-06, - "objective/entropy": -28.055843353271484, - "objective/kl": 23.269084930419922, - "objective/non_score_reward": -2.326908588409424, - "objective/rlhf_reward": -6.383914981723997, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 143.5833740234375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5752028822898865, - "step": 229, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000626802444458 - }, - { - "episode": 3696, - "epoch": 0.06643419491677752, - "loss/policy_avg": 0.10666107386350632, - "lr": 9.853016359918202e-06, - "objective/entropy": 74.64518737792969, - "objective/kl": 32.4399528503418, - "objective/non_score_reward": -3.243995189666748, - "objective/rlhf_reward": -11.525382618518218, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 181.31935119628906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5801441669464111, - "step": 230, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995744228363037 - }, - { - "episode": 3712, - "epoch": 0.06672178883416616, - "loss/policy_avg": 2.4466023445129395, - "lr": 9.852377300613498e-06, - "objective/entropy": 244.4732666015625, - "objective/kl": 27.413360595703125, - "objective/non_score_reward": -2.7413363456726074, - "objective/rlhf_reward": -9.14051639583976, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 85.86346435546875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8622180223464966, - "step": 231, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9987142086029053 - }, - { - "episode": 3728, - "epoch": 0.06700938275155481, - "loss/policy_avg": 0.8113258481025696, - "lr": 9.851738241308795e-06, - "objective/entropy": 56.00733947753906, - "objective/kl": 21.946327209472656, - "objective/non_score_reward": -2.1946325302124023, - "objective/rlhf_reward": -7.419280850623531, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 29.368534088134766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45428696274757385, - "step": 232, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980647563934326 - }, - { - "episode": 3744, - "epoch": 0.06729697666894345, - "loss/policy_avg": 0.2869613766670227, - "lr": 9.85109918200409e-06, - "objective/entropy": 128.71649169921875, - "objective/kl": 21.821929931640625, - "objective/non_score_reward": -2.182192802429199, - "objective/rlhf_reward": -7.278173069568023, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 72.65187072753906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8492765426635742, - "step": 233, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982428550720215 - }, - { - "episode": 3760, - "epoch": 0.0675845705863321, - "loss/policy_avg": 1.1545510292053223, - "lr": 9.850460122699387e-06, - "objective/entropy": -46.38230895996094, - "objective/kl": 28.68572235107422, - "objective/non_score_reward": -2.868572235107422, - "objective/rlhf_reward": -10.050457079609004, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 29.78200912475586, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.661322295665741, - "step": 234, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991018772125244 - }, - { - "episode": 3776, - "epoch": 0.06787216450372074, - "loss/policy_avg": 0.7958990335464478, - "lr": 9.849821063394683e-06, - "objective/entropy": 157.34841918945312, - "objective/kl": 28.915939331054688, - "objective/non_score_reward": -2.8915936946868896, - "objective/rlhf_reward": -10.240862164527101, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 46.19620895385742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6968529224395752, - "step": 235, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994633197784424 - }, - { - "episode": 3792, - "epoch": 0.0681597584211094, - "loss/policy_avg": 0.6319503784179688, - "lr": 9.84918200408998e-06, - "objective/entropy": 356.89532470703125, - "objective/kl": 28.920034408569336, - "objective/non_score_reward": -2.8920035362243652, - "objective/rlhf_reward": -10.144182522495356, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 15.02867317199707, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.926424503326416, - "step": 236, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9992319345474243 - }, - { - "episode": 3808, - "epoch": 0.06844735233849804, - "loss/policy_avg": 0.29689115285873413, - "lr": 9.848542944785276e-06, - "objective/entropy": -114.8179931640625, - "objective/kl": 22.912490844726562, - "objective/non_score_reward": -2.2912492752075195, - "objective/rlhf_reward": -7.649225318225559, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.519531726837158, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4792546033859253, - "step": 237, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9990766048431396 - }, - { - "episode": 3824, - "epoch": 0.06873494625588669, - "loss/policy_avg": 0.6142581701278687, - "lr": 9.847903885480573e-06, - "objective/entropy": 42.130271911621094, - "objective/kl": 30.74860382080078, - "objective/non_score_reward": -3.0748605728149414, - "objective/rlhf_reward": -10.920839407531124, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 37.97405242919922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4455175995826721, - "step": 238, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9975383281707764 - }, - { - "episode": 3840, - "epoch": 0.06902254017327533, - "loss/policy_avg": 0.03958883881568909, - "lr": 9.84726482617587e-06, - "objective/entropy": 148.7663116455078, - "objective/kl": 24.86724853515625, - "objective/non_score_reward": -2.486724615097046, - "objective/rlhf_reward": -8.56829617270599, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 20.696613311767578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7744324207305908, - "step": 239, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0034701824188232 - }, - { - "episode": 3856, - "epoch": 0.06931013409066399, - "loss/policy_avg": -0.12924179434776306, - "lr": 9.846625766871167e-06, - "objective/entropy": 13.191347122192383, - "objective/kl": 36.86333465576172, - "objective/non_score_reward": -3.686333179473877, - "objective/rlhf_reward": -11.821614180446836, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 93.72460174560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7017860412597656, - "step": 240, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.996403455734253 - }, - { - "episode": 3872, - "epoch": 0.06959772800805263, - "loss/policy_avg": 0.6671891212463379, - "lr": 9.845986707566462e-06, - "objective/entropy": 144.81239318847656, - "objective/kl": 25.728496551513672, - "objective/non_score_reward": -2.572849750518799, - "objective/rlhf_reward": -8.168692888990913, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.799148559570312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5449861288070679, - "step": 241, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000767707824707 - }, - { - "episode": 3888, - "epoch": 0.06988532192544128, - "loss/policy_avg": 1.4478445053100586, - "lr": 9.845347648261759e-06, - "objective/entropy": -13.714214324951172, - "objective/kl": 31.57904052734375, - "objective/non_score_reward": -3.1579039096832275, - "objective/rlhf_reward": -11.207783777912226, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 7.07413387298584, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5770883560180664, - "step": 242, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997456669807434 - }, - { - "episode": 3904, - "epoch": 0.07017291584282992, - "loss/policy_avg": -0.1629352867603302, - "lr": 9.844708588957056e-06, - "objective/entropy": 150.56808471679688, - "objective/kl": 22.077739715576172, - "objective/non_score_reward": -2.2077741622924805, - "objective/rlhf_reward": -8.831096112728119, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.039865970611572, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4642740786075592, - "step": 243, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.013947010040283 - }, - { - "episode": 3920, - "epoch": 0.07046050976021857, - "loss/policy_avg": 4.2705912590026855, - "lr": 9.844069529652353e-06, - "objective/entropy": -73.61671447753906, - "objective/kl": 27.2436580657959, - "objective/non_score_reward": -2.724365711212158, - "objective/rlhf_reward": -9.072634573253701, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.7233123779296875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5502868294715881, - "step": 244, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990230798721313 - }, - { - "episode": 3936, - "epoch": 0.07074810367760721, - "loss/policy_avg": 0.09502686560153961, - "lr": 9.84343047034765e-06, - "objective/entropy": 38.153350830078125, - "objective/kl": 25.953601837158203, - "objective/non_score_reward": -2.595360040664673, - "objective/rlhf_reward": -8.434029231743748, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 183.2377471923828, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8148726224899292, - "step": 245, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989858865737915 - }, - { - "episode": 3952, - "epoch": 0.07103569759499587, - "loss/policy_avg": 0.36105144023895264, - "lr": 9.842791411042945e-06, - "objective/entropy": 46.69014358520508, - "objective/kl": 24.270606994628906, - "objective/non_score_reward": -2.427060842514038, - "objective/rlhf_reward": -8.329641216484408, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 23.915287017822266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4709934592247009, - "step": 246, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001595973968506 - }, - { - "episode": 3968, - "epoch": 0.0713232915123845, - "loss/policy_avg": 0.3951423168182373, - "lr": 9.842152351738242e-06, - "objective/entropy": 0.16453170776367188, - "objective/kl": 27.542736053466797, - "objective/non_score_reward": -2.7542738914489746, - "objective/rlhf_reward": -11.017095446586609, - "objective/scores": 0.0, - "policy/approxkl_avg": 11.038375854492188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6403580904006958, - "step": 247, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978313446044922 - }, - { - "episode": 3984, - "epoch": 0.07161088542977316, - "loss/policy_avg": 0.2933734655380249, - "lr": 9.841513292433539e-06, - "objective/entropy": -41.10125732421875, - "objective/kl": 25.373741149902344, - "objective/non_score_reward": -2.5373740196228027, - "objective/rlhf_reward": -8.770894267646176, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.429267883300781, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6396682262420654, - "step": 248, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0005202293395996 - }, - { - "episode": 4000, - "epoch": 0.07189847934716181, - "loss/policy_avg": 0.24670132994651794, - "lr": 9.840874233128836e-06, - "objective/entropy": -35.8713264465332, - "objective/kl": 30.457420349121094, - "objective/non_score_reward": -3.0457420349121094, - "objective/rlhf_reward": -10.449634091059366, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 79.78580474853516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41042375564575195, - "step": 249, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9961113929748535 - }, - { - "episode": 4016, - "epoch": 0.07218607326455045, - "loss/policy_avg": 0.017466381192207336, - "lr": 9.840235173824132e-06, - "objective/entropy": 87.24893188476562, - "objective/kl": 17.873748779296875, - "objective/non_score_reward": -1.7873749732971191, - "objective/rlhf_reward": -5.026793541685615, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 83.72406005859375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7392317056655884, - "step": 250, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985733032226562 - }, - { - "episode": 4032, - "epoch": 0.07247366718193911, - "loss/policy_avg": 0.209593266248703, - "lr": 9.83959611451943e-06, - "objective/entropy": -10.21453857421875, - "objective/kl": 26.26023292541504, - "objective/non_score_reward": -2.626023292541504, - "objective/rlhf_reward": -9.104092931747438, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.64996337890625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5523943901062012, - "step": 251, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995850324630737 - }, - { - "episode": 4048, - "epoch": 0.07276126109932775, - "loss/policy_avg": 0.5933290719985962, - "lr": 9.838957055214724e-06, - "objective/entropy": -18.139259338378906, - "objective/kl": 29.199474334716797, - "objective/non_score_reward": -2.9199471473693848, - "objective/rlhf_reward": -10.27978894710541, - "objective/scores": 0.35, - "policy/approxkl_avg": 50.652503967285156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5432610511779785, - "step": 252, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9982495307922363 - }, - { - "episode": 4064, - "epoch": 0.0730488550167164, - "loss/policy_avg": 1.320284366607666, - "lr": 9.838317995910021e-06, - "objective/entropy": -10.506271362304688, - "objective/kl": 28.47583770751953, - "objective/non_score_reward": -2.847583532333374, - "objective/rlhf_reward": -9.44292337723249, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 83.18882751464844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5443971157073975, - "step": 253, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997657299041748 - }, - { - "episode": 4080, - "epoch": 0.07333644893410504, - "loss/policy_avg": -0.02555149793624878, - "lr": 9.837678936605318e-06, - "objective/entropy": -81.56509399414062, - "objective/kl": 15.26602840423584, - "objective/non_score_reward": -1.526602864265442, - "objective/rlhf_reward": -4.281582649025034, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 6.358033657073975, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6404141187667847, - "step": 254, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0027146339416504 - }, - { - "episode": 4096, - "epoch": 0.0736240428514937, - "loss/policy_avg": 0.4154921770095825, - "lr": 9.837039877300615e-06, - "objective/entropy": -86.56658935546875, - "objective/kl": 15.54503059387207, - "objective/non_score_reward": -1.5545029640197754, - "objective/rlhf_reward": -4.393183226856302, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 36.390655517578125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9074146747589111, - "step": 255, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9989664554595947 - }, - { - "episode": 4112, - "epoch": 0.07391163676888234, - "loss/policy_avg": -0.2038569152355194, - "lr": 9.83640081799591e-06, - "objective/entropy": -80.65778350830078, - "objective/kl": 20.036571502685547, - "objective/non_score_reward": -2.003657102584839, - "objective/rlhf_reward": -6.189799661907266, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 28.666210174560547, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7191000580787659, - "step": 256, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999082088470459 - }, - { - "episode": 4128, - "epoch": 0.07419923068627099, - "loss/policy_avg": 0.5487632751464844, - "lr": 9.835761758691207e-06, - "objective/entropy": 64.21192932128906, - "objective/kl": 25.55659294128418, - "objective/non_score_reward": -2.555659294128418, - "objective/rlhf_reward": -8.79880495806512, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 74.83338928222656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7529090046882629, - "step": 257, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991776943206787 - }, - { - "episode": 4144, - "epoch": 0.07448682460365963, - "loss/policy_avg": 0.8301103115081787, - "lr": 9.835122699386504e-06, - "objective/entropy": 152.20065307617188, - "objective/kl": 26.725215911865234, - "objective/non_score_reward": -2.6725215911865234, - "objective/rlhf_reward": -10.690086603164673, - "objective/scores": 0.0, - "policy/approxkl_avg": 86.60305786132812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3636325001716614, - "step": 258, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996506929397583 - }, - { - "episode": 4160, - "epoch": 0.07477441852104828, - "loss/policy_avg": 0.6052212119102478, - "lr": 9.8344836400818e-06, - "objective/entropy": 92.0700454711914, - "objective/kl": 20.43947982788086, - "objective/non_score_reward": -2.043948173522949, - "objective/rlhf_reward": -3.775792723894119, - "objective/scores": 1.1, - "policy/approxkl_avg": 6.338429927825928, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4831230342388153, - "step": 259, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987390041351318 - }, - { - "episode": 4176, - "epoch": 0.07506201243843692, - "loss/policy_avg": 0.33531126379966736, - "lr": 9.833844580777096e-06, - "objective/entropy": 103.8875732421875, - "objective/kl": 41.16206741333008, - "objective/non_score_reward": -4.116207122802734, - "objective/rlhf_reward": -14.860707316462118, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 157.35191345214844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5286747217178345, - "step": 260, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989887475967407 - }, - { - "episode": 4192, - "epoch": 0.07534960635582558, - "loss/policy_avg": 0.8983044624328613, - "lr": 9.833205521472393e-06, - "objective/entropy": -19.21771812438965, - "objective/kl": 27.187969207763672, - "objective/non_score_reward": -2.718796968460083, - "objective/rlhf_reward": -8.927777002530036, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 116.0262451171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5258731842041016, - "step": 261, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985415935516357 - }, - { - "episode": 4208, - "epoch": 0.07563720027321422, - "loss/policy_avg": 0.3744966983795166, - "lr": 9.83256646216769e-06, - "objective/entropy": 108.31391906738281, - "objective/kl": 27.059907913208008, - "objective/non_score_reward": -2.705990791320801, - "objective/rlhf_reward": -9.090629831949869, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 132.42181396484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7807722687721252, - "step": 262, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999628067016602 - }, - { - "episode": 4224, - "epoch": 0.07592479419060287, - "loss/policy_avg": -0.06834838539361954, - "lr": 9.831927402862987e-06, - "objective/entropy": -89.212890625, - "objective/kl": 21.477336883544922, - "objective/non_score_reward": -2.147733688354492, - "objective/rlhf_reward": -4.190934514999389, - "objective/scores": 1.1, - "policy/approxkl_avg": 2.770085573196411, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6721138954162598, - "step": 263, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005064010620117 - }, - { - "episode": 4240, - "epoch": 0.07621238810799151, - "loss/policy_avg": 0.20960178971290588, - "lr": 9.831288343558284e-06, - "objective/entropy": 7.579254150390625, - "objective/kl": 31.429780960083008, - "objective/non_score_reward": -3.1429781913757324, - "objective/rlhf_reward": -11.212662303183954, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 39.11629104614258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7809767723083496, - "step": 264, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.996565580368042 - }, - { - "episode": 4256, - "epoch": 0.07649998202538016, - "loss/policy_avg": 0.37524640560150146, - "lr": 9.830649284253579e-06, - "objective/entropy": 211.3717498779297, - "objective/kl": 22.981361389160156, - "objective/non_score_reward": -2.2981362342834473, - "objective/rlhf_reward": -6.792545056343078, - "objective/scores": 0.6, - "policy/approxkl_avg": 6.7515716552734375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7717372179031372, - "step": 265, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9991950988769531 - }, - { - "episode": 4272, - "epoch": 0.0767875759427688, - "loss/policy_avg": 1.0095475912094116, - "lr": 9.830010224948876e-06, - "objective/entropy": -20.248001098632812, - "objective/kl": 24.134700775146484, - "objective/non_score_reward": -2.4134700298309326, - "objective/rlhf_reward": -7.920546785990396, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 72.81602478027344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48144859075546265, - "step": 266, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9972069263458252 - }, - { - "episode": 4288, - "epoch": 0.07707516986015746, - "loss/policy_avg": 0.14088629186153412, - "lr": 9.829371165644173e-06, - "objective/entropy": 199.36297607421875, - "objective/kl": 21.469898223876953, - "objective/non_score_reward": -2.1469898223876953, - "objective/rlhf_reward": -7.031699865069941, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.675331115722656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.8196889162063599, - "step": 267, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000971555709839 - }, - { - "episode": 4304, - "epoch": 0.0773627637775461, - "loss/policy_avg": 0.7135397791862488, - "lr": 9.82873210633947e-06, - "objective/entropy": 132.78390502929688, - "objective/kl": 29.841154098510742, - "objective/non_score_reward": -2.9841156005859375, - "objective/rlhf_reward": -10.485864262194976, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 51.49626159667969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6590239405632019, - "step": 268, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9994990825653076 - }, - { - "episode": 4320, - "epoch": 0.07765035769493475, - "loss/policy_avg": 0.6342403888702393, - "lr": 9.828093047034766e-06, - "objective/entropy": 68.02133178710938, - "objective/kl": 25.947755813598633, - "objective/non_score_reward": -2.594775676727295, - "objective/rlhf_reward": -9.000500419226986, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.322699546813965, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7533285617828369, - "step": 269, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998431205749512 - }, - { - "episode": 4336, - "epoch": 0.0779379516123234, - "loss/policy_avg": 1.3432139158248901, - "lr": 9.827453987730061e-06, - "objective/entropy": -63.51703643798828, - "objective/kl": 25.882217407226562, - "objective/non_score_reward": -2.588221788406372, - "objective/rlhf_reward": -8.230180921331915, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 106.42034912109375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3954962491989136, - "step": 270, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984210729599 - }, - { - "episode": 4352, - "epoch": 0.07822554552971205, - "loss/policy_avg": 0.9003316760063171, - "lr": 9.826814928425358e-06, - "objective/entropy": 303.42669677734375, - "objective/kl": 33.25891876220703, - "objective/non_score_reward": -3.325892448425293, - "objective/rlhf_reward": -11.822616937573315, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 65.77352905273438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7454761266708374, - "step": 271, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9990100860595703 - }, - { - "episode": 4368, - "epoch": 0.0785131394471007, - "loss/policy_avg": 1.1572515964508057, - "lr": 9.826175869120655e-06, - "objective/entropy": -59.230491638183594, - "objective/kl": 25.21849250793457, - "objective/non_score_reward": -2.5218493938446045, - "objective/rlhf_reward": -10.087397575378418, - "objective/scores": 0.0, - "policy/approxkl_avg": 142.75778198242188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5425001382827759, - "step": 272, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998133897781372 - }, - { - "episode": 4384, - "epoch": 0.07880073336448934, - "loss/policy_avg": 0.17176832258701324, - "lr": 9.825536809815952e-06, - "objective/entropy": 213.77191162109375, - "objective/kl": 31.61981773376465, - "objective/non_score_reward": -3.1619815826416016, - "objective/rlhf_reward": -10.7005155784654, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 128.8477783203125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5492858290672302, - "step": 273, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998712539672852 - }, - { - "episode": 4400, - "epoch": 0.079088327281878, - "loss/policy_avg": -0.22184377908706665, - "lr": 9.824897750511249e-06, - "objective/entropy": 161.00198364257812, - "objective/kl": 34.806671142578125, - "objective/non_score_reward": -3.4806675910949707, - "objective/rlhf_reward": -11.799963655249151, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 48.8912239074707, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4313841462135315, - "step": 274, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0014257431030273 - }, - { - "episode": 4416, - "epoch": 0.07937592119926663, - "loss/policy_avg": 0.590415358543396, - "lr": 9.824258691206546e-06, - "objective/entropy": -94.14356231689453, - "objective/kl": 28.92959976196289, - "objective/non_score_reward": -2.8929600715637207, - "objective/rlhf_reward": -9.90998030227481, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.575645923614502, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.449258029460907, - "step": 275, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994723796844482 - }, - { - "episode": 4432, - "epoch": 0.07966351511665529, - "loss/policy_avg": 0.2740442454814911, - "lr": 9.823619631901841e-06, - "objective/entropy": 56.66014099121094, - "objective/kl": 24.139942169189453, - "objective/non_score_reward": -2.413994073867798, - "objective/rlhf_reward": -7.533270301596199, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 41.256080627441406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6261377334594727, - "step": 276, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9990897178649902 - }, - { - "episode": 4448, - "epoch": 0.07995110903404393, - "loss/policy_avg": 0.026854295283555984, - "lr": 9.822980572597138e-06, - "objective/entropy": 135.07037353515625, - "objective/kl": 30.443017959594727, - "objective/non_score_reward": -3.044301748275757, - "objective/rlhf_reward": -12.177206993103027, - "objective/scores": 0.0, - "policy/approxkl_avg": 14.024923324584961, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5266727209091187, - "step": 277, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980167150497437 - }, - { - "episode": 4464, - "epoch": 0.08023870295143258, - "loss/policy_avg": 0.0908375084400177, - "lr": 9.822341513292433e-06, - "objective/entropy": 98.10940551757812, - "objective/kl": 26.351314544677734, - "objective/non_score_reward": -2.635131359100342, - "objective/rlhf_reward": -9.059573057110667, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 61.92028045654297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5201822519302368, - "step": 278, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9965415000915527 - }, - { - "episode": 4480, - "epoch": 0.08052629686882122, - "loss/policy_avg": 0.3492497205734253, - "lr": 9.82170245398773e-06, - "objective/entropy": 79.57078552246094, - "objective/kl": 28.74835205078125, - "objective/non_score_reward": -2.8748350143432617, - "objective/rlhf_reward": -7.099340653419494, - "objective/scores": 1.1, - "policy/approxkl_avg": 45.850738525390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7365690469741821, - "step": 279, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999916553497314 - }, - { - "episode": 4496, - "epoch": 0.08081389078620987, - "loss/policy_avg": 0.6324511170387268, - "lr": 9.821063394683027e-06, - "objective/entropy": 116.90592956542969, - "objective/kl": 33.273155212402344, - "objective/non_score_reward": -3.3273158073425293, - "objective/rlhf_reward": -8.90926299095154, - "objective/scores": 1.1, - "policy/approxkl_avg": 50.5905647277832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5080363750457764, - "step": 280, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990668296813965 - }, - { - "episode": 4512, - "epoch": 0.08110148470359851, - "loss/policy_avg": -0.1385992020368576, - "lr": 9.820424335378324e-06, - "objective/entropy": 72.11842346191406, - "objective/kl": 33.207122802734375, - "objective/non_score_reward": -3.320712089538574, - "objective/rlhf_reward": -13.282849073410034, - "objective/scores": 0.0, - "policy/approxkl_avg": 60.59511184692383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8196091651916504, - "step": 281, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998537302017212 - }, - { - "episode": 4528, - "epoch": 0.08138907862098717, - "loss/policy_avg": -0.2620585262775421, - "lr": 9.81978527607362e-06, - "objective/entropy": -5.884607315063477, - "objective/kl": 39.53453063964844, - "objective/non_score_reward": -3.9534530639648438, - "objective/rlhf_reward": -13.691106977240118, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 83.97123718261719, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4607764780521393, - "step": 282, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000446319580078 - }, - { - "episode": 4544, - "epoch": 0.08167667253837581, - "loss/policy_avg": 0.8184198141098022, - "lr": 9.819146216768916e-06, - "objective/entropy": -124.17362976074219, - "objective/kl": 30.42546844482422, - "objective/non_score_reward": -3.0425467491149902, - "objective/rlhf_reward": -10.566067729059775, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 20.279199600219727, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.602076530456543, - "step": 283, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996403455734253 - }, - { - "episode": 4560, - "epoch": 0.08196426645576446, - "loss/policy_avg": 0.1789843738079071, - "lr": 9.818507157464213e-06, - "objective/entropy": 173.48333740234375, - "objective/kl": 23.40087890625, - "objective/non_score_reward": -2.340087890625, - "objective/rlhf_reward": -7.981749632445675, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 17.03640365600586, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6565302014350891, - "step": 284, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.004666566848755 - }, - { - "episode": 4576, - "epoch": 0.0822518603731531, - "loss/policy_avg": 1.0035152435302734, - "lr": 9.81786809815951e-06, - "objective/entropy": 18.757537841796875, - "objective/kl": 24.085613250732422, - "objective/non_score_reward": -2.4085617065429688, - "objective/rlhf_reward": -5.2342465877532955, - "objective/scores": 1.1, - "policy/approxkl_avg": 54.95973587036133, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5847882032394409, - "step": 285, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993207454681396 - }, - { - "episode": 4592, - "epoch": 0.08253945429054176, - "loss/policy_avg": 5.199029922485352, - "lr": 9.817229038854806e-06, - "objective/entropy": -160.87271118164062, - "objective/kl": 20.840656280517578, - "objective/non_score_reward": -2.0840654373168945, - "objective/rlhf_reward": -5.936261987686157, - "objective/scores": 0.6, - "policy/approxkl_avg": 9.209554672241211, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6878505945205688, - "step": 286, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999942779541016 - }, - { - "episode": 4608, - "epoch": 0.0828270482079304, - "loss/policy_avg": 1.134081244468689, - "lr": 9.816589979550103e-06, - "objective/entropy": 120.20220947265625, - "objective/kl": 32.1230583190918, - "objective/non_score_reward": -3.212306499481201, - "objective/rlhf_reward": -11.449225521087648, - "objective/scores": 0.35, - "policy/approxkl_avg": 37.81696319580078, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8285540342330933, - "step": 287, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9991226196289062 - }, - { - "episode": 4624, - "epoch": 0.08311464212531905, - "loss/policy_avg": 0.17092914879322052, - "lr": 9.8159509202454e-06, - "objective/entropy": 6.329719543457031, - "objective/kl": 29.584348678588867, - "objective/non_score_reward": -2.95843505859375, - "objective/rlhf_reward": -10.171880011976349, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 29.629112243652344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5302486419677734, - "step": 288, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0020604133605957 - }, - { - "episode": 4640, - "epoch": 0.0834022360427077, - "loss/policy_avg": 0.17788568139076233, - "lr": 9.815311860940695e-06, - "objective/entropy": 21.96484375, - "objective/kl": 28.446231842041016, - "objective/non_score_reward": -2.84462308883667, - "objective/rlhf_reward": -9.431081603245671, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 137.49514770507812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6393083333969116, - "step": 289, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981507062911987 - }, - { - "episode": 4656, - "epoch": 0.08368982996009634, - "loss/policy_avg": 0.4766189754009247, - "lr": 9.814672801635992e-06, - "objective/entropy": 87.13041687011719, - "objective/kl": 26.18436050415039, - "objective/non_score_reward": -2.618436098098755, - "objective/rlhf_reward": -9.095142700759274, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 64.16291809082031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4916858971118927, - "step": 290, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992941617965698 - }, - { - "episode": 4672, - "epoch": 0.083977423877485, - "loss/policy_avg": 7.575510025024414, - "lr": 9.81403374233129e-06, - "objective/entropy": -187.93580627441406, - "objective/kl": 21.01421356201172, - "objective/non_score_reward": -2.101421356201172, - "objective/rlhf_reward": -6.9550868078187555, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 58.152530670166016, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.793678343296051, - "step": 291, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9997904300689697 - }, - { - "episode": 4688, - "epoch": 0.08426501779487364, - "loss/policy_avg": 0.3069241940975189, - "lr": 9.813394683026586e-06, - "objective/entropy": 95.74089050292969, - "objective/kl": 22.938138961791992, - "objective/non_score_reward": -2.293813943862915, - "objective/rlhf_reward": -7.052549543158088, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 9.722650527954102, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5521177053451538, - "step": 292, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989383220672607 - }, - { - "episode": 4704, - "epoch": 0.08455261171226229, - "loss/policy_avg": 0.8028863072395325, - "lr": 9.812755623721883e-06, - "objective/entropy": 225.46250915527344, - "objective/kl": 32.304569244384766, - "objective/non_score_reward": -3.230457305908203, - "objective/rlhf_reward": -11.18849541346232, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 103.39628601074219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5377808809280396, - "step": 293, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9996423721313477 - }, - { - "episode": 4720, - "epoch": 0.08484020562965093, - "loss/policy_avg": 0.5835884809494019, - "lr": 9.81211656441718e-06, - "objective/entropy": 75.27652740478516, - "objective/kl": 30.011789321899414, - "objective/non_score_reward": -3.0011792182922363, - "objective/rlhf_reward": -9.882009925619636, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 10.76335334777832, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7725957632064819, - "step": 294, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982374906539917 - }, - { - "episode": 4736, - "epoch": 0.08512779954703958, - "loss/policy_avg": 0.17510247230529785, - "lr": 9.811477505112475e-06, - "objective/entropy": 153.28558349609375, - "objective/kl": 35.96855926513672, - "objective/non_score_reward": -3.596856117248535, - "objective/rlhf_reward": -12.906472566540598, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 19.366321563720703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6706559658050537, - "step": 295, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993175268173218 - }, - { - "episode": 4752, - "epoch": 0.08541539346442822, - "loss/policy_avg": 0.4794872999191284, - "lr": 9.810838445807772e-06, - "objective/entropy": 254.9187469482422, - "objective/kl": 34.023677825927734, - "objective/non_score_reward": -3.4023680686950684, - "objective/rlhf_reward": -12.158873777003631, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 62.12803268432617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6503519415855408, - "step": 296, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9984500408172607 - }, - { - "episode": 4768, - "epoch": 0.08570298738181688, - "loss/policy_avg": 1.1904816627502441, - "lr": 9.810199386503069e-06, - "objective/entropy": 146.021484375, - "objective/kl": 35.92856216430664, - "objective/non_score_reward": -3.5928561687469482, - "objective/rlhf_reward": -12.424013684468207, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 37.72700500488281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6158914566040039, - "step": 297, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981852769851685 - }, - { - "episode": 4784, - "epoch": 0.08599058129920552, - "loss/policy_avg": 0.000278279185295105, - "lr": 9.809560327198366e-06, - "objective/entropy": 178.57492065429688, - "objective/kl": 34.800636291503906, - "objective/non_score_reward": -3.4800639152526855, - "objective/rlhf_reward": -10.99653712356207, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 41.639854431152344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7849889993667603, - "step": 298, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9984562397003174 - }, - { - "episode": 4800, - "epoch": 0.08627817521659417, - "loss/policy_avg": 0.7629772424697876, - "lr": 9.808921267893663e-06, - "objective/entropy": -145.59861755371094, - "objective/kl": 28.413082122802734, - "objective/non_score_reward": -2.841308116912842, - "objective/rlhf_reward": -6.965232110023498, - "objective/scores": 1.1, - "policy/approxkl_avg": 13.004857063293457, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.684173047542572, - "step": 299, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994359016418457 - }, - { - "episode": 4816, - "epoch": 0.08656576913398281, - "loss/policy_avg": 1.7354516983032227, - "lr": 9.808282208588958e-06, - "objective/entropy": 272.84912109375, - "objective/kl": 26.817108154296875, - "objective/non_score_reward": -2.681710720062256, - "objective/rlhf_reward": -9.211070620807346, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 35.25104904174805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8183693885803223, - "step": 300, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998483657836914 - }, - { - "episode": 4832, - "epoch": 0.08685336305137147, - "loss/policy_avg": 0.06534934043884277, - "lr": 9.807643149284255e-06, - "objective/entropy": 152.22633361816406, - "objective/kl": 30.80361557006836, - "objective/non_score_reward": -3.0803616046905518, - "objective/rlhf_reward": -12.321446180343628, - "objective/scores": 0.0, - "policy/approxkl_avg": 187.40298461914062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.619062066078186, - "step": 301, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9991655349731445 - }, - { - "episode": 4848, - "epoch": 0.0871409569687601, - "loss/policy_avg": 1.8463071584701538, - "lr": 9.80700408997955e-06, - "objective/entropy": -59.8196907043457, - "objective/kl": 31.326427459716797, - "objective/non_score_reward": -3.132642984390259, - "objective/rlhf_reward": -11.014799916537937, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 56.62882995605469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6038594245910645, - "step": 302, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993600845336914 - }, - { - "episode": 4864, - "epoch": 0.08742855088614876, - "loss/policy_avg": 0.08039037883281708, - "lr": 9.806365030674847e-06, - "objective/entropy": 40.064144134521484, - "objective/kl": 22.286996841430664, - "objective/non_score_reward": -2.2286999225616455, - "objective/rlhf_reward": -7.514799362421035, - "objective/scores": 0.35, - "policy/approxkl_avg": 38.59841537475586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49097996950149536, - "step": 303, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9966613054275513 - }, - { - "episode": 4880, - "epoch": 0.0877161448035374, - "loss/policy_avg": 0.01872839219868183, - "lr": 9.805725971370144e-06, - "objective/entropy": 58.7380256652832, - "objective/kl": 28.672008514404297, - "objective/non_score_reward": -2.8672008514404297, - "objective/rlhf_reward": -9.735470251242319, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 149.07861328125, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6022211313247681, - "step": 304, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0004193782806396 - }, - { - "episode": 4896, - "epoch": 0.08800373872092605, - "loss/policy_avg": 0.5821743011474609, - "lr": 9.80508691206544e-06, - "objective/entropy": -12.124443054199219, - "objective/kl": 24.10376739501953, - "objective/non_score_reward": -2.410377025604248, - "objective/rlhf_reward": -5.241507506370544, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.3420569896698, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6286916732788086, - "step": 305, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002763509750366 - }, - { - "episode": 4912, - "epoch": 0.08829133263831469, - "loss/policy_avg": 0.32468903064727783, - "lr": 9.804447852760737e-06, - "objective/entropy": -245.09518432617188, - "objective/kl": 25.548696517944336, - "objective/non_score_reward": -2.5548696517944336, - "objective/rlhf_reward": -10.219478368759155, - "objective/scores": 0.0, - "policy/approxkl_avg": 18.726303100585938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.633787989616394, - "step": 306, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003480911254883 - }, - { - "episode": 4928, - "epoch": 0.08857892655570335, - "loss/policy_avg": 0.5798380970954895, - "lr": 9.803808793456034e-06, - "objective/entropy": 91.35831451416016, - "objective/kl": 35.70774459838867, - "objective/non_score_reward": -3.570774555206299, - "objective/rlhf_reward": -14.283098220825195, - "objective/scores": 0.0, - "policy/approxkl_avg": 44.0499267578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4979282021522522, - "step": 307, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003609657287598 - }, - { - "episode": 4944, - "epoch": 0.088866520473092, - "loss/policy_avg": 0.36592239141464233, - "lr": 9.80316973415133e-06, - "objective/entropy": 39.27040100097656, - "objective/kl": 30.252880096435547, - "objective/non_score_reward": -3.025287628173828, - "objective/rlhf_reward": -10.775638136893434, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.1499075889587402, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6890300512313843, - "step": 308, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988046884536743 - }, - { - "episode": 4960, - "epoch": 0.08915411439048064, - "loss/policy_avg": 0.08172816783189774, - "lr": 9.802530674846626e-06, - "objective/entropy": -196.7550811767578, - "objective/kl": 30.32009506225586, - "objective/non_score_reward": -3.0320096015930176, - "objective/rlhf_reward": -9.204319153667662, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 8.101791381835938, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5759010910987854, - "step": 309, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000977516174316 - }, - { - "episode": 4976, - "epoch": 0.0894417083078693, - "loss/policy_avg": 0.5907818078994751, - "lr": 9.801891615541923e-06, - "objective/entropy": -3.5698318481445312, - "objective/kl": 28.213176727294922, - "objective/non_score_reward": -2.8213181495666504, - "objective/rlhf_reward": -9.72901317378576, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 56.35433578491211, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6199610233306885, - "step": 310, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9979722499847412 - }, - { - "episode": 4992, - "epoch": 0.08972930222525793, - "loss/policy_avg": 0.39707911014556885, - "lr": 9.80125255623722e-06, - "objective/entropy": -11.338485717773438, - "objective/kl": 24.322521209716797, - "objective/non_score_reward": -2.4322521686553955, - "objective/rlhf_reward": -8.350406386939389, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.5820951461792, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6451054811477661, - "step": 311, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0019586086273193 - }, - { - "episode": 5008, - "epoch": 0.09001689614264659, - "loss/policy_avg": -0.07866669446229935, - "lr": 9.800613496932517e-06, - "objective/entropy": 170.05404663085938, - "objective/kl": 28.295799255371094, - "objective/non_score_reward": -2.8295798301696777, - "objective/rlhf_reward": -9.894486983020869, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 40.782066345214844, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6188048124313354, - "step": 312, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0017778873443604 - }, - { - "episode": 5024, - "epoch": 0.09030449006003523, - "loss/policy_avg": -0.23688295483589172, - "lr": 9.799974437627812e-06, - "objective/entropy": 156.63333129882812, - "objective/kl": 27.922500610351562, - "objective/non_score_reward": -2.792250156402588, - "objective/rlhf_reward": -9.718402723880157, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 22.294483184814453, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4276520609855652, - "step": 313, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0026590824127197 - }, - { - "episode": 5040, - "epoch": 0.09059208397742388, - "loss/policy_avg": 0.09796786308288574, - "lr": 9.799335378323109e-06, - "objective/entropy": -10.673637390136719, - "objective/kl": 20.40918731689453, - "objective/non_score_reward": -2.0409185886383057, - "objective/rlhf_reward": -8.163674473762512, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.275084495544434, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5387430787086487, - "step": 314, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982633590698242 - }, - { - "episode": 5056, - "epoch": 0.09087967789481252, - "loss/policy_avg": 0.17557716369628906, - "lr": 9.798696319018406e-06, - "objective/entropy": 20.533397674560547, - "objective/kl": 33.14729309082031, - "objective/non_score_reward": -3.3147292137145996, - "objective/rlhf_reward": -11.702658264842583, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 58.23655700683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6181402206420898, - "step": 315, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998816728591919 - }, - { - "episode": 5072, - "epoch": 0.09116727181220118, - "loss/policy_avg": 0.28663304448127747, - "lr": 9.798057259713703e-06, - "objective/entropy": 110.77783203125, - "objective/kl": 24.706939697265625, - "objective/non_score_reward": -2.470694065093994, - "objective/rlhf_reward": -8.278656039301472, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 21.429655075073242, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6165672540664673, - "step": 316, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001864433288574 - }, - { - "episode": 5088, - "epoch": 0.09145486572958982, - "loss/policy_avg": 0.0841158926486969, - "lr": 9.797418200409e-06, - "objective/entropy": 64.50070190429688, - "objective/kl": 32.75787353515625, - "objective/non_score_reward": -3.275787353515625, - "objective/rlhf_reward": -11.499028954569418, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 102.43559265136719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6068868637084961, - "step": 317, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997694492340088 - }, - { - "episode": 5104, - "epoch": 0.09174245964697847, - "loss/policy_avg": 0.35147473216056824, - "lr": 9.796779141104296e-06, - "objective/entropy": 208.5213623046875, - "objective/kl": 31.126712799072266, - "objective/non_score_reward": -3.112671375274658, - "objective/rlhf_reward": -11.10904937079492, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 146.6444091796875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7985448837280273, - "step": 318, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988431930541992 - }, - { - "episode": 5120, - "epoch": 0.09203005356436711, - "loss/policy_avg": 0.19098839163780212, - "lr": 9.796140081799592e-06, - "objective/entropy": -30.1602783203125, - "objective/kl": 31.919559478759766, - "objective/non_score_reward": -3.191955804824829, - "objective/rlhf_reward": -11.36782262325287, - "objective/scores": 0.35, - "policy/approxkl_avg": 31.554279327392578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7731765508651733, - "step": 319, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0018060207366943 - }, - { - "episode": 5136, - "epoch": 0.09231764748175576, - "loss/policy_avg": 0.05387501046061516, - "lr": 9.795501022494888e-06, - "objective/entropy": 109.4754638671875, - "objective/kl": 32.21202850341797, - "objective/non_score_reward": -3.2212026119232178, - "objective/rlhf_reward": -11.434212188334808, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 7.5359039306640625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5108368396759033, - "step": 320, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981523752212524 - }, - { - "episode": 5152, - "epoch": 0.0926052413991444, - "loss/policy_avg": 0.5724260210990906, - "lr": 9.794861963190185e-06, - "objective/entropy": 62.85846710205078, - "objective/kl": 30.164125442504883, - "objective/non_score_reward": -3.0164127349853516, - "objective/rlhf_reward": -10.332317308584848, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 44.99430465698242, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7216867208480835, - "step": 321, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000819683074951 - }, - { - "episode": 5168, - "epoch": 0.09289283531653306, - "loss/policy_avg": 0.23510941863059998, - "lr": 9.794222903885482e-06, - "objective/entropy": 88.79434204101562, - "objective/kl": 33.60057830810547, - "objective/non_score_reward": -3.360057830810547, - "objective/rlhf_reward": -11.04023096561432, - "objective/scores": 0.6, - "policy/approxkl_avg": 60.40937805175781, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6445315480232239, - "step": 322, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001824378967285 - }, - { - "episode": 5184, - "epoch": 0.0931804292339217, - "loss/policy_avg": 0.10644792020320892, - "lr": 9.793583844580777e-06, - "objective/entropy": 73.26347351074219, - "objective/kl": 32.69441223144531, - "objective/non_score_reward": -3.2694411277770996, - "objective/rlhf_reward": -11.521504967418268, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 11.021139144897461, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5057616829872131, - "step": 323, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9983140230178833 - }, - { - "episode": 5200, - "epoch": 0.09346802315131035, - "loss/policy_avg": 0.7994442582130432, - "lr": 9.792944785276074e-06, - "objective/entropy": 88.5349349975586, - "objective/kl": 25.706418991088867, - "objective/non_score_reward": -2.5706419944763184, - "objective/rlhf_reward": -8.62070823234378, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 55.555015563964844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.554456889629364, - "step": 324, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997270107269287 - }, - { - "episode": 5216, - "epoch": 0.09375561706869899, - "loss/policy_avg": 0.5393191576004028, - "lr": 9.792305725971371e-06, - "objective/entropy": 74.77957153320312, - "objective/kl": 36.75124740600586, - "objective/non_score_reward": -3.6751246452331543, - "objective/rlhf_reward": -12.967165247599283, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 22.211036682128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47837570309638977, - "step": 325, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0005834102630615 - }, - { - "episode": 5232, - "epoch": 0.09404321098608764, - "loss/policy_avg": 0.5926495790481567, - "lr": 9.791666666666666e-06, - "objective/entropy": 94.69478607177734, - "objective/kl": 32.18170166015625, - "objective/non_score_reward": -3.218170166015625, - "objective/rlhf_reward": -11.531045725851683, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 12.2184419631958, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9364026188850403, - "step": 326, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986941814422607 - }, - { - "episode": 5248, - "epoch": 0.0943308049034763, - "loss/policy_avg": 8.741055488586426, - "lr": 9.791027607361963e-06, - "objective/entropy": 13.209190368652344, - "objective/kl": 46.40322494506836, - "objective/non_score_reward": -4.640322208404541, - "objective/rlhf_reward": -17.21965341857019, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 196.84405517578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6481121182441711, - "step": 327, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997447729110718 - }, - { - "episode": 5264, - "epoch": 0.09461839882086494, - "loss/policy_avg": -0.0158542487770319, - "lr": 9.79038854805726e-06, - "objective/entropy": -67.68810272216797, - "objective/kl": 25.325042724609375, - "objective/non_score_reward": -2.5325045585632324, - "objective/rlhf_reward": -8.573758213725641, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.798250198364258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7733464241027832, - "step": 328, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998700499534607 - }, - { - "episode": 5280, - "epoch": 0.09490599273825359, - "loss/policy_avg": 0.06980250030755997, - "lr": 9.789749488752557e-06, - "objective/entropy": 66.16055297851562, - "objective/kl": 28.001384735107422, - "objective/non_score_reward": -2.8001387119293213, - "objective/rlhf_reward": -9.841305100654049, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.662395477294922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4329299330711365, - "step": 329, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9985547065734863 - }, - { - "episode": 5296, - "epoch": 0.09519358665564223, - "loss/policy_avg": 1.1175042390823364, - "lr": 9.789110429447854e-06, - "objective/entropy": 198.39385986328125, - "objective/kl": 35.409645080566406, - "objective/non_score_reward": -3.5409646034240723, - "objective/rlhf_reward": -12.041152181402715, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 80.42436218261719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4869406819343567, - "step": 330, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000159502029419 - }, - { - "episode": 5312, - "epoch": 0.09548118057303089, - "loss/policy_avg": 0.2751445472240448, - "lr": 9.78847137014315e-06, - "objective/entropy": 171.96897888183594, - "objective/kl": 39.34714889526367, - "objective/non_score_reward": -3.9347147941589355, - "objective/rlhf_reward": -13.914030189785073, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 72.23497009277344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49671417474746704, - "step": 331, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9995276927947998 - }, - { - "episode": 5328, - "epoch": 0.09576877449041953, - "loss/policy_avg": 0.7539587616920471, - "lr": 9.787832310838446e-06, - "objective/entropy": 8.914024353027344, - "objective/kl": 21.132511138916016, - "objective/non_score_reward": -2.113251209259033, - "objective/rlhf_reward": -6.848884735170918, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 55.41283416748047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5207578539848328, - "step": 332, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999730110168457 - }, - { - "episode": 5344, - "epoch": 0.09605636840780818, - "loss/policy_avg": 0.08111564069986343, - "lr": 9.787193251533743e-06, - "objective/entropy": -32.56279754638672, - "objective/kl": 26.932476043701172, - "objective/non_score_reward": -2.6932475566864014, - "objective/rlhf_reward": -9.447477314501924, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 31.9769344329834, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5189494490623474, - "step": 333, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998490810394287 - }, - { - "episode": 5360, - "epoch": 0.09634396232519682, - "loss/policy_avg": 0.12806567549705505, - "lr": 9.78655419222904e-06, - "objective/entropy": -60.638038635253906, - "objective/kl": 33.80628204345703, - "objective/non_score_reward": -3.3806281089782715, - "objective/rlhf_reward": -12.006740414889988, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 97.76350402832031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5655949115753174, - "step": 334, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991028308868408 - }, - { - "episode": 5376, - "epoch": 0.09663155624258547, - "loss/policy_avg": 0.4162527918815613, - "lr": 9.785915132924337e-06, - "objective/entropy": 73.74658203125, - "objective/kl": 28.956912994384766, - "objective/non_score_reward": -2.895691394805908, - "objective/rlhf_reward": -9.757937069210122, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 12.659797668457031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6926892995834351, - "step": 335, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990322589874268 - }, - { - "episode": 5392, - "epoch": 0.09691915015997411, - "loss/policy_avg": 0.6766362190246582, - "lr": 9.785276073619633e-06, - "objective/entropy": -167.6099090576172, - "objective/kl": 33.4842414855957, - "objective/non_score_reward": -3.3484244346618652, - "objective/rlhf_reward": -10.46997794950125, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 45.80317687988281, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6405798196792603, - "step": 336, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999110221862793 - }, - { - "episode": 5408, - "epoch": 0.09720674407736277, - "loss/policy_avg": 0.7705954909324646, - "lr": 9.784637014314929e-06, - "objective/entropy": 189.44476318359375, - "objective/kl": 40.57612991333008, - "objective/non_score_reward": -4.057613372802734, - "objective/rlhf_reward": -14.283041070179877, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 47.49778747558594, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7007959485054016, - "step": 337, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9955544471740723 - }, - { - "episode": 5424, - "epoch": 0.0974943379947514, - "loss/policy_avg": 0.8678327798843384, - "lr": 9.783997955010226e-06, - "objective/entropy": 138.7545166015625, - "objective/kl": 43.06449890136719, - "objective/non_score_reward": -4.306450366973877, - "objective/rlhf_reward": -15.775203089328155, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 187.52108764648438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6742819547653198, - "step": 338, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000286102294922 - }, - { - "episode": 5440, - "epoch": 0.09778193191214006, - "loss/policy_avg": 0.13020552694797516, - "lr": 9.783358895705522e-06, - "objective/entropy": -34.55393981933594, - "objective/kl": 27.52876091003418, - "objective/non_score_reward": -2.7528762817382812, - "objective/rlhf_reward": -9.530552032406687, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.538684844970703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.45005565881729126, - "step": 339, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000762462615967 - }, - { - "episode": 5456, - "epoch": 0.0980695258295287, - "loss/policy_avg": 0.8651669025421143, - "lr": 9.78271983640082e-06, - "objective/entropy": 184.3627471923828, - "objective/kl": 31.240346908569336, - "objective/non_score_reward": -3.124034881591797, - "objective/rlhf_reward": -12.49613881111145, - "objective/scores": 0.0, - "policy/approxkl_avg": 16.510074615478516, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4378349781036377, - "step": 340, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.998918056488037 - }, - { - "episode": 5472, - "epoch": 0.09835711974691735, - "loss/policy_avg": 0.13001634180545807, - "lr": 9.782080777096116e-06, - "objective/entropy": 123.772705078125, - "objective/kl": 33.01024627685547, - "objective/non_score_reward": -3.3010246753692627, - "objective/rlhf_reward": -8.804098105430603, - "objective/scores": 1.1, - "policy/approxkl_avg": 55.6832275390625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7997548580169678, - "step": 341, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001884937286377 - }, - { - "episode": 5488, - "epoch": 0.098644713664306, - "loss/policy_avg": 0.538088321685791, - "lr": 9.781441717791413e-06, - "objective/entropy": 208.26202392578125, - "objective/kl": 28.19437026977539, - "objective/non_score_reward": -2.819437026977539, - "objective/rlhf_reward": -9.33033687897199, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 56.31122589111328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.775277316570282, - "step": 342, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9953358173370361 - }, - { - "episode": 5504, - "epoch": 0.09893230758169465, - "loss/policy_avg": -0.014354228973388672, - "lr": 9.780802658486708e-06, - "objective/entropy": -60.35287857055664, - "objective/kl": 25.630271911621094, - "objective/non_score_reward": -2.5630269050598145, - "objective/rlhf_reward": -8.926595482855959, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.3227713108062744, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4413827657699585, - "step": 343, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0038928985595703 - }, - { - "episode": 5520, - "epoch": 0.09921990149908329, - "loss/policy_avg": 0.27923208475112915, - "lr": 9.780163599182005e-06, - "objective/entropy": -24.742401123046875, - "objective/kl": 31.480648040771484, - "objective/non_score_reward": -3.1480648517608643, - "objective/rlhf_reward": -11.233009540770931, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.539884567260742, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5237823724746704, - "step": 344, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9998574256896973 - }, - { - "episode": 5536, - "epoch": 0.09950749541647194, - "loss/policy_avg": 0.1885061115026474, - "lr": 9.7795245398773e-06, - "objective/entropy": 182.22181701660156, - "objective/kl": 29.661117553710938, - "objective/non_score_reward": -2.966111660003662, - "objective/rlhf_reward": -10.522810748129515, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 3.2202861309051514, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6761025786399841, - "step": 345, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999419927597046 - }, - { - "episode": 5552, - "epoch": 0.0997950893338606, - "loss/policy_avg": 0.7343586087226868, - "lr": 9.778885480572597e-06, - "objective/entropy": 145.13526916503906, - "objective/kl": 45.35038375854492, - "objective/non_score_reward": -4.535038471221924, - "objective/rlhf_reward": -16.315324659618447, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 33.988563537597656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7404603958129883, - "step": 346, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.995590329170227 - }, - { - "episode": 5568, - "epoch": 0.10008268325124924, - "loss/policy_avg": 0.6405590772628784, - "lr": 9.778246421267894e-06, - "objective/entropy": 162.7369842529297, - "objective/kl": 37.150367736816406, - "objective/non_score_reward": -3.7150371074676514, - "objective/rlhf_reward": -13.379196050579905, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 36.95792770385742, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8137757778167725, - "step": 347, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9953217506408691 - }, - { - "episode": 5584, - "epoch": 0.10037027716863789, - "loss/policy_avg": 0.13212129473686218, - "lr": 9.777607361963191e-06, - "objective/entropy": 206.94252014160156, - "objective/kl": 34.0411262512207, - "objective/non_score_reward": -3.4041128158569336, - "objective/rlhf_reward": -11.216450786590576, - "objective/scores": 0.6, - "policy/approxkl_avg": 133.2515869140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7521044015884399, - "step": 348, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959970712661743 - }, - { - "episode": 5600, - "epoch": 0.10065787108602653, - "loss/policy_avg": 0.9090590476989746, - "lr": 9.776968302658488e-06, - "objective/entropy": 55.456298828125, - "objective/kl": 24.91229248046875, - "objective/non_score_reward": -2.4912290573120117, - "objective/rlhf_reward": -8.140087719234536, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 29.07049560546875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8177493810653687, - "step": 349, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979220628738403 - }, - { - "episode": 5616, - "epoch": 0.10094546500341518, - "loss/policy_avg": 0.46943986415863037, - "lr": 9.776329243353783e-06, - "objective/entropy": 153.11770629882812, - "objective/kl": 31.714759826660156, - "objective/non_score_reward": -3.171476125717163, - "objective/rlhf_reward": -9.762185250164244, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 49.198020935058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3995407819747925, - "step": 350, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9990980625152588 - }, - { - "episode": 5632, - "epoch": 0.10123305892080382, - "loss/policy_avg": 0.12656962871551514, - "lr": 9.77569018404908e-06, - "objective/entropy": 109.22264862060547, - "objective/kl": 28.461389541625977, - "objective/non_score_reward": -2.8461389541625977, - "objective/rlhf_reward": -8.46083704078314, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.512847900390625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.588497519493103, - "step": 351, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977093935012817 - }, - { - "episode": 5648, - "epoch": 0.10152065283819248, - "loss/policy_avg": 0.7170840501785278, - "lr": 9.775051124744377e-06, - "objective/entropy": 14.107101440429688, - "objective/kl": 41.7979736328125, - "objective/non_score_reward": -4.179797172546387, - "objective/rlhf_reward": -15.377552798300414, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 11.696022987365723, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7339128255844116, - "step": 352, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979870319366455 - }, - { - "episode": 5664, - "epoch": 0.10180824675558112, - "loss/policy_avg": 0.8306883573532104, - "lr": 9.774412065439674e-06, - "objective/entropy": -67.41658782958984, - "objective/kl": 26.34395408630371, - "objective/non_score_reward": -2.6343955993652344, - "objective/rlhf_reward": -8.137582039833068, - "objective/scores": 0.6, - "policy/approxkl_avg": 115.25839233398438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6014617681503296, - "step": 353, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9970800876617432 - }, - { - "episode": 5680, - "epoch": 0.10209584067296977, - "loss/policy_avg": 2.176168918609619, - "lr": 9.77377300613497e-06, - "objective/entropy": 134.90728759765625, - "objective/kl": 31.819995880126953, - "objective/non_score_reward": -3.181999683380127, - "objective/rlhf_reward": -12.727998733520508, - "objective/scores": 0.0, - "policy/approxkl_avg": 26.059894561767578, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5675879716873169, - "step": 354, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992128610610962 - }, - { - "episode": 5696, - "epoch": 0.10238343459035841, - "loss/policy_avg": 0.9548969268798828, - "lr": 9.773133946830267e-06, - "objective/entropy": -45.11736297607422, - "objective/kl": 30.003692626953125, - "objective/non_score_reward": -3.000369071960449, - "objective/rlhf_reward": -10.659840753584533, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 80.65755462646484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6548440456390381, - "step": 355, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9980876445770264 - }, - { - "episode": 5712, - "epoch": 0.10267102850774706, - "loss/policy_avg": -0.09791913628578186, - "lr": 9.772494887525563e-06, - "objective/entropy": 59.10938262939453, - "objective/kl": 24.62106704711914, - "objective/non_score_reward": -2.462106943130493, - "objective/rlhf_reward": -8.489177667830868, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 34.42068099975586, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7636164426803589, - "step": 356, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001443862915039 - }, - { - "episode": 5728, - "epoch": 0.1029586224251357, - "loss/policy_avg": 0.6120666265487671, - "lr": 9.77185582822086e-06, - "objective/entropy": 222.30874633789062, - "objective/kl": 32.64442825317383, - "objective/non_score_reward": -3.2644426822662354, - "objective/rlhf_reward": -11.576818349774243, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.193035125732422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5914427042007446, - "step": 357, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.002417802810669 - }, - { - "episode": 5744, - "epoch": 0.10324621634252436, - "loss/policy_avg": 0.25659894943237305, - "lr": 9.771216768916156e-06, - "objective/entropy": 173.52723693847656, - "objective/kl": 29.877527236938477, - "objective/non_score_reward": -2.987752914428711, - "objective/rlhf_reward": -7.551011657714843, - "objective/scores": 1.1, - "policy/approxkl_avg": 18.964191436767578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7272264361381531, - "step": 358, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968700408935547 - }, - { - "episode": 5760, - "epoch": 0.103533810259913, - "loss/policy_avg": 0.4551319479942322, - "lr": 9.770577709611453e-06, - "objective/entropy": 66.63546752929688, - "objective/kl": 29.777273178100586, - "objective/non_score_reward": -2.9777274131774902, - "objective/rlhf_reward": -9.510909175872802, - "objective/scores": 0.6, - "policy/approxkl_avg": 110.96263885498047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.554497241973877, - "step": 359, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9965577125549316 - }, - { - "episode": 5776, - "epoch": 0.10382140417730165, - "loss/policy_avg": 1.5252394676208496, - "lr": 9.76993865030675e-06, - "objective/entropy": -97.26277923583984, - "objective/kl": 33.4285888671875, - "objective/non_score_reward": -3.3428590297698975, - "objective/rlhf_reward": -11.890483262951733, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 8.201589584350586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4571065902709961, - "step": 360, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9999207258224487 - }, - { - "episode": 5792, - "epoch": 0.10410899809469029, - "loss/policy_avg": 1.1003179550170898, - "lr": 9.769299591002045e-06, - "objective/entropy": 51.82417297363281, - "objective/kl": 34.724029541015625, - "objective/non_score_reward": -3.472402811050415, - "objective/rlhf_reward": -12.33335193893011, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.241430282592773, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7186964750289917, - "step": 361, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989242553710938 - }, - { - "episode": 5808, - "epoch": 0.10439659201207895, - "loss/policy_avg": 0.40074190497398376, - "lr": 9.768660531697342e-06, - "objective/entropy": 219.26010131835938, - "objective/kl": 36.2478141784668, - "objective/non_score_reward": -3.6247811317443848, - "objective/rlhf_reward": -12.895005021158774, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.6230387687683105, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5367094278335571, - "step": 362, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987454414367676 - }, - { - "episode": 5824, - "epoch": 0.10468418592946759, - "loss/policy_avg": 0.9861453771591187, - "lr": 9.768021472392639e-06, - "objective/entropy": -9.609394073486328, - "objective/kl": 39.06307601928711, - "objective/non_score_reward": -3.9063076972961426, - "objective/rlhf_reward": -14.109459364207918, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 78.4552993774414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6734092235565186, - "step": 363, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968814849853516 - }, - { - "episode": 5840, - "epoch": 0.10497177984685624, - "loss/policy_avg": 0.18136531114578247, - "lr": 9.767382413087936e-06, - "objective/entropy": 78.3685073852539, - "objective/kl": 38.321044921875, - "objective/non_score_reward": -3.8321046829223633, - "objective/rlhf_reward": -13.503589506420205, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 127.91275787353516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46624571084976196, - "step": 364, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0008468627929688 - }, - { - "episode": 5856, - "epoch": 0.10525937376424489, - "loss/policy_avg": -0.3799706697463989, - "lr": 9.766743353783233e-06, - "objective/entropy": 138.2041473388672, - "objective/kl": 46.876441955566406, - "objective/non_score_reward": -4.6876444816589355, - "objective/rlhf_reward": -14.350577926635744, - "objective/scores": 1.1, - "policy/approxkl_avg": 66.94557189941406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5722706317901611, - "step": 365, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9999732971191406 - }, - { - "episode": 5872, - "epoch": 0.10554696768163353, - "loss/policy_avg": 0.034319084137678146, - "lr": 9.76610429447853e-06, - "objective/entropy": 45.21516418457031, - "objective/kl": 30.351581573486328, - "objective/non_score_reward": -3.035158157348633, - "objective/rlhf_reward": -9.21691409194586, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.7516632080078125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4482240676879883, - "step": 366, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992876052856445 - }, - { - "episode": 5888, - "epoch": 0.10583456159902219, - "loss/policy_avg": 0.8865995407104492, - "lr": 9.765465235173825e-06, - "objective/entropy": -21.33509063720703, - "objective/kl": 35.2110595703125, - "objective/non_score_reward": -3.5211057662963867, - "objective/rlhf_reward": -12.35109020868937, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 39.482017517089844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6004269123077393, - "step": 367, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9980597496032715 - }, - { - "episode": 5904, - "epoch": 0.10612215551641083, - "loss/policy_avg": 0.14120006561279297, - "lr": 9.764826175869122e-06, - "objective/entropy": 314.3269348144531, - "objective/kl": 33.36817932128906, - "objective/non_score_reward": -3.336818218231201, - "objective/rlhf_reward": -11.68541360420047, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 111.91177368164062, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7748069763183594, - "step": 368, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0023789405822754 - }, - { - "episode": 5920, - "epoch": 0.10640974943379948, - "loss/policy_avg": 1.3205476999282837, - "lr": 9.764187116564417e-06, - "objective/entropy": -41.12682342529297, - "objective/kl": 31.178136825561523, - "objective/non_score_reward": -3.1178135871887207, - "objective/rlhf_reward": -10.990302207882763, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 50.676719665527344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.9075043797492981, - "step": 369, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977303743362427 - }, - { - "episode": 5936, - "epoch": 0.10669734335118812, - "loss/policy_avg": 0.4172307848930359, - "lr": 9.763548057259714e-06, - "objective/entropy": 151.11341857910156, - "objective/kl": 29.471710205078125, - "objective/non_score_reward": -2.947171211242676, - "objective/rlhf_reward": -10.410082795707089, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 31.20602035522461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6370272636413574, - "step": 370, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985219240188599 - }, - { - "episode": 5952, - "epoch": 0.10698493726857677, - "loss/policy_avg": -0.09500053524971008, - "lr": 9.76290899795501e-06, - "objective/entropy": -34.93052673339844, - "objective/kl": 32.19451904296875, - "objective/non_score_reward": -3.219452142715454, - "objective/rlhf_reward": -11.273688945833761, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 64.82252502441406, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7023112773895264, - "step": 371, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001070499420166 - }, - { - "episode": 5968, - "epoch": 0.10727253118596541, - "loss/policy_avg": 0.6650490760803223, - "lr": 9.762269938650308e-06, - "objective/entropy": -44.10865783691406, - "objective/kl": 27.115589141845703, - "objective/non_score_reward": -2.7115590572357178, - "objective/rlhf_reward": -9.021407242092202, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 34.185760498046875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7515213489532471, - "step": 372, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9973523616790771 - }, - { - "episode": 5984, - "epoch": 0.10756012510335407, - "loss/policy_avg": 0.7072340846061707, - "lr": 9.761630879345604e-06, - "objective/entropy": 4.434268951416016, - "objective/kl": 43.21569061279297, - "objective/non_score_reward": -4.321569442749023, - "objective/rlhf_reward": -15.770505034717257, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.586810350418091, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6180188655853271, - "step": 373, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001654624938965 - }, - { - "episode": 6000, - "epoch": 0.10784771902074271, - "loss/policy_avg": 1.28859281539917, - "lr": 9.7609918200409e-06, - "objective/entropy": -139.96766662597656, - "objective/kl": 30.635095596313477, - "objective/non_score_reward": -3.063509464263916, - "objective/rlhf_reward": -7.854037737846375, - "objective/scores": 1.1, - "policy/approxkl_avg": 153.5921630859375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7665011882781982, - "step": 374, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9986308813095093 - }, - { - "episode": 6016, - "epoch": 0.10813531293813136, - "loss/policy_avg": 1.1559712886810303, - "lr": 9.760352760736196e-06, - "objective/entropy": 112.28376007080078, - "objective/kl": 48.56169891357422, - "objective/non_score_reward": -4.856169700622559, - "objective/rlhf_reward": -18.00084741850671, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 29.986862182617188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.86993408203125, - "step": 375, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9950189590454102 - }, - { - "episode": 6032, - "epoch": 0.10842290685552, - "loss/policy_avg": 0.43735095858573914, - "lr": 9.759713701431493e-06, - "objective/entropy": 161.14744567871094, - "objective/kl": 20.346540451049805, - "objective/non_score_reward": -2.034654140472412, - "objective/rlhf_reward": -6.657663944180369, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 8.951998710632324, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7470377683639526, - "step": 376, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.000926971435547 - }, - { - "episode": 6048, - "epoch": 0.10871050077290866, - "loss/policy_avg": 0.25953274965286255, - "lr": 9.75907464212679e-06, - "objective/entropy": -127.31167602539062, - "objective/kl": 32.83821105957031, - "objective/non_score_reward": -3.283820867538452, - "objective/rlhf_reward": -13.135283589363098, - "objective/scores": 0.0, - "policy/approxkl_avg": 43.502960205078125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7563947439193726, - "step": 377, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980974197387695 - }, - { - "episode": 6064, - "epoch": 0.1089980946902973, - "loss/policy_avg": 1.1847639083862305, - "lr": 9.758435582822087e-06, - "objective/entropy": 53.43251037597656, - "objective/kl": 30.13711929321289, - "objective/non_score_reward": -3.013711929321289, - "objective/rlhf_reward": -7.654847121238708, - "objective/scores": 1.1, - "policy/approxkl_avg": 24.648468017578125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5256083607673645, - "step": 378, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0001864433288574 - }, - { - "episode": 6080, - "epoch": 0.10928568860768595, - "loss/policy_avg": 0.10543081164360046, - "lr": 9.757796523517384e-06, - "objective/entropy": 216.22293090820312, - "objective/kl": 33.44567108154297, - "objective/non_score_reward": -3.344566822052002, - "objective/rlhf_reward": -11.99966583499084, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 30.81055450439453, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5716228485107422, - "step": 379, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9968583583831787 - }, - { - "episode": 6096, - "epoch": 0.10957328252507459, - "loss/policy_avg": 0.3527596592903137, - "lr": 9.75715746421268e-06, - "objective/entropy": -127.59818267822266, - "objective/kl": 31.49237632751465, - "objective/non_score_reward": -3.149237632751465, - "objective/rlhf_reward": -12.596950769424438, - "objective/scores": 0.0, - "policy/approxkl_avg": 19.017166137695312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4384676218032837, - "step": 380, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995179176330566 - }, - { - "episode": 6112, - "epoch": 0.10986087644246324, - "loss/policy_avg": 0.9311287999153137, - "lr": 9.756518404907976e-06, - "objective/entropy": 117.0103530883789, - "objective/kl": 30.302433013916016, - "objective/non_score_reward": -3.030243158340454, - "objective/rlhf_reward": -10.459113364637481, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 65.951171875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.41777661442756653, - "step": 381, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992172718048096 - }, - { - "episode": 6128, - "epoch": 0.11014847035985188, - "loss/policy_avg": 0.027314603328704834, - "lr": 9.755879345603273e-06, - "objective/entropy": 82.98536682128906, - "objective/kl": 41.457672119140625, - "objective/non_score_reward": -4.1457672119140625, - "objective/rlhf_reward": -15.241434386282592, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 5.37526273727417, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7102963924407959, - "step": 382, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000887632369995 - }, - { - "episode": 6144, - "epoch": 0.11043606427724054, - "loss/policy_avg": -0.5239760279655457, - "lr": 9.75524028629857e-06, - "objective/entropy": -19.319984436035156, - "objective/kl": 31.706575393676758, - "objective/non_score_reward": -3.1706576347351074, - "objective/rlhf_reward": -10.282630062103273, - "objective/scores": 0.6, - "policy/approxkl_avg": 33.74637222290039, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.521426796913147, - "step": 383, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0061190128326416 - }, - { - "episode": 6160, - "epoch": 0.11072365819462919, - "loss/policy_avg": 0.19491565227508545, - "lr": 9.754601226993867e-06, - "objective/entropy": 153.27801513671875, - "objective/kl": 30.898479461669922, - "objective/non_score_reward": -3.0898478031158447, - "objective/rlhf_reward": -10.84361919144028, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 24.972707748413086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6943016648292542, - "step": 384, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995365142822266 - }, - { - "episode": 6176, - "epoch": 0.11101125211201783, - "loss/policy_avg": 0.9045780897140503, - "lr": 9.753962167689162e-06, - "objective/entropy": 229.45260620117188, - "objective/kl": 45.034461975097656, - "objective/non_score_reward": -4.503446578979492, - "objective/rlhf_reward": -16.563187460513458, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.020683288574219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5668317079544067, - "step": 385, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000062942504883 - }, - { - "episode": 6192, - "epoch": 0.11129884602940648, - "loss/policy_avg": 0.33030185103416443, - "lr": 9.753323108384459e-06, - "objective/entropy": 153.65707397460938, - "objective/kl": 42.31884002685547, - "objective/non_score_reward": -4.231884002685547, - "objective/rlhf_reward": -15.371276705470635, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 176.17214965820312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6527254581451416, - "step": 386, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980921745300293 - }, - { - "episode": 6208, - "epoch": 0.11158643994679512, - "loss/policy_avg": 1.2582824230194092, - "lr": 9.752684049079756e-06, - "objective/entropy": 212.47308349609375, - "objective/kl": 41.99869918823242, - "objective/non_score_reward": -4.1998701095581055, - "objective/rlhf_reward": -14.852069209294257, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 34.943233489990234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.551336407661438, - "step": 387, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.997947335243225 - }, - { - "episode": 6224, - "epoch": 0.11187403386418378, - "loss/policy_avg": 1.361016035079956, - "lr": 9.752044989775053e-06, - "objective/entropy": -335.09619140625, - "objective/kl": 30.397010803222656, - "objective/non_score_reward": -3.039701223373413, - "objective/rlhf_reward": -10.333976383480142, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 14.473678588867188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.9271190166473389, - "step": 388, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9977775812149048 - }, - { - "episode": 6240, - "epoch": 0.11216162778157242, - "loss/policy_avg": 0.34025201201438904, - "lr": 9.751405930470348e-06, - "objective/entropy": 50.92825698852539, - "objective/kl": 39.54961013793945, - "objective/non_score_reward": -3.954960823059082, - "objective/rlhf_reward": -13.995015259059976, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 8.61404037475586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47977396845817566, - "step": 389, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9968550205230713 - }, - { - "episode": 6256, - "epoch": 0.11244922169896107, - "loss/policy_avg": 0.012692228890955448, - "lr": 9.750766871165645e-06, - "objective/entropy": -33.92766571044922, - "objective/kl": 31.518718719482422, - "objective/non_score_reward": -3.151872158050537, - "objective/rlhf_reward": -8.207488393783569, - "objective/scores": 1.1, - "policy/approxkl_avg": 84.33369445800781, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5067895650863647, - "step": 390, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9976071119308472 - }, - { - "episode": 6272, - "epoch": 0.11273681561634971, - "loss/policy_avg": 0.5984074473381042, - "lr": 9.750127811860941e-06, - "objective/entropy": -239.443359375, - "objective/kl": 31.10334014892578, - "objective/non_score_reward": -3.1103343963623047, - "objective/rlhf_reward": -10.041337525844575, - "objective/scores": 0.6, - "policy/approxkl_avg": 30.063674926757812, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7201836705207825, - "step": 391, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000786304473877 - }, - { - "episode": 6288, - "epoch": 0.11302440953373837, - "loss/policy_avg": 0.7581092715263367, - "lr": 9.749488752556238e-06, - "objective/entropy": 85.20730590820312, - "objective/kl": 40.380855560302734, - "objective/non_score_reward": -4.038085460662842, - "objective/rlhf_reward": -14.701744298549041, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.875045776367188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5445913672447205, - "step": 392, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9985957145690918 - }, - { - "episode": 6304, - "epoch": 0.113312003451127, - "loss/policy_avg": 1.7639085054397583, - "lr": 9.748849693251534e-06, - "objective/entropy": 124.08705139160156, - "objective/kl": 37.808753967285156, - "objective/non_score_reward": -3.7808759212493896, - "objective/rlhf_reward": -13.298674936565469, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.500898361206055, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4794216752052307, - "step": 393, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0014162063598633 - }, - { - "episode": 6320, - "epoch": 0.11359959736851566, - "loss/policy_avg": 0.012201100587844849, - "lr": 9.74821063394683e-06, - "objective/entropy": 200.1130828857422, - "objective/kl": 30.82569122314453, - "objective/non_score_reward": -3.082569122314453, - "objective/rlhf_reward": -7.930276489257812, - "objective/scores": 1.1, - "policy/approxkl_avg": 7.863556861877441, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5342352390289307, - "step": 394, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999612808227539 - }, - { - "episode": 6336, - "epoch": 0.1138871912859043, - "loss/policy_avg": 2.2059273719787598, - "lr": 9.747571574642127e-06, - "objective/entropy": -69.09872436523438, - "objective/kl": 40.18467330932617, - "objective/non_score_reward": -4.018467426300049, - "objective/rlhf_reward": -14.650037605960932, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 23.521875381469727, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5724920034408569, - "step": 395, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9997222423553467 - }, - { - "episode": 6352, - "epoch": 0.11417478520329295, - "loss/policy_avg": 0.4041597843170166, - "lr": 9.746932515337424e-06, - "objective/entropy": -215.51731872558594, - "objective/kl": 27.624664306640625, - "objective/non_score_reward": -2.7624664306640625, - "objective/rlhf_reward": -11.049865961074829, - "objective/scores": 0.0, - "policy/approxkl_avg": 39.29521560668945, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6042770743370056, - "step": 396, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989702701568604 - }, - { - "episode": 6368, - "epoch": 0.11446237912068159, - "loss/policy_avg": 0.4775196313858032, - "lr": 9.746293456032721e-06, - "objective/entropy": 41.82182693481445, - "objective/kl": 36.31709289550781, - "objective/non_score_reward": -3.631709575653076, - "objective/rlhf_reward": -12.702010269435952, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 44.893619537353516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.37915006279945374, - "step": 397, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001455783843994 - }, - { - "episode": 6384, - "epoch": 0.11474997303807025, - "loss/policy_avg": 0.056639641523361206, - "lr": 9.745654396728016e-06, - "objective/entropy": -153.1647186279297, - "objective/kl": 32.43135452270508, - "objective/non_score_reward": -3.243135452270508, - "objective/rlhf_reward": -11.310682540357696, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 2.9430336952209473, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7443736791610718, - "step": 398, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000112533569336 - }, - { - "episode": 6400, - "epoch": 0.11503756695545889, - "loss/policy_avg": 0.045253098011016846, - "lr": 9.745015337423313e-06, - "objective/entropy": -105.165283203125, - "objective/kl": 39.292572021484375, - "objective/non_score_reward": -3.929257392883301, - "objective/rlhf_reward": -14.236076953823925, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 31.803394317626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41279107332229614, - "step": 399, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001852512359619 - }, - { - "episode": 6416, - "epoch": 0.11532516087284754, - "loss/policy_avg": 1.3353252410888672, - "lr": 9.74437627811861e-06, - "objective/entropy": 56.36566925048828, - "objective/kl": 36.79115676879883, - "objective/non_score_reward": -3.6791152954101562, - "objective/rlhf_reward": -12.983128563563028, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 52.49983215332031, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5429282188415527, - "step": 400, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9969511032104492 - }, - { - "episode": 6432, - "epoch": 0.11561275479023618, - "loss/policy_avg": 0.19346949458122253, - "lr": 9.743737218813907e-06, - "objective/entropy": 94.13348388671875, - "objective/kl": 33.9053840637207, - "objective/non_score_reward": -3.3905386924743652, - "objective/rlhf_reward": -12.111556748957977, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 10.45969009399414, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5399774312973022, - "step": 401, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9987437725067139 - }, - { - "episode": 6448, - "epoch": 0.11590034870762483, - "loss/policy_avg": 0.14212624728679657, - "lr": 9.743098159509204e-06, - "objective/entropy": -67.64189147949219, - "objective/kl": 23.04766273498535, - "objective/non_score_reward": -2.3047664165496826, - "objective/rlhf_reward": -7.662806778159693, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 17.699844360351562, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5857589244842529, - "step": 402, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000443696975708 - }, - { - "episode": 6464, - "epoch": 0.11618794262501349, - "loss/policy_avg": 2.842088222503662, - "lr": 9.7424591002045e-06, - "objective/entropy": 104.11701965332031, - "objective/kl": 37.51358413696289, - "objective/non_score_reward": -3.7513585090637207, - "objective/rlhf_reward": -12.081715498806211, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 18.802593231201172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8919892311096191, - "step": 403, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999840497970581 - }, - { - "episode": 6480, - "epoch": 0.11647553654240213, - "loss/policy_avg": 3.926600456237793, - "lr": 9.741820040899796e-06, - "objective/entropy": -60.85142517089844, - "objective/kl": 39.3304557800293, - "objective/non_score_reward": -3.9330458641052246, - "objective/rlhf_reward": -15.732182502746582, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.211052894592285, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6057410836219788, - "step": 404, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9988353252410889 - }, - { - "episode": 6496, - "epoch": 0.11676313045979078, - "loss/policy_avg": 0.7047057747840881, - "lr": 9.741180981595093e-06, - "objective/entropy": 86.78068542480469, - "objective/kl": 32.590457916259766, - "objective/non_score_reward": -3.2590458393096924, - "objective/rlhf_reward": -10.913477124945196, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 73.14445495605469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5404595136642456, - "step": 405, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9992141723632812 - }, - { - "episode": 6512, - "epoch": 0.11705072437717942, - "loss/policy_avg": 0.7668646574020386, - "lr": 9.74054192229039e-06, - "objective/entropy": 9.115959167480469, - "objective/kl": 35.6148796081543, - "objective/non_score_reward": -3.561488389968872, - "objective/rlhf_reward": -12.123247566000494, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.980825424194336, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.684908390045166, - "step": 406, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991612434387207 - }, - { - "episode": 6528, - "epoch": 0.11733831829456808, - "loss/policy_avg": 0.901952862739563, - "lr": 9.739902862985686e-06, - "objective/entropy": 47.42900848388672, - "objective/kl": 36.136173248291016, - "objective/non_score_reward": -3.613617420196533, - "objective/rlhf_reward": -12.72113610903422, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 29.850797653198242, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5216494202613831, - "step": 407, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996336698532104 - }, - { - "episode": 6544, - "epoch": 0.11762591221195672, - "loss/policy_avg": 0.4201366901397705, - "lr": 9.739263803680983e-06, - "objective/entropy": -11.0733642578125, - "objective/kl": 35.00093078613281, - "objective/non_score_reward": -3.5000932216644287, - "objective/rlhf_reward": -9.600372886657714, - "objective/scores": 1.1, - "policy/approxkl_avg": 32.18763732910156, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5420930981636047, - "step": 408, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.997605323791504 - }, - { - "episode": 6560, - "epoch": 0.11791350612934537, - "loss/policy_avg": 1.302764892578125, - "lr": 9.73862474437628e-06, - "objective/entropy": 168.5387420654297, - "objective/kl": 26.525001525878906, - "objective/non_score_reward": -2.6525001525878906, - "objective/rlhf_reward": -9.094229185374912, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 59.64923858642578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.529288649559021, - "step": 409, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9986768960952759 - }, - { - "episode": 6576, - "epoch": 0.11820110004673401, - "loss/policy_avg": 1.0619229078292847, - "lr": 9.737985685071575e-06, - "objective/entropy": -54.82817459106445, - "objective/kl": 37.211219787597656, - "objective/non_score_reward": -3.7211220264434814, - "objective/rlhf_reward": -13.328228085246636, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 47.928985595703125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5946022272109985, - "step": 410, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001032590866089 - }, - { - "episode": 6592, - "epoch": 0.11848869396412266, - "loss/policy_avg": 0.4641076922416687, - "lr": 9.737346625766872e-06, - "objective/entropy": 80.71646881103516, - "objective/kl": 35.40373992919922, - "objective/non_score_reward": -3.5403738021850586, - "objective/rlhf_reward": -12.680542829449536, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 4.019253730773926, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5870873928070068, - "step": 411, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9998197555541992 - }, - { - "episode": 6608, - "epoch": 0.1187762878815113, - "loss/policy_avg": 0.3565133213996887, - "lr": 9.736707566462167e-06, - "objective/entropy": 122.7892074584961, - "objective/kl": 39.498130798339844, - "objective/non_score_reward": -3.9498136043548584, - "objective/rlhf_reward": -13.67654818512586, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 63.53807830810547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.581372857093811, - "step": 412, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0015203952789307 - }, - { - "episode": 6624, - "epoch": 0.11906388179889996, - "loss/policy_avg": 0.14506877958774567, - "lr": 9.736068507157464e-06, - "objective/entropy": 193.5592041015625, - "objective/kl": 30.521562576293945, - "objective/non_score_reward": -3.052156448364258, - "objective/rlhf_reward": -10.475291983286539, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 22.271638870239258, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7713235020637512, - "step": 413, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9975152015686035 - }, - { - "episode": 6640, - "epoch": 0.1193514757162886, - "loss/policy_avg": 0.9468994736671448, - "lr": 9.735429447852761e-06, - "objective/entropy": 148.8424835205078, - "objective/kl": 37.5145378112793, - "objective/non_score_reward": -3.7514538764953613, - "objective/rlhf_reward": -15.005815267562866, - "objective/scores": 0.0, - "policy/approxkl_avg": 9.498788833618164, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.38916516304016113, - "step": 414, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000758171081543 - }, - { - "episode": 6656, - "epoch": 0.11963906963367725, - "loss/policy_avg": 0.7254658937454224, - "lr": 9.734790388548058e-06, - "objective/entropy": 56.421714782714844, - "objective/kl": 33.228389739990234, - "objective/non_score_reward": -3.32283878326416, - "objective/rlhf_reward": -11.46652638462455, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 17.776447296142578, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6019710302352905, - "step": 415, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001137256622314 - }, - { - "episode": 6672, - "epoch": 0.11992666355106589, - "loss/policy_avg": 0.12397602200508118, - "lr": 9.734151329243355e-06, - "objective/entropy": -148.2471466064453, - "objective/kl": 25.882095336914062, - "objective/non_score_reward": -2.588209629058838, - "objective/rlhf_reward": -8.974236705390316, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.8484023809432983, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4417728781700134, - "step": 416, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.002113103866577 - }, - { - "episode": 6688, - "epoch": 0.12021425746845454, - "loss/policy_avg": -0.03540700674057007, - "lr": 9.73351226993865e-06, - "objective/entropy": -65.22505187988281, - "objective/kl": 25.781585693359375, - "objective/non_score_reward": -2.5781586170196533, - "objective/rlhf_reward": -10.312634468078613, - "objective/scores": 0.0, - "policy/approxkl_avg": 0.9484915733337402, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.44973552227020264, - "step": 417, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.003369092941284 - }, - { - "episode": 6704, - "epoch": 0.12050185138584318, - "loss/policy_avg": 2.237513303756714, - "lr": 9.732873210633947e-06, - "objective/entropy": 72.41790008544922, - "objective/kl": 41.708648681640625, - "objective/non_score_reward": -4.170865058898926, - "objective/rlhf_reward": -13.759741698147032, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 6.452242851257324, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.46763697266578674, - "step": 418, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999596357345581 - }, - { - "episode": 6720, - "epoch": 0.12078944530323184, - "loss/policy_avg": -0.033215656876564026, - "lr": 9.732234151329244e-06, - "objective/entropy": 116.18624877929688, - "objective/kl": 41.70143508911133, - "objective/non_score_reward": -4.170144081115723, - "objective/rlhf_reward": -15.018715386808502, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 0.950503408908844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5579153299331665, - "step": 419, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0016934871673584 - }, - { - "episode": 6736, - "epoch": 0.12107703922062048, - "loss/policy_avg": 0.5230793952941895, - "lr": 9.73159509202454e-06, - "objective/entropy": 87.67442321777344, - "objective/kl": 42.121944427490234, - "objective/non_score_reward": -4.212194442749023, - "objective/rlhf_reward": -15.523264799147768, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 44.811546325683594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5887485146522522, - "step": 420, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9986464977264404 - }, - { - "episode": 6752, - "epoch": 0.12136463313800913, - "loss/policy_avg": 0.09617140889167786, - "lr": 9.730956032719838e-06, - "objective/entropy": 197.31307983398438, - "objective/kl": 41.32299041748047, - "objective/non_score_reward": -4.132298469543457, - "objective/rlhf_reward": -15.129195547103883, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.07308292388916, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5032777786254883, - "step": 421, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003557205200195 - }, - { - "episode": 6768, - "epoch": 0.12165222705539779, - "loss/policy_avg": 0.0820683017373085, - "lr": 9.730316973415135e-06, - "objective/entropy": 90.92608642578125, - "objective/kl": 33.22870635986328, - "objective/non_score_reward": -3.3228707313537598, - "objective/rlhf_reward": -11.735222904887749, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.888459205627441, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6309401392936707, - "step": 422, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998487949371338 - }, - { - "episode": 6784, - "epoch": 0.12193982097278643, - "loss/policy_avg": 0.13335853815078735, - "lr": 9.72967791411043e-06, - "objective/entropy": 58.8111686706543, - "objective/kl": 17.325424194335938, - "objective/non_score_reward": -1.7325425148010254, - "objective/rlhf_reward": -5.196836725870767, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.40964412689209, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4101444184780121, - "step": 423, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999779462814331 - }, - { - "episode": 6800, - "epoch": 0.12222741489017508, - "loss/policy_avg": 1.1839892864227295, - "lr": 9.729038854805727e-06, - "objective/entropy": 278.55230712890625, - "objective/kl": 36.13326644897461, - "objective/non_score_reward": -3.6133267879486084, - "objective/rlhf_reward": -12.7199738184611, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 37.6474609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6956678628921509, - "step": 424, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.999556303024292 - }, - { - "episode": 6816, - "epoch": 0.12251500880756372, - "loss/policy_avg": 0.5160382390022278, - "lr": 9.728399795501023e-06, - "objective/entropy": -4.561044692993164, - "objective/kl": 48.20618438720703, - "objective/non_score_reward": -4.820618152618408, - "objective/rlhf_reward": -17.801520469601513, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 29.267677307128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6663553714752197, - "step": 425, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9961378574371338 - }, - { - "episode": 6832, - "epoch": 0.12280260272495237, - "loss/policy_avg": -0.027832061052322388, - "lr": 9.72776073619632e-06, - "objective/entropy": -14.169868469238281, - "objective/kl": 28.816591262817383, - "objective/non_score_reward": -2.8816590309143066, - "objective/rlhf_reward": -10.126636123657228, - "objective/scores": 0.35, - "policy/approxkl_avg": 10.623421669006348, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42182457447052, - "step": 426, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0000364780426025 - }, - { - "episode": 6848, - "epoch": 0.12309019664234101, - "loss/policy_avg": 0.9478355050086975, - "lr": 9.727121676891617e-06, - "objective/entropy": -48.67333221435547, - "objective/kl": 22.937318801879883, - "objective/non_score_reward": -2.293731927871704, - "objective/rlhf_reward": -7.227516422944005, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.106391191482544, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6140183210372925, - "step": 427, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0030503273010254 - }, - { - "episode": 6864, - "epoch": 0.12337779055972967, - "loss/policy_avg": 0.6610305309295654, - "lr": 9.726482617586912e-06, - "objective/entropy": 80.99835968017578, - "objective/kl": 39.61425018310547, - "objective/non_score_reward": -3.961425304412842, - "objective/rlhf_reward": -14.520187649756593, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.697940349578857, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5119843482971191, - "step": 428, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000978946685791 - }, - { - "episode": 6880, - "epoch": 0.1236653844771183, - "loss/policy_avg": 0.11895343661308289, - "lr": 9.72584355828221e-06, - "objective/entropy": 162.822021484375, - "objective/kl": 44.34868621826172, - "objective/non_score_reward": -4.434868812561035, - "objective/rlhf_reward": -16.339474773406984, - "objective/scores": 0.35, - "policy/approxkl_avg": 4.151267051696777, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5969315767288208, - "step": 429, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984312057495117 - }, - { - "episode": 6896, - "epoch": 0.12395297839450696, - "loss/policy_avg": 0.5579686164855957, - "lr": 9.725204498977506e-06, - "objective/entropy": -17.16387367248535, - "objective/kl": 37.852745056152344, - "objective/non_score_reward": -3.7852747440338135, - "objective/rlhf_reward": -10.741099214553834, - "objective/scores": 1.1, - "policy/approxkl_avg": 41.654693603515625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4921834468841553, - "step": 430, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9981064796447754 - }, - { - "episode": 6912, - "epoch": 0.1242405723118956, - "loss/policy_avg": 0.15593896806240082, - "lr": 9.724565439672803e-06, - "objective/entropy": 149.734130859375, - "objective/kl": 25.60231590270996, - "objective/non_score_reward": -2.5602316856384277, - "objective/rlhf_reward": -8.416097994121621, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 30.770790100097656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4205164313316345, - "step": 431, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.001136541366577 - }, - { - "episode": 6928, - "epoch": 0.12452816622928425, - "loss/policy_avg": 0.7011826038360596, - "lr": 9.7239263803681e-06, - "objective/entropy": 55.692283630371094, - "objective/kl": 43.931175231933594, - "objective/non_score_reward": -4.393117904663086, - "objective/rlhf_reward": -15.83913828531901, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 20.95511245727539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3865165710449219, - "step": 432, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988900423049927 - }, - { - "episode": 6944, - "epoch": 0.1248157601466729, - "loss/policy_avg": 1.022209882736206, - "lr": 9.723287321063397e-06, - "objective/entropy": 83.84861755371094, - "objective/kl": 42.09056854248047, - "objective/non_score_reward": -4.209057331085205, - "objective/rlhf_reward": -15.011400456699441, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 58.270423889160156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6480612754821777, - "step": 433, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980558156967163 - }, - { - "episode": 6960, - "epoch": 0.12510335406406153, - "loss/policy_avg": 0.5657510757446289, - "lr": 9.722648261758692e-06, - "objective/entropy": 115.53985595703125, - "objective/kl": 33.222572326660156, - "objective/non_score_reward": -3.3222572803497314, - "objective/rlhf_reward": -11.684908661905844, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 22.127004623413086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8776997923851013, - "step": 434, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9980456829071045 - }, - { - "episode": 6976, - "epoch": 0.1253909479814502, - "loss/policy_avg": 0.861635684967041, - "lr": 9.722009202453989e-06, - "objective/entropy": 191.2237548828125, - "objective/kl": 33.726585388183594, - "objective/non_score_reward": -3.3726587295532227, - "objective/rlhf_reward": -11.757301584879556, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 66.25660705566406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5842768549919128, - "step": 435, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0004849433898926 - }, - { - "episode": 6992, - "epoch": 0.12567854189883884, - "loss/policy_avg": 0.30258873105049133, - "lr": 9.721370143149284e-06, - "objective/entropy": 179.46835327148438, - "objective/kl": 39.91570281982422, - "objective/non_score_reward": -3.9915707111358643, - "objective/rlhf_reward": -14.362163100306113, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 8.522405624389648, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.45521217584609985, - "step": 436, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9980738162994385 - }, - { - "episode": 7008, - "epoch": 0.12596613581622748, - "loss/policy_avg": 0.9346391558647156, - "lr": 9.720731083844581e-06, - "objective/entropy": 37.353126525878906, - "objective/kl": 43.99368667602539, - "objective/non_score_reward": -4.3993682861328125, - "objective/rlhf_reward": -16.11652124207771, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.334951877593994, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7622473835945129, - "step": 437, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0002927780151367 - }, - { - "episode": 7024, - "epoch": 0.12625372973361612, - "loss/policy_avg": 1.3644543886184692, - "lr": 9.720092024539878e-06, - "objective/entropy": -80.11536407470703, - "objective/kl": 26.775297164916992, - "objective/non_score_reward": -2.677529811859131, - "objective/rlhf_reward": -9.33151695975433, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 41.969295501708984, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.37151363492012024, - "step": 438, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9997823238372803 - }, - { - "episode": 7040, - "epoch": 0.1265413236510048, - "loss/policy_avg": -0.03804589435458183, - "lr": 9.719452965235175e-06, - "objective/entropy": -68.57923889160156, - "objective/kl": 41.42705535888672, - "objective/non_score_reward": -4.14270544052124, - "objective/rlhf_reward": -15.170821285247804, - "objective/scores": 0.35, - "policy/approxkl_avg": 0.610215425491333, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.3907659649848938, - "step": 439, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002786636352539 - }, - { - "episode": 7056, - "epoch": 0.12682891756839343, - "loss/policy_avg": 0.9513897895812988, - "lr": 9.718813905930472e-06, - "objective/entropy": 176.7696533203125, - "objective/kl": 32.5645751953125, - "objective/non_score_reward": -3.2564573287963867, - "objective/rlhf_reward": -11.666579568122309, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 84.38311004638672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.43554389476776123, - "step": 440, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9961769580841064 - }, - { - "episode": 7072, - "epoch": 0.12711651148578207, - "loss/policy_avg": 1.6144888401031494, - "lr": 9.718174846625767e-06, - "objective/entropy": -14.703704833984375, - "objective/kl": 21.40297508239746, - "objective/non_score_reward": -2.1402974128723145, - "objective/rlhf_reward": -6.1611901283264165, - "objective/scores": 0.6, - "policy/approxkl_avg": 90.28463745117188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6536741852760315, - "step": 441, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9973065853118896 - }, - { - "episode": 7088, - "epoch": 0.12740410540317074, - "loss/policy_avg": -0.564086377620697, - "lr": 9.717535787321064e-06, - "objective/entropy": -92.54092407226562, - "objective/kl": 27.47213363647461, - "objective/non_score_reward": -2.747213363647461, - "objective/rlhf_reward": -9.432593672481135, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 24.76102066040039, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.4654249846935272, - "step": 442, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999948263168335 - }, - { - "episode": 7104, - "epoch": 0.12769169932055938, - "loss/policy_avg": 0.7788177728652954, - "lr": 9.71689672801636e-06, - "objective/entropy": -28.373756408691406, - "objective/kl": 35.91747283935547, - "objective/non_score_reward": -3.591747283935547, - "objective/rlhf_reward": -13.025353243857055, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 27.097518920898438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.38184916973114014, - "step": 443, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997880220413208 - }, - { - "episode": 7120, - "epoch": 0.12797929323794802, - "loss/policy_avg": 0.3843851685523987, - "lr": 9.716257668711657e-06, - "objective/entropy": 140.32058715820312, - "objective/kl": 37.66426467895508, - "objective/non_score_reward": -3.7664265632629395, - "objective/rlhf_reward": -13.332372681299844, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 19.241600036621094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.349905788898468, - "step": 444, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0018460750579834 - }, - { - "episode": 7136, - "epoch": 0.12826688715533666, - "loss/policy_avg": 0.540947437286377, - "lr": 9.715618609406954e-06, - "objective/entropy": 148.06629943847656, - "objective/kl": 42.55817413330078, - "objective/non_score_reward": -4.25581693649292, - "objective/rlhf_reward": -15.66401835653631, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 75.46281433105469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.541397213935852, - "step": 445, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983032941818237 - }, - { - "episode": 7152, - "epoch": 0.12855448107272532, - "loss/policy_avg": 1.184004306793213, - "lr": 9.714979550102251e-06, - "objective/entropy": 84.38250732421875, - "objective/kl": 33.90479278564453, - "objective/non_score_reward": -3.390479564666748, - "objective/rlhf_reward": -12.220281770735411, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 51.5472412109375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5456966161727905, - "step": 446, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997706413269043 - }, - { - "episode": 7168, - "epoch": 0.12884207499011396, - "loss/policy_avg": 1.1816997528076172, - "lr": 9.714340490797546e-06, - "objective/entropy": -57.552371978759766, - "objective/kl": 30.747276306152344, - "objective/non_score_reward": -3.074728012084961, - "objective/rlhf_reward": -7.898911571502686, - "objective/scores": 1.1, - "policy/approxkl_avg": 35.125282287597656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.30223560333251953, - "step": 447, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9982578754425049 - }, - { - "episode": 7184, - "epoch": 0.1291296689075026, - "loss/policy_avg": 0.3517414927482605, - "lr": 9.713701431492843e-06, - "objective/entropy": 172.75254821777344, - "objective/kl": 32.79669189453125, - "objective/non_score_reward": -3.2796695232391357, - "objective/rlhf_reward": -11.456818585813629, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 27.51565170288086, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4375608265399933, - "step": 448, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9987891912460327 - }, - { - "episode": 7200, - "epoch": 0.12941726282489124, - "loss/policy_avg": 0.3732157051563263, - "lr": 9.71306237218814e-06, - "objective/entropy": 71.94863891601562, - "objective/kl": 37.338172912597656, - "objective/non_score_reward": -3.7338175773620605, - "objective/rlhf_reward": -12.535269832611085, - "objective/scores": 0.6, - "policy/approxkl_avg": 85.17916870117188, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.43261417746543884, - "step": 449, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.99893057346344 - }, - { - "episode": 7216, - "epoch": 0.1297048567422799, - "loss/policy_avg": 1.4543174505233765, - "lr": 9.712423312883437e-06, - "objective/entropy": 258.2192687988281, - "objective/kl": 49.02899169921875, - "objective/non_score_reward": -4.902898788452148, - "objective/rlhf_reward": -18.286083135634584, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 21.199018478393555, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6569823026657104, - "step": 450, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.996167778968811 - }, - { - "episode": 7232, - "epoch": 0.12999245065966855, - "loss/policy_avg": 0.49340057373046875, - "lr": 9.711784253578734e-06, - "objective/entropy": -43.6832160949707, - "objective/kl": 33.31085968017578, - "objective/non_score_reward": -3.3310861587524414, - "objective/rlhf_reward": -11.87374721011673, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 28.588882446289062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6205878257751465, - "step": 451, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999979019165039 - }, - { - "episode": 7248, - "epoch": 0.1302800445770572, - "loss/policy_avg": 0.6000991463661194, - "lr": 9.711145194274029e-06, - "objective/entropy": 90.76286315917969, - "objective/kl": 45.31011962890625, - "objective/non_score_reward": -4.531011581420898, - "objective/rlhf_reward": -16.745444872466425, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 113.64555358886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5617672204971313, - "step": 452, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9989123344421387 - }, - { - "episode": 7264, - "epoch": 0.13056763849444583, - "loss/policy_avg": 1.042801022529602, - "lr": 9.710506134969326e-06, - "objective/entropy": 189.15316772460938, - "objective/kl": 36.73876953125, - "objective/non_score_reward": -3.673877477645874, - "objective/rlhf_reward": -13.295509910583498, - "objective/scores": 0.35, - "policy/approxkl_avg": 21.024385452270508, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4428805708885193, - "step": 453, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9950382709503174 - }, - { - "episode": 7280, - "epoch": 0.1308552324118345, - "loss/policy_avg": 0.6862419843673706, - "lr": 9.709867075664623e-06, - "objective/entropy": -38.022178649902344, - "objective/kl": 48.085838317871094, - "objective/non_score_reward": -4.808583736419678, - "objective/rlhf_reward": -17.71856340149277, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 89.67850494384766, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5650781393051147, - "step": 454, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9975779056549072 - }, - { - "episode": 7296, - "epoch": 0.13114282632922314, - "loss/policy_avg": -0.21822161972522736, - "lr": 9.70922801635992e-06, - "objective/entropy": 24.07161521911621, - "objective/kl": 39.552284240722656, - "objective/non_score_reward": -3.955228328704834, - "objective/rlhf_reward": -14.339960935528637, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 4.527206897735596, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.32756006717681885, - "step": 455, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0025227069854736 - }, - { - "episode": 7312, - "epoch": 0.13143042024661178, - "loss/policy_avg": 0.050335630774497986, - "lr": 9.708588957055215e-06, - "objective/entropy": 47.709957122802734, - "objective/kl": 34.94654083251953, - "objective/non_score_reward": -3.4946541786193848, - "objective/rlhf_reward": -11.054897938610289, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 20.525684356689453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.40393322706222534, - "step": 456, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999319314956665 - }, - { - "episode": 7328, - "epoch": 0.13171801416400042, - "loss/policy_avg": -0.2107769250869751, - "lr": 9.707949897750512e-06, - "objective/entropy": 4.4464111328125, - "objective/kl": 30.348583221435547, - "objective/non_score_reward": -3.034858226776123, - "objective/rlhf_reward": -10.760830977050167, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.349994659423828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4865412712097168, - "step": 457, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.025125503540039 - }, - { - "episode": 7344, - "epoch": 0.1320056080813891, - "loss/policy_avg": 0.9970263242721558, - "lr": 9.707310838445809e-06, - "objective/entropy": 197.11566162109375, - "objective/kl": 38.80963897705078, - "objective/non_score_reward": -3.8809640407562256, - "objective/rlhf_reward": -14.008084261211092, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 64.41117858886719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.47768014669418335, - "step": 458, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9979461431503296 - }, - { - "episode": 7360, - "epoch": 0.13229320199877773, - "loss/policy_avg": 0.19499164819717407, - "lr": 9.706671779141105e-06, - "objective/entropy": 109.55068969726562, - "objective/kl": 34.07399368286133, - "objective/non_score_reward": -3.4073991775512695, - "objective/rlhf_reward": -13.629597425460815, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.495365142822266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.565830647945404, - "step": 459, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999122977256775 - }, - { - "episode": 7376, - "epoch": 0.13258079591616637, - "loss/policy_avg": 0.5532440543174744, - "lr": 9.7060327198364e-06, - "objective/entropy": 83.35699462890625, - "objective/kl": 32.4083251953125, - "objective/non_score_reward": -3.240832567214966, - "objective/rlhf_reward": -11.359210524622519, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.45071268081665, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4963003993034363, - "step": 460, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.011878490447998 - }, - { - "episode": 7392, - "epoch": 0.13286838983355503, - "loss/policy_avg": 0.552447497844696, - "lr": 9.705393660531698e-06, - "objective/entropy": 160.72750854492188, - "objective/kl": 47.16038131713867, - "objective/non_score_reward": -4.716038227081299, - "objective/rlhf_reward": -17.440320809085932, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 79.7755126953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3849433958530426, - "step": 461, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9991737604141235 - }, - { - "episode": 7408, - "epoch": 0.13315598375094367, - "loss/policy_avg": 1.8305895328521729, - "lr": 9.704754601226994e-06, - "objective/entropy": 148.47381591796875, - "objective/kl": 32.803104400634766, - "objective/non_score_reward": -3.280310869216919, - "objective/rlhf_reward": -11.779607584982543, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 4.440328121185303, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.34545716643333435, - "step": 462, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0012433528900146 - }, - { - "episode": 7424, - "epoch": 0.1334435776683323, - "loss/policy_avg": -0.21606217324733734, - "lr": 9.704115541922291e-06, - "objective/entropy": -211.98297119140625, - "objective/kl": 22.90569305419922, - "objective/non_score_reward": -2.2905690670013428, - "objective/rlhf_reward": -7.337447400363992, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.513453483581543, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6393148899078369, - "step": 463, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0019214153289795 - }, - { - "episode": 7440, - "epoch": 0.13373117158572095, - "loss/policy_avg": 1.7707817554473877, - "lr": 9.703476482617588e-06, - "objective/entropy": -42.62212371826172, - "objective/kl": 38.86042022705078, - "objective/non_score_reward": -3.8860418796539307, - "objective/rlhf_reward": -13.987908928599907, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.585241317749023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5668050050735474, - "step": 464, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999806880950928 - }, - { - "episode": 7456, - "epoch": 0.13401876550310962, - "loss/policy_avg": 0.5606961250305176, - "lr": 9.702837423312883e-06, - "objective/entropy": 46.48912811279297, - "objective/kl": 41.47301483154297, - "objective/non_score_reward": -4.14730167388916, - "objective/rlhf_reward": -14.18920729160309, - "objective/scores": 0.6, - "policy/approxkl_avg": 15.323009490966797, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4189653694629669, - "step": 465, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.002035617828369 - }, - { - "episode": 7472, - "epoch": 0.13430635942049826, - "loss/policy_avg": 0.3116866946220398, - "lr": 9.70219836400818e-06, - "objective/entropy": 202.82122802734375, - "objective/kl": 30.228025436401367, - "objective/non_score_reward": -3.0228028297424316, - "objective/rlhf_reward": -10.266382093700479, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.352916955947876, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5167281627655029, - "step": 466, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.0003514289855957 - }, - { - "episode": 7488, - "epoch": 0.1345939533378869, - "loss/policy_avg": 0.9980499148368835, - "lr": 9.701559304703477e-06, - "objective/entropy": 127.01738739013672, - "objective/kl": 39.83085632324219, - "objective/non_score_reward": -3.9830856323242188, - "objective/rlhf_reward": -14.45138919633186, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 25.166885375976562, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.736939549446106, - "step": 467, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994404315948486 - }, - { - "episode": 7504, - "epoch": 0.13488154725527554, - "loss/policy_avg": 0.21544580161571503, - "lr": 9.700920245398774e-06, - "objective/entropy": 233.09375, - "objective/kl": 32.72058868408203, - "objective/non_score_reward": -3.272059202194214, - "objective/rlhf_reward": -11.72898670408575, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 78.07327270507812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.62492835521698, - "step": 468, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9992430210113525 - }, - { - "episode": 7520, - "epoch": 0.1351691411726642, - "loss/policy_avg": 0.4316645860671997, - "lr": 9.700281186094071e-06, - "objective/entropy": -37.32112121582031, - "objective/kl": 29.643779754638672, - "objective/non_score_reward": -2.9643778800964355, - "objective/rlhf_reward": -10.406913261027679, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 80.32553100585938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8007365465164185, - "step": 469, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981448650360107 - }, - { - "episode": 7536, - "epoch": 0.13545673509005285, - "loss/policy_avg": -0.22065140306949615, - "lr": 9.699642126789368e-06, - "objective/entropy": -288.51220703125, - "objective/kl": 31.09638023376465, - "objective/non_score_reward": -3.109638214111328, - "objective/rlhf_reward": -10.882293074336602, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 15.306625366210938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6064466238021851, - "step": 470, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000288248062134 - }, - { - "episode": 7552, - "epoch": 0.1357443290074415, - "loss/policy_avg": 0.7062017917633057, - "lr": 9.699003067484663e-06, - "objective/entropy": -185.96678161621094, - "objective/kl": 38.07769012451172, - "objective/non_score_reward": -3.8077688217163086, - "objective/rlhf_reward": -12.307355795742247, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 21.195262908935547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6327893137931824, - "step": 471, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9975873231887817 - }, - { - "episode": 7568, - "epoch": 0.13603192292483013, - "loss/policy_avg": 0.21052365005016327, - "lr": 9.69836400817996e-06, - "objective/entropy": -114.1561050415039, - "objective/kl": 38.60865020751953, - "objective/non_score_reward": -3.8608651161193848, - "objective/rlhf_reward": -15.443459749221802, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.808056831359863, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.587588906288147, - "step": 472, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999895453453064 - }, - { - "episode": 7584, - "epoch": 0.1363195168422188, - "loss/policy_avg": 0.9177588224411011, - "lr": 9.697724948875257e-06, - "objective/entropy": 91.9778823852539, - "objective/kl": 49.228004455566406, - "objective/non_score_reward": -4.9228010177612305, - "objective/rlhf_reward": -18.34956817915979, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 252.69491577148438, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5273959636688232, - "step": 473, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998390555381775 - }, - { - "episode": 7600, - "epoch": 0.13660711075960744, - "loss/policy_avg": 0.2135259062051773, - "lr": 9.697085889570554e-06, - "objective/entropy": -91.76605224609375, - "objective/kl": 20.413612365722656, - "objective/non_score_reward": -2.0413613319396973, - "objective/rlhf_reward": -6.741612751682368, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.6130738258361816, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4268151521682739, - "step": 474, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9995267391204834 - }, - { - "episode": 7616, - "epoch": 0.13689470467699608, - "loss/policy_avg": 0.7761150598526001, - "lr": 9.69644683026585e-06, - "objective/entropy": 25.679851531982422, - "objective/kl": 40.76634979248047, - "objective/non_score_reward": -4.076634883880615, - "objective/rlhf_reward": -14.927937605468134, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 14.822543144226074, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7041196823120117, - "step": 475, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9984660148620605 - }, - { - "episode": 7632, - "epoch": 0.13718229859438472, - "loss/policy_avg": 1.497192144393921, - "lr": 9.695807770961146e-06, - "objective/entropy": -73.21554565429688, - "objective/kl": 31.698223114013672, - "objective/non_score_reward": -3.1698226928710938, - "objective/rlhf_reward": -10.731878827290473, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.129430770874023, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6742293834686279, - "step": 476, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998528242111206 - }, - { - "episode": 7648, - "epoch": 0.13746989251177338, - "loss/policy_avg": 0.7623737454414368, - "lr": 9.695168711656443e-06, - "objective/entropy": -212.29415893554688, - "objective/kl": 26.89659881591797, - "objective/non_score_reward": -2.689659833908081, - "objective/rlhf_reward": -9.433126482993288, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 40.945072174072266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6739322543144226, - "step": 477, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999362587928772 - }, - { - "episode": 7664, - "epoch": 0.13775748642916202, - "loss/policy_avg": 0.8399478793144226, - "lr": 9.694529652351738e-06, - "objective/entropy": -28.784271240234375, - "objective/kl": 46.6888542175293, - "objective/non_score_reward": -4.668885707855225, - "objective/rlhf_reward": -17.31629320356695, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 33.967262268066406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7294750213623047, - "step": 478, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998927116394043 - }, - { - "episode": 7680, - "epoch": 0.13804508034655066, - "loss/policy_avg": 0.48977339267730713, - "lr": 9.693890593047035e-06, - "objective/entropy": -103.34806823730469, - "objective/kl": 35.017757415771484, - "objective/non_score_reward": -3.5017752647399902, - "objective/rlhf_reward": -12.583269674976435, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 104.5495376586914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6413424015045166, - "step": 479, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9937057495117188 - }, - { - "episode": 7696, - "epoch": 0.13833267426393933, - "loss/policy_avg": 0.20745977759361267, - "lr": 9.693251533742331e-06, - "objective/entropy": 57.130958557128906, - "objective/kl": 41.31460189819336, - "objective/non_score_reward": -4.131460189819336, - "objective/rlhf_reward": -15.010069215091402, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 30.427654266357422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5963334441184998, - "step": 480, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0048885345458984 - }, - { - "episode": 7712, - "epoch": 0.13862026818132797, - "loss/policy_avg": 0.15290355682373047, - "lr": 9.692612474437628e-06, - "objective/entropy": 106.57427978515625, - "objective/kl": 41.791648864746094, - "objective/non_score_reward": -4.179165363311768, - "objective/rlhf_reward": -13.792941962124083, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 30.64380645751953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.521045446395874, - "step": 481, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998927354812622 - }, - { - "episode": 7728, - "epoch": 0.1389078620987166, - "loss/policy_avg": 0.7474868893623352, - "lr": 9.691973415132925e-06, - "objective/entropy": 228.7914581298828, - "objective/kl": 35.901405334472656, - "objective/non_score_reward": -3.5901405811309814, - "objective/rlhf_reward": -11.960562801361085, - "objective/scores": 0.6, - "policy/approxkl_avg": 88.05641174316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5807632207870483, - "step": 482, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998561143875122 - }, - { - "episode": 7744, - "epoch": 0.13919545601610525, - "loss/policy_avg": 1.6704270839691162, - "lr": 9.691334355828222e-06, - "objective/entropy": -124.35450744628906, - "objective/kl": 36.7768440246582, - "objective/non_score_reward": -3.6776845455169678, - "objective/rlhf_reward": -13.286906321247187, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 20.388381958007812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6395858526229858, - "step": 483, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9977163076400757 - }, - { - "episode": 7760, - "epoch": 0.13948304993349392, - "loss/policy_avg": 0.41683727502822876, - "lr": 9.690695296523517e-06, - "objective/entropy": 82.72738647460938, - "objective/kl": 38.18916702270508, - "objective/non_score_reward": -3.8189167976379395, - "objective/rlhf_reward": -13.851835568149653, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 7.188452243804932, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5991804599761963, - "step": 484, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0014636516571045 - }, - { - "episode": 7776, - "epoch": 0.13977064385088256, - "loss/policy_avg": 0.9677872657775879, - "lr": 9.690056237218814e-06, - "objective/entropy": 33.29289627075195, - "objective/kl": 28.069137573242188, - "objective/non_score_reward": -2.8069138526916504, - "objective/rlhf_reward": -9.868405782912655, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.909322738647461, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3673190474510193, - "step": 485, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988304376602173 - }, - { - "episode": 7792, - "epoch": 0.1400582377682712, - "loss/policy_avg": 1.4412565231323242, - "lr": 9.689417177914111e-06, - "objective/entropy": 143.29071044921875, - "objective/kl": 37.918006896972656, - "objective/non_score_reward": -3.7918009757995605, - "objective/rlhf_reward": -13.71660516700302, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 25.657358169555664, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4064646065235138, - "step": 486, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9968105554580688 - }, - { - "episode": 7808, - "epoch": 0.14034583168565984, - "loss/policy_avg": 0.5567857027053833, - "lr": 9.688778118609408e-06, - "objective/entropy": 108.03604125976562, - "objective/kl": 25.368505477905273, - "objective/non_score_reward": -2.5368504524230957, - "objective/rlhf_reward": -8.485543017805206, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 7.421592712402344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5672407746315002, - "step": 487, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002861499786377 - }, - { - "episode": 7824, - "epoch": 0.1406334256030485, - "loss/policy_avg": 0.1908845454454422, - "lr": 9.688139059304705e-06, - "objective/entropy": 155.91831970214844, - "objective/kl": 27.6815128326416, - "objective/non_score_reward": -2.7681517601013184, - "objective/rlhf_reward": -6.672606325149536, - "objective/scores": 1.1, - "policy/approxkl_avg": 10.237553596496582, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4108530282974243, - "step": 488, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9994752407073975 - }, - { - "episode": 7840, - "epoch": 0.14092101952043715, - "loss/policy_avg": 0.9416247606277466, - "lr": 9.6875e-06, - "objective/entropy": -103.95333862304688, - "objective/kl": 41.44330978393555, - "objective/non_score_reward": -4.144330978393555, - "objective/rlhf_reward": -14.177324867248537, - "objective/scores": 0.6, - "policy/approxkl_avg": 14.313966751098633, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.464949369430542, - "step": 489, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988256692886353 - }, - { - "episode": 7856, - "epoch": 0.1412086134378258, - "loss/policy_avg": 2.2338528633117676, - "lr": 9.686860940695297e-06, - "objective/entropy": 47.52754211425781, - "objective/kl": 42.061561584472656, - "objective/non_score_reward": -4.206155776977539, - "objective/rlhf_reward": -15.49911001685254, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 55.551963806152344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7129836082458496, - "step": 490, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9957895278930664 - }, - { - "episode": 7872, - "epoch": 0.14149620735521443, - "loss/policy_avg": 0.20792043209075928, - "lr": 9.686221881390594e-06, - "objective/entropy": -1.685638427734375, - "objective/kl": 38.568145751953125, - "objective/non_score_reward": -3.8568148612976074, - "objective/rlhf_reward": -14.10174730780713, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.467138290405273, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4478898048400879, - "step": 491, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003159046173096 - }, - { - "episode": 7888, - "epoch": 0.1417838012726031, - "loss/policy_avg": -0.18662777543067932, - "lr": 9.68558282208589e-06, - "objective/entropy": -26.272117614746094, - "objective/kl": 42.39691925048828, - "objective/non_score_reward": -4.239691734313965, - "objective/rlhf_reward": -15.633254084616823, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 55.918922424316406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5446948409080505, - "step": 492, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999772548675537 - }, - { - "episode": 7904, - "epoch": 0.14207139518999173, - "loss/policy_avg": 0.31151118874549866, - "lr": 9.684943762781188e-06, - "objective/entropy": 33.51483154296875, - "objective/kl": 41.72319030761719, - "objective/non_score_reward": -4.172318935394287, - "objective/rlhf_reward": -15.265443642337885, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 126.68960571289062, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.847740650177002, - "step": 493, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983421564102173 - }, - { - "episode": 7920, - "epoch": 0.14235898910738037, - "loss/policy_avg": 0.7131699323654175, - "lr": 9.684304703476484e-06, - "objective/entropy": -26.66382598876953, - "objective/kl": 45.098487854003906, - "objective/non_score_reward": -4.5098490715026855, - "objective/rlhf_reward": -16.58879766902481, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 113.07894897460938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4858088493347168, - "step": 494, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9995558261871338 - }, - { - "episode": 7936, - "epoch": 0.142646583024769, - "loss/policy_avg": 0.7710833549499512, - "lr": 9.68366564417178e-06, - "objective/entropy": 83.98237609863281, - "objective/kl": 33.111812591552734, - "objective/non_score_reward": -3.3111815452575684, - "objective/rlhf_reward": -11.763773563320994, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 51.01200866699219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5449756979942322, - "step": 495, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9964442253112793 - }, - { - "episode": 7952, - "epoch": 0.14293417694215768, - "loss/policy_avg": 0.6315375566482544, - "lr": 9.683026584867076e-06, - "objective/entropy": -325.8221435546875, - "objective/kl": 24.298229217529297, - "objective/non_score_reward": -2.4298229217529297, - "objective/rlhf_reward": -8.360041939948482, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.398147702217102, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5080133676528931, - "step": 496, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000394344329834 - }, - { - "episode": 7968, - "epoch": 0.14322177085954632, - "loss/policy_avg": 0.2566729485988617, - "lr": 9.682387525562373e-06, - "objective/entropy": -97.19512939453125, - "objective/kl": 27.388530731201172, - "objective/non_score_reward": -2.7388532161712646, - "objective/rlhf_reward": -9.596163117621822, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 83.0306167602539, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6368144154548645, - "step": 497, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983546733856201 - }, - { - "episode": 7984, - "epoch": 0.14350936477693496, - "loss/policy_avg": 2.3038244247436523, - "lr": 9.68174846625767e-06, - "objective/entropy": -312.97418212890625, - "objective/kl": 39.9110221862793, - "objective/non_score_reward": -3.9911022186279297, - "objective/rlhf_reward": -15.96440851688385, - "objective/scores": 0.0, - "policy/approxkl_avg": 229.93643188476562, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5881428718566895, - "step": 498, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984025955200195 - }, - { - "episode": 8000, - "epoch": 0.14379695869432363, - "loss/policy_avg": 0.4790765643119812, - "lr": 9.681109406952967e-06, - "objective/entropy": 102.07373809814453, - "objective/kl": 39.629451751708984, - "objective/non_score_reward": -3.9629452228546143, - "objective/rlhf_reward": -14.189921622694122, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 38.121917724609375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5411556959152222, - "step": 499, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9958930015563965 - }, - { - "episode": 8016, - "epoch": 0.14408455261171227, - "loss/policy_avg": 1.2157926559448242, - "lr": 9.680470347648262e-06, - "objective/entropy": 91.86766052246094, - "objective/kl": 41.75320053100586, - "objective/non_score_reward": -4.175320148468018, - "objective/rlhf_reward": -16.701281309127808, - "objective/scores": 0.0, - "policy/approxkl_avg": 203.22079467773438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7408411502838135, - "step": 500, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977760314941406 - }, - { - "episode": 8032, - "epoch": 0.1443721465291009, - "loss/policy_avg": 0.07564640045166016, - "lr": 9.67983128834356e-06, - "objective/entropy": -72.9471664428711, - "objective/kl": 29.065643310546875, - "objective/non_score_reward": -2.906564474105835, - "objective/rlhf_reward": -10.247655727950434, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 7.510244369506836, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.49747195839881897, - "step": 501, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999770164489746 - }, - { - "episode": 8048, - "epoch": 0.14465974044648955, - "loss/policy_avg": 0.7079442739486694, - "lr": 9.679192229038854e-06, - "objective/entropy": 44.22209930419922, - "objective/kl": 47.81110382080078, - "objective/non_score_reward": -4.781109809875488, - "objective/rlhf_reward": -17.462581282079803, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 23.670162200927734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42216792702674866, - "step": 502, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999302864074707 - }, - { - "episode": 8064, - "epoch": 0.14494733436387822, - "loss/policy_avg": 0.17914190888404846, - "lr": 9.678553169734151e-06, - "objective/entropy": -84.96662902832031, - "objective/kl": 34.51586151123047, - "objective/non_score_reward": -3.4515867233276367, - "objective/rlhf_reward": -12.427744128791193, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 106.49320983886719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8383775353431702, - "step": 503, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998382806777954 - }, - { - "episode": 8080, - "epoch": 0.14523492828126686, - "loss/policy_avg": 1.3371942043304443, - "lr": 9.677914110429448e-06, - "objective/entropy": 2.8826751708984375, - "objective/kl": 46.65193557739258, - "objective/non_score_reward": -4.665193557739258, - "objective/rlhf_reward": -17.05665400988253, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 12.464231491088867, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7261908054351807, - "step": 504, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986940622329712 - }, - { - "episode": 8096, - "epoch": 0.1455225221986555, - "loss/policy_avg": 0.5748533010482788, - "lr": 9.677275051124745e-06, - "objective/entropy": 66.49700927734375, - "objective/kl": 40.18706130981445, - "objective/non_score_reward": -4.018706321716309, - "objective/rlhf_reward": -14.412966256559478, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 14.041053771972656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7998212575912476, - "step": 505, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987947940826416 - }, - { - "episode": 8112, - "epoch": 0.14581011611604414, - "loss/policy_avg": 0.04638584703207016, - "lr": 9.676635991820042e-06, - "objective/entropy": -63.98976516723633, - "objective/kl": 29.01580810546875, - "objective/non_score_reward": -2.901580810546875, - "objective/rlhf_reward": -10.050064413753107, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.6628360748291016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5268914103507996, - "step": 506, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.000178098678589 - }, - { - "episode": 8128, - "epoch": 0.1460977100334328, - "loss/policy_avg": 1.0773940086364746, - "lr": 9.675996932515339e-06, - "objective/entropy": -80.96644592285156, - "objective/kl": 32.92605972290039, - "objective/non_score_reward": -3.2926058769226074, - "objective/rlhf_reward": -11.811174714301508, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 15.708122253417969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.707777738571167, - "step": 507, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9975059032440186 - }, - { - "episode": 8144, - "epoch": 0.14638530395082144, - "loss/policy_avg": 0.8820767998695374, - "lr": 9.675357873210634e-06, - "objective/entropy": 20.50244140625, - "objective/kl": 37.39576721191406, - "objective/non_score_reward": -3.739576578140259, - "objective/rlhf_reward": -13.01089508362287, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 92.53317260742188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7722728252410889, - "step": 508, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979910850524902 - }, - { - "episode": 8160, - "epoch": 0.14667289786821008, - "loss/policy_avg": 0.33342352509498596, - "lr": 9.67471881390593e-06, - "objective/entropy": -130.01919555664062, - "objective/kl": 37.01002502441406, - "objective/non_score_reward": -3.7010021209716797, - "objective/rlhf_reward": -13.142149215162384, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 111.35385131835938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7142958641052246, - "step": 509, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9980767965316772 - }, - { - "episode": 8176, - "epoch": 0.14696049178559872, - "loss/policy_avg": 2.0854592323303223, - "lr": 9.674079754601228e-06, - "objective/entropy": -153.78465270996094, - "objective/kl": 37.76445770263672, - "objective/non_score_reward": -3.7764456272125244, - "objective/rlhf_reward": -15.105782985687256, - "objective/scores": 0.0, - "policy/approxkl_avg": 13.81374740600586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4845733046531677, - "step": 510, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999340295791626 - }, - { - "episode": 8192, - "epoch": 0.1472480857029874, - "loss/policy_avg": 0.4099072813987732, - "lr": 9.673440695296525e-06, - "objective/entropy": -178.3352508544922, - "objective/kl": 27.485443115234375, - "objective/non_score_reward": -2.748544216156006, - "objective/rlhf_reward": -10.994177103042603, - "objective/scores": 0.0, - "policy/approxkl_avg": 12.48928451538086, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6313323974609375, - "step": 511, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982385635375977 - }, - { - "episode": 8208, - "epoch": 0.14753567962037603, - "loss/policy_avg": -0.27953195571899414, - "lr": 9.672801635991821e-06, - "objective/entropy": -270.947509765625, - "objective/kl": 38.093963623046875, - "objective/non_score_reward": -3.809396743774414, - "objective/rlhf_reward": -13.633467230860312, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.603658676147461, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6720960736274719, - "step": 512, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000967502593994 - }, - { - "episode": 8224, - "epoch": 0.14782327353776467, - "loss/policy_avg": 1.2482883930206299, - "lr": 9.672162576687117e-06, - "objective/entropy": 129.20474243164062, - "objective/kl": 45.26194763183594, - "objective/non_score_reward": -4.526195049285889, - "objective/rlhf_reward": -16.371446386973062, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 25.947460174560547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5968309044837952, - "step": 513, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9988489151000977 - }, - { - "episode": 8240, - "epoch": 0.1481108674551533, - "loss/policy_avg": -0.1279393583536148, - "lr": 9.671523517382413e-06, - "objective/entropy": -51.24613952636719, - "objective/kl": 45.18963623046875, - "objective/non_score_reward": -4.518963813781738, - "objective/rlhf_reward": -16.56008394935959, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.309969902038574, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.7829691171646118, - "step": 514, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001248598098755 - }, - { - "episode": 8256, - "epoch": 0.14839846137254198, - "loss/policy_avg": 1.3355847597122192, - "lr": 9.67088445807771e-06, - "objective/entropy": -83.05213165283203, - "objective/kl": 52.17377853393555, - "objective/non_score_reward": -5.21737813949585, - "objective/rlhf_reward": -17.94579282844183, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 16.227012634277344, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.46618372201919556, - "step": 515, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9993078708648682 - }, - { - "episode": 8272, - "epoch": 0.14868605528993062, - "loss/policy_avg": -0.5786250829696655, - "lr": 9.670245398773007e-06, - "objective/entropy": 113.66342163085938, - "objective/kl": 33.45789337158203, - "objective/non_score_reward": -3.3457894325256348, - "objective/rlhf_reward": -10.459439073444578, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 25.492090225219727, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 1.0281870365142822, - "step": 516, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.002322196960449 - }, - { - "episode": 8288, - "epoch": 0.14897364920731926, - "loss/policy_avg": 1.3281817436218262, - "lr": 9.669606339468304e-06, - "objective/entropy": -242.1662139892578, - "objective/kl": 37.54686737060547, - "objective/non_score_reward": -3.7546873092651367, - "objective/rlhf_reward": -12.896042885557684, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 13.032073020935059, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6886230707168579, - "step": 517, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9998258352279663 - }, - { - "episode": 8304, - "epoch": 0.1492612431247079, - "loss/policy_avg": 0.027422528713941574, - "lr": 9.668967280163601e-06, - "objective/entropy": -86.30763244628906, - "objective/kl": 30.702964782714844, - "objective/non_score_reward": -3.0702965259552, - "objective/rlhf_reward": -10.76541432121628, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 38.08885955810547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6287646293640137, - "step": 518, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001487731933594 - }, - { - "episode": 8320, - "epoch": 0.14954883704209657, - "loss/policy_avg": 0.026706572622060776, - "lr": 9.668328220858896e-06, - "objective/entropy": -210.8118438720703, - "objective/kl": 35.477561950683594, - "objective/non_score_reward": -3.5477566719055176, - "objective/rlhf_reward": -12.831776821349544, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 16.58705711364746, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8783824443817139, - "step": 519, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.997406244277954 - }, - { - "episode": 8336, - "epoch": 0.1498364309594852, - "loss/policy_avg": -0.08051559329032898, - "lr": 9.667689161554193e-06, - "objective/entropy": -74.4656982421875, - "objective/kl": 35.04960250854492, - "objective/non_score_reward": -3.504960298538208, - "objective/rlhf_reward": -12.41572085386904, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 63.21892547607422, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5668095350265503, - "step": 520, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998744010925293 - }, - { - "episode": 8352, - "epoch": 0.15012402487687385, - "loss/policy_avg": 3.5908355712890625, - "lr": 9.66705010224949e-06, - "objective/entropy": -9.926849365234375, - "objective/kl": 33.93809509277344, - "objective/non_score_reward": -3.3938088417053223, - "objective/rlhf_reward": -12.175236260890962, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.082281112670898, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.665743350982666, - "step": 521, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0017683506011963 - }, - { - "episode": 8368, - "epoch": 0.1504116187942625, - "loss/policy_avg": -0.30025702714920044, - "lr": 9.666411042944787e-06, - "objective/entropy": -11.379417419433594, - "objective/kl": 32.27046585083008, - "objective/non_score_reward": -3.227046489715576, - "objective/rlhf_reward": -11.392414653094944, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 32.16754913330078, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.808684229850769, - "step": 522, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0002644062042236 - }, - { - "episode": 8384, - "epoch": 0.15069921271165115, - "loss/policy_avg": 0.14779390394687653, - "lr": 9.665771983640082e-06, - "objective/entropy": -218.98468017578125, - "objective/kl": 32.48244857788086, - "objective/non_score_reward": -3.2482452392578125, - "objective/rlhf_reward": -11.667467508345766, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 5.038860321044922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7193934917449951, - "step": 523, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9989609718322754 - }, - { - "episode": 8400, - "epoch": 0.1509868066290398, - "loss/policy_avg": 0.4430120587348938, - "lr": 9.665132924335379e-06, - "objective/entropy": -117.5274887084961, - "objective/kl": 34.405799865722656, - "objective/non_score_reward": -3.440580129623413, - "objective/rlhf_reward": -12.158200535837727, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 42.504093170166016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5020079612731934, - "step": 524, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977314472198486 - }, - { - "episode": 8416, - "epoch": 0.15127440054642843, - "loss/policy_avg": 0.2200871706008911, - "lr": 9.664493865030676e-06, - "objective/entropy": -109.22439575195312, - "objective/kl": 27.03481674194336, - "objective/non_score_reward": -2.703481674194336, - "objective/rlhf_reward": -9.152067666471588, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 67.23435974121094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7329505681991577, - "step": 525, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9962201118469238 - }, - { - "episode": 8432, - "epoch": 0.1515619944638171, - "loss/policy_avg": 0.7378959655761719, - "lr": 9.663854805725971e-06, - "objective/entropy": 51.754112243652344, - "objective/kl": 52.813716888427734, - "objective/non_score_reward": -5.281371593475342, - "objective/rlhf_reward": -19.644534471447827, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 35.635292053222656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.509136438369751, - "step": 526, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999948501586914 - }, - { - "episode": 8448, - "epoch": 0.15184958838120574, - "loss/policy_avg": 0.12148091197013855, - "lr": 9.663215746421268e-06, - "objective/entropy": -194.1416015625, - "objective/kl": 25.8511962890625, - "objective/non_score_reward": -2.5851194858551025, - "objective/rlhf_reward": -8.889879803271636, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 57.73289489746094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5093759894371033, - "step": 527, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996519088745117 - }, - { - "episode": 8464, - "epoch": 0.15213718229859438, - "loss/policy_avg": 0.5260336399078369, - "lr": 9.662576687116565e-06, - "objective/entropy": -28.452239990234375, - "objective/kl": 47.280189514160156, - "objective/non_score_reward": -4.728018760681152, - "objective/rlhf_reward": -17.307955060068686, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.9857563972473145, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6503279209136963, - "step": 528, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982560873031616 - }, - { - "episode": 8480, - "epoch": 0.15242477621598302, - "loss/policy_avg": 0.35683369636535645, - "lr": 9.661937627811862e-06, - "objective/entropy": 73.58621978759766, - "objective/kl": 32.123538970947266, - "objective/non_score_reward": -3.2123541831970215, - "objective/rlhf_reward": -8.449416255950927, - "objective/scores": 1.1, - "policy/approxkl_avg": 18.77471923828125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4997715950012207, - "step": 529, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.996805191040039 - }, - { - "episode": 8496, - "epoch": 0.1527123701333717, - "loss/policy_avg": 0.5127010345458984, - "lr": 9.661298568507158e-06, - "objective/entropy": -205.9234161376953, - "objective/kl": 33.17967987060547, - "objective/non_score_reward": -3.3179678916931152, - "objective/rlhf_reward": -11.149165334478887, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 9.201175689697266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6817153692245483, - "step": 530, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999847173690796 - }, - { - "episode": 8512, - "epoch": 0.15299996405076033, - "loss/policy_avg": 2.6809909343719482, - "lr": 9.660659509202455e-06, - "objective/entropy": -134.624267578125, - "objective/kl": 42.431121826171875, - "objective/non_score_reward": -4.243112564086914, - "objective/rlhf_reward": -15.49149716180122, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 7.862943649291992, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7814310789108276, - "step": 531, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0017952919006348 - }, - { - "episode": 8528, - "epoch": 0.15328755796814897, - "loss/policy_avg": 1.801469326019287, - "lr": 9.66002044989775e-06, - "objective/entropy": -132.70196533203125, - "objective/kl": 41.52480697631836, - "objective/non_score_reward": -4.152480602264404, - "objective/rlhf_reward": -15.186090548236933, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 20.436891555786133, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5815787315368652, - "step": 532, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0002007484436035 - }, - { - "episode": 8544, - "epoch": 0.1535751518855376, - "loss/policy_avg": -0.09221082925796509, - "lr": 9.659381390593047e-06, - "objective/entropy": -122.66864776611328, - "objective/kl": 32.93227005004883, - "objective/non_score_reward": -3.293226957321167, - "objective/rlhf_reward": -11.439574495951334, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 4.255980014801025, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7247042655944824, - "step": 533, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997018575668335 - }, - { - "episode": 8560, - "epoch": 0.15386274580292628, - "loss/policy_avg": 2.4887824058532715, - "lr": 9.658742331288344e-06, - "objective/entropy": -99.11781311035156, - "objective/kl": 40.88715362548828, - "objective/non_score_reward": -4.088715553283691, - "objective/rlhf_reward": -16.354861974716187, - "objective/scores": 0.0, - "policy/approxkl_avg": 15.820581436157227, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.699001133441925, - "step": 534, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.996523141860962 - }, - { - "episode": 8576, - "epoch": 0.15415033972031492, - "loss/policy_avg": -0.13506712019443512, - "lr": 9.658103271983641e-06, - "objective/entropy": -89.46176147460938, - "objective/kl": 39.85636901855469, - "objective/non_score_reward": -3.9856371879577637, - "objective/rlhf_reward": -14.280688291013824, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.0638363361358643, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5437813401222229, - "step": 535, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0016019344329834 - }, - { - "episode": 8592, - "epoch": 0.15443793363770356, - "loss/policy_avg": 0.1690143346786499, - "lr": 9.657464212678938e-06, - "objective/entropy": -69.37506866455078, - "objective/kl": 33.3219108581543, - "objective/non_score_reward": -3.332190990447998, - "objective/rlhf_reward": -11.666904454649078, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.42812442779541, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7411162853240967, - "step": 536, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0052967071533203 - }, - { - "episode": 8608, - "epoch": 0.1547255275550922, - "loss/policy_avg": 0.6035354733467102, - "lr": 9.656825153374235e-06, - "objective/entropy": -19.672401428222656, - "objective/kl": 43.020362854003906, - "objective/non_score_reward": -4.302036285400391, - "objective/rlhf_reward": -15.651886313167168, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 42.07685089111328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7375407814979553, - "step": 537, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986612796783447 - }, - { - "episode": 8624, - "epoch": 0.15501312147248086, - "loss/policy_avg": 0.617863655090332, - "lr": 9.65618609406953e-06, - "objective/entropy": -96.45539093017578, - "objective/kl": 43.908935546875, - "objective/non_score_reward": -4.390893936157227, - "objective/rlhf_reward": -17.56357455253601, - "objective/scores": 0.0, - "policy/approxkl_avg": 41.87989807128906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6939387917518616, - "step": 538, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990483522415161 - }, - { - "episode": 8640, - "epoch": 0.1553007153898695, - "loss/policy_avg": 1.1206727027893066, - "lr": 9.655547034764827e-06, - "objective/entropy": -212.7060089111328, - "objective/kl": 30.902385711669922, - "objective/non_score_reward": -3.0902388095855713, - "objective/rlhf_reward": -11.019319584875731, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 32.364540100097656, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.4254102110862732, - "step": 539, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9980194568634033 - }, - { - "episode": 8656, - "epoch": 0.15558830930725814, - "loss/policy_avg": 0.10584121942520142, - "lr": 9.654907975460124e-06, - "objective/entropy": -244.4935760498047, - "objective/kl": 35.60231018066406, - "objective/non_score_reward": -3.5602309703826904, - "objective/rlhf_reward": -14.24092435836792, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.080685615539551, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5694947838783264, - "step": 540, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.001322031021118 - }, - { - "episode": 8672, - "epoch": 0.1558759032246468, - "loss/policy_avg": 0.6853758096694946, - "lr": 9.65426891615542e-06, - "objective/entropy": 200.61570739746094, - "objective/kl": 53.1297492980957, - "objective/non_score_reward": -5.31297492980957, - "objective/rlhf_reward": -18.85190019607544, - "objective/scores": 0.6, - "policy/approxkl_avg": 14.122016906738281, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.846699595451355, - "step": 541, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9972319602966309 - }, - { - "episode": 8688, - "epoch": 0.15616349714203545, - "loss/policy_avg": 0.8006021976470947, - "lr": 9.653629856850718e-06, - "objective/entropy": -177.16116333007812, - "objective/kl": 48.53944396972656, - "objective/non_score_reward": -4.853944301605225, - "objective/rlhf_reward": -17.965178589434966, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 88.10623931884766, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5958341360092163, - "step": 542, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.996194839477539 - }, - { - "episode": 8704, - "epoch": 0.1564510910594241, - "loss/policy_avg": 0.7566841244697571, - "lr": 9.652990797546013e-06, - "objective/entropy": -168.28700256347656, - "objective/kl": 38.52307891845703, - "objective/non_score_reward": -3.8523077964782715, - "objective/rlhf_reward": -14.067595055609374, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 21.15276336669922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8955333232879639, - "step": 543, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0001535415649414 - }, - { - "episode": 8720, - "epoch": 0.15673868497681273, - "loss/policy_avg": 2.5178232192993164, - "lr": 9.65235173824131e-06, - "objective/entropy": -204.9830322265625, - "objective/kl": 42.33374786376953, - "objective/non_score_reward": -4.23337459564209, - "objective/rlhf_reward": -15.55489716776977, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 63.29674530029297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7277956008911133, - "step": 544, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9969232082366943 - }, - { - "episode": 8736, - "epoch": 0.1570262788942014, - "loss/policy_avg": 0.6960093975067139, - "lr": 9.651712678936605e-06, - "objective/entropy": 95.6426773071289, - "objective/kl": 32.65497970581055, - "objective/non_score_reward": -3.265498161315918, - "objective/rlhf_reward": -11.736479315787477, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 17.43046760559082, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4536857604980469, - "step": 545, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972350597381592 - }, - { - "episode": 8752, - "epoch": 0.15731387281159004, - "loss/policy_avg": 0.9652435779571533, - "lr": 9.651073619631902e-06, - "objective/entropy": 20.44597625732422, - "objective/kl": 33.02486038208008, - "objective/non_score_reward": -3.302485942840576, - "objective/rlhf_reward": -11.0872378966966, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 30.350900650024414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7735705375671387, - "step": 546, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9980368614196777 - }, - { - "episode": 8768, - "epoch": 0.15760146672897868, - "loss/policy_avg": 0.7415152788162231, - "lr": 9.650434560327199e-06, - "objective/entropy": -252.13894653320312, - "objective/kl": 33.90022659301758, - "objective/non_score_reward": -3.3900227546691895, - "objective/rlhf_reward": -11.898231034696686, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 6.943023681640625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5897954702377319, - "step": 547, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.003216028213501 - }, - { - "episode": 8784, - "epoch": 0.15788906064636732, - "loss/policy_avg": 0.5711463093757629, - "lr": 9.649795501022496e-06, - "objective/entropy": -185.48312377929688, - "objective/kl": 37.01996994018555, - "objective/non_score_reward": -3.7019972801208496, - "objective/rlhf_reward": -12.407988643646242, - "objective/scores": 0.6, - "policy/approxkl_avg": 4.724537372589111, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.563432514667511, - "step": 548, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000291585922241 - }, - { - "episode": 8800, - "epoch": 0.158176654563756, - "loss/policy_avg": 1.1151976585388184, - "lr": 9.649156441717792e-06, - "objective/entropy": -300.8952331542969, - "objective/kl": 50.2194938659668, - "objective/non_score_reward": -5.021949768066406, - "objective/rlhf_reward": -18.531539766994072, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 10.345390319824219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5680312514305115, - "step": 549, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.998915672302246 - }, - { - "episode": 8816, - "epoch": 0.15846424848114463, - "loss/policy_avg": -0.044495925307273865, - "lr": 9.64851738241309e-06, - "objective/entropy": -122.7219009399414, - "objective/kl": 33.53822708129883, - "objective/non_score_reward": -3.353822708129883, - "objective/rlhf_reward": -11.681957976023355, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 2.664604902267456, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.675072193145752, - "step": 550, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000653028488159 - }, - { - "episode": 8832, - "epoch": 0.15875184239853327, - "loss/policy_avg": 0.5436700582504272, - "lr": 9.647878323108384e-06, - "objective/entropy": -333.17449951171875, - "objective/kl": 26.372095108032227, - "objective/non_score_reward": -2.637209892272949, - "objective/rlhf_reward": -8.94471893078478, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 19.02151870727539, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7037907838821411, - "step": 551, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9981114864349365 - }, - { - "episode": 8848, - "epoch": 0.1590394363159219, - "loss/policy_avg": 3.402425765991211, - "lr": 9.647239263803681e-06, - "objective/entropy": -13.591423034667969, - "objective/kl": 46.72789764404297, - "objective/non_score_reward": -4.672789573669434, - "objective/rlhf_reward": -17.240561823459014, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 26.33546257019043, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6139748096466064, - "step": 552, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998695731163025 - }, - { - "episode": 8864, - "epoch": 0.15932703023331057, - "loss/policy_avg": 1.2331594228744507, - "lr": 9.646600204498978e-06, - "objective/entropy": 47.64061737060547, - "objective/kl": 45.18553924560547, - "objective/non_score_reward": -4.518553733825684, - "objective/rlhf_reward": -15.951509179846319, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 38.655860900878906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5798001289367676, - "step": 553, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9974098205566406 - }, - { - "episode": 8880, - "epoch": 0.1596146241506992, - "loss/policy_avg": 0.5167936086654663, - "lr": 9.645961145194275e-06, - "objective/entropy": -60.59259796142578, - "objective/kl": 34.74770736694336, - "objective/non_score_reward": -3.474771022796631, - "objective/rlhf_reward": -12.074255581173013, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 62.432594299316406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5199726819992065, - "step": 554, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9998353719711304 - }, - { - "episode": 8896, - "epoch": 0.15990221806808785, - "loss/policy_avg": 0.5368022322654724, - "lr": 9.645322085889572e-06, - "objective/entropy": -187.44839477539062, - "objective/kl": 44.21339416503906, - "objective/non_score_reward": -4.42133903503418, - "objective/rlhf_reward": -16.023497348249542, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 24.31802749633789, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7579926252365112, - "step": 555, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.997237205505371 - }, - { - "episode": 8912, - "epoch": 0.1601898119854765, - "loss/policy_avg": 0.281308650970459, - "lr": 9.644683026584867e-06, - "objective/entropy": -127.6090087890625, - "objective/kl": 37.9559440612793, - "objective/non_score_reward": -3.7955944538116455, - "objective/rlhf_reward": -13.666606032642061, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 24.68885040283203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5633901357650757, - "step": 556, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994604587554932 - }, - { - "episode": 8928, - "epoch": 0.16047740590286516, - "loss/policy_avg": 0.20386452972888947, - "lr": 9.644043967280164e-06, - "objective/entropy": -198.5346221923828, - "objective/kl": 23.749149322509766, - "objective/non_score_reward": -2.37491512298584, - "objective/rlhf_reward": -8.140410625670834, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 4.18134069442749, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5204647183418274, - "step": 557, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984912872314453 - }, - { - "episode": 8944, - "epoch": 0.1607649998202538, - "loss/policy_avg": 0.07800257205963135, - "lr": 9.643404907975461e-06, - "objective/entropy": -6.492671966552734, - "objective/kl": 38.0936164855957, - "objective/non_score_reward": -3.8093619346618652, - "objective/rlhf_reward": -13.721675479205782, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 34.346961975097656, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6237818002700806, - "step": 558, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9995702505111694 - }, - { - "episode": 8960, - "epoch": 0.16105259373764244, - "loss/policy_avg": 0.3377554416656494, - "lr": 9.642765848670758e-06, - "objective/entropy": -133.30020141601562, - "objective/kl": 43.600772857666016, - "objective/non_score_reward": -4.360077857971191, - "objective/rlhf_reward": -16.11479810240857, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 41.798095703125, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6249016523361206, - "step": 559, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988707304000854 - }, - { - "episode": 8976, - "epoch": 0.1613401876550311, - "loss/policy_avg": 0.9884511232376099, - "lr": 9.642126789366055e-06, - "objective/entropy": -234.08079528808594, - "objective/kl": 37.608299255371094, - "objective/non_score_reward": -3.7608296871185303, - "objective/rlhf_reward": -10.643318510055543, - "objective/scores": 1.1, - "policy/approxkl_avg": 17.653762817382812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7640604376792908, - "step": 560, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996306896209717 - }, - { - "episode": 8992, - "epoch": 0.16162778157241975, - "loss/policy_avg": 0.6431108713150024, - "lr": 9.641487730061352e-06, - "objective/entropy": -74.99330139160156, - "objective/kl": 41.57349395751953, - "objective/non_score_reward": -4.157349109649658, - "objective/rlhf_reward": -15.303883824378175, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 38.90899658203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47073087096214294, - "step": 561, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9977537393569946 - }, - { - "episode": 9008, - "epoch": 0.1619153754898084, - "loss/policy_avg": 0.022986948490142822, - "lr": 9.640848670756647e-06, - "objective/entropy": 68.02236938476562, - "objective/kl": 48.71097946166992, - "objective/non_score_reward": -4.871097564697266, - "objective/rlhf_reward": -15.084390974044801, - "objective/scores": 1.1, - "policy/approxkl_avg": 33.34661102294922, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.692690372467041, - "step": 562, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001621723175049 - }, - { - "episode": 9024, - "epoch": 0.16220296940719703, - "loss/policy_avg": 0.07986500859260559, - "lr": 9.640209611451944e-06, - "objective/entropy": -252.08282470703125, - "objective/kl": 31.946533203125, - "objective/non_score_reward": -3.1946535110473633, - "objective/rlhf_reward": -10.831202576832709, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 7.246588706970215, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6767693758010864, - "step": 563, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9986155033111572 - }, - { - "episode": 9040, - "epoch": 0.1624905633245857, - "loss/policy_avg": 1.3177964687347412, - "lr": 9.63957055214724e-06, - "objective/entropy": -93.1424331665039, - "objective/kl": 48.07649612426758, - "objective/non_score_reward": -4.807649612426758, - "objective/rlhf_reward": -17.283187220768866, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 11.541034698486328, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.561279296875, - "step": 564, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9975336790084839 - }, - { - "episode": 9056, - "epoch": 0.16277815724197434, - "loss/policy_avg": 0.01985163800418377, - "lr": 9.638931492842537e-06, - "objective/entropy": -291.87255859375, - "objective/kl": 36.759742736816406, - "objective/non_score_reward": -3.675974130630493, - "objective/rlhf_reward": -13.325294115630488, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.6155858039855957, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6654636263847351, - "step": 565, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000267505645752 - }, - { - "episode": 9072, - "epoch": 0.16306575115936298, - "loss/policy_avg": 0.2043217122554779, - "lr": 9.638292433537834e-06, - "objective/entropy": -73.8929443359375, - "objective/kl": 51.33345413208008, - "objective/non_score_reward": -5.133345603942871, - "objective/rlhf_reward": -19.109550078113642, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 33.83651351928711, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7135969400405884, - "step": 566, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998769998550415 - }, - { - "episode": 9088, - "epoch": 0.16335334507675162, - "loss/policy_avg": 0.5555611848831177, - "lr": 9.63765337423313e-06, - "objective/entropy": -161.65342712402344, - "objective/kl": 41.112884521484375, - "objective/non_score_reward": -4.111289024353027, - "objective/rlhf_reward": -14.711821810404459, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 42.619407653808594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5503981709480286, - "step": 567, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0006415843963623 - }, - { - "episode": 9104, - "epoch": 0.16364093899414028, - "loss/policy_avg": 0.822566032409668, - "lr": 9.637014314928426e-06, - "objective/entropy": -121.46438598632812, - "objective/kl": 38.44181823730469, - "objective/non_score_reward": -3.844181776046753, - "objective/rlhf_reward": -13.551898355754922, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 86.43548583984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5650614500045776, - "step": 568, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9973005056381226 - }, - { - "episode": 9120, - "epoch": 0.16392853291152892, - "loss/policy_avg": 0.9177229404449463, - "lr": 9.636375255623721e-06, - "objective/entropy": -71.64779663085938, - "objective/kl": 53.37040710449219, - "objective/non_score_reward": -5.337040901184082, - "objective/rlhf_reward": -21.348163604736328, - "objective/scores": 0.0, - "policy/approxkl_avg": 16.75054359436035, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7622511386871338, - "step": 569, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9974229335784912 - }, - { - "episode": 9136, - "epoch": 0.16421612682891756, - "loss/policy_avg": 2.550429582595825, - "lr": 9.635736196319018e-06, - "objective/entropy": -198.0861053466797, - "objective/kl": 37.597801208496094, - "objective/non_score_reward": -3.7597804069519043, - "objective/rlhf_reward": -13.305787817637125, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 17.44908905029297, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7178118228912354, - "step": 570, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9977492094039917 - }, - { - "episode": 9152, - "epoch": 0.1645037207463062, - "loss/policy_avg": 1.1576682329177856, - "lr": 9.635097137014315e-06, - "objective/entropy": -94.52823638916016, - "objective/kl": 41.15838623046875, - "objective/non_score_reward": -4.115838050842285, - "objective/rlhf_reward": -12.063353157043458, - "objective/scores": 1.1, - "policy/approxkl_avg": 34.44091033935547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6733859181404114, - "step": 571, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9955500364303589 - }, - { - "episode": 9168, - "epoch": 0.16479131466369487, - "loss/policy_avg": 0.7017364501953125, - "lr": 9.634458077709612e-06, - "objective/entropy": -58.18111801147461, - "objective/kl": 47.94506072998047, - "objective/non_score_reward": -4.794506549835205, - "objective/rlhf_reward": -17.818776333068293, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 5.405066013336182, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6459875106811523, - "step": 572, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999786615371704 - }, - { - "episode": 9184, - "epoch": 0.1650789085810835, - "loss/policy_avg": 0.6844867467880249, - "lr": 9.633819018404909e-06, - "objective/entropy": -182.14816284179688, - "objective/kl": 37.610591888427734, - "objective/non_score_reward": -3.761059284210205, - "objective/rlhf_reward": -13.593638758273467, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 45.46365737915039, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5423248410224915, - "step": 573, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0002753734588623 - }, - { - "episode": 9200, - "epoch": 0.16536650249847215, - "loss/policy_avg": 0.8669987916946411, - "lr": 9.633179959100206e-06, - "objective/entropy": -232.71591186523438, - "objective/kl": 40.21477127075195, - "objective/non_score_reward": -4.021476745605469, - "objective/rlhf_reward": -14.424048905790436, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 35.74150085449219, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5931271314620972, - "step": 574, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9967412948608398 - }, - { - "episode": 9216, - "epoch": 0.1656540964158608, - "loss/policy_avg": 1.4781056642532349, - "lr": 9.632540899795501e-06, - "objective/entropy": -273.8671569824219, - "objective/kl": 34.78312301635742, - "objective/non_score_reward": -3.4783124923706055, - "objective/rlhf_reward": -12.489417989452448, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 10.15843391418457, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6542942523956299, - "step": 575, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.997775673866272 - }, - { - "episode": 9232, - "epoch": 0.16594169033324946, - "loss/policy_avg": 0.36914390325546265, - "lr": 9.631901840490798e-06, - "objective/entropy": -327.1934814453125, - "objective/kl": 27.51502227783203, - "objective/non_score_reward": -2.75150203704834, - "objective/rlhf_reward": -9.606008923053743, - "objective/scores": 0.35, - "policy/approxkl_avg": 28.219341278076172, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6692686080932617, - "step": 576, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0001039505004883 - }, - { - "episode": 9248, - "epoch": 0.1662292842506381, - "loss/policy_avg": 0.13604627549648285, - "lr": 9.631262781186095e-06, - "objective/entropy": -145.65203857421875, - "objective/kl": 41.72386169433594, - "objective/non_score_reward": -4.172386169433594, - "objective/rlhf_reward": -15.347908547430663, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 14.616079330444336, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7129019498825073, - "step": 577, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000507354736328 - }, - { - "episode": 9264, - "epoch": 0.16651687816802674, - "loss/policy_avg": 0.17738819122314453, - "lr": 9.630623721881392e-06, - "objective/entropy": -60.45104217529297, - "objective/kl": 34.91204833984375, - "objective/non_score_reward": -3.4912052154541016, - "objective/rlhf_reward": -12.40856227180059, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 5.907928943634033, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5182020664215088, - "step": 578, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003273487091064 - }, - { - "episode": 9280, - "epoch": 0.1668044720854154, - "loss/policy_avg": 1.5535671710968018, - "lr": 9.629984662576689e-06, - "objective/entropy": -110.2305679321289, - "objective/kl": 39.423648834228516, - "objective/non_score_reward": -3.9423651695251465, - "objective/rlhf_reward": -14.41021057340948, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 16.688522338867188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.553377628326416, - "step": 579, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000542402267456 - }, - { - "episode": 9296, - "epoch": 0.16709206600280405, - "loss/policy_avg": -0.21501094102859497, - "lr": 9.629345603271984e-06, - "objective/entropy": -208.572509765625, - "objective/kl": 29.07717514038086, - "objective/non_score_reward": -2.90771746635437, - "objective/rlhf_reward": -10.207038243015376, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 16.939083099365234, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6292991638183594, - "step": 580, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005156993865967 - }, - { - "episode": 9312, - "epoch": 0.16737965992019269, - "loss/policy_avg": -0.1628519594669342, - "lr": 9.62870654396728e-06, - "objective/entropy": -64.79794311523438, - "objective/kl": 37.24529266357422, - "objective/non_score_reward": -3.724529266357422, - "objective/rlhf_reward": -14.898117303848267, - "objective/scores": 0.0, - "policy/approxkl_avg": 81.15853881835938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.37679994106292725, - "step": 581, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0008161067962646 - }, - { - "episode": 9328, - "epoch": 0.16766725383758133, - "loss/policy_avg": 0.1621185839176178, - "lr": 9.628067484662578e-06, - "objective/entropy": -256.913330078125, - "objective/kl": 36.676727294921875, - "objective/non_score_reward": -3.667672872543335, - "objective/rlhf_reward": -13.008831744611847, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 34.5910530090332, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5756609439849854, - "step": 582, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9992961883544922 - }, - { - "episode": 9344, - "epoch": 0.16795484775497, - "loss/policy_avg": 0.29023104906082153, - "lr": 9.627428425357874e-06, - "objective/entropy": -18.905437469482422, - "objective/kl": 45.148834228515625, - "objective/non_score_reward": -4.514883041381836, - "objective/rlhf_reward": -16.57858121674812, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 30.742298126220703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8326776027679443, - "step": 583, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982852935791016 - }, - { - "episode": 9360, - "epoch": 0.16824244167235863, - "loss/policy_avg": 1.3382362127304077, - "lr": 9.626789366053171e-06, - "objective/entropy": -269.2889404296875, - "objective/kl": 36.53486633300781, - "objective/non_score_reward": -3.653486490249634, - "objective/rlhf_reward": -10.213946199417116, - "objective/scores": 1.1, - "policy/approxkl_avg": 35.406089782714844, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8077678680419922, - "step": 584, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 13, - "val/ratio": 1.99800443649292 - }, - { - "episode": 9376, - "epoch": 0.16853003558974727, - "loss/policy_avg": 0.42918533086776733, - "lr": 9.626150306748468e-06, - "objective/entropy": 149.2460174560547, - "objective/kl": 39.844512939453125, - "objective/non_score_reward": -3.9844510555267334, - "objective/rlhf_reward": -14.578554355834406, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 59.42961883544922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3823997676372528, - "step": 585, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9983184337615967 - }, - { - "episode": 9392, - "epoch": 0.1688176295071359, - "loss/policy_avg": 0.09273044764995575, - "lr": 9.625511247443763e-06, - "objective/entropy": -167.0006103515625, - "objective/kl": 32.98309326171875, - "objective/non_score_reward": -3.298309087753296, - "objective/rlhf_reward": -11.833986723159237, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 101.41903686523438, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.547985315322876, - "step": 586, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9995431900024414 - }, - { - "episode": 9408, - "epoch": 0.16910522342452458, - "loss/policy_avg": 0.1386108696460724, - "lr": 9.62487218813906e-06, - "objective/entropy": -189.29864501953125, - "objective/kl": 43.7606201171875, - "objective/non_score_reward": -4.376062393188477, - "objective/rlhf_reward": -16.02329647820747, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 3.214796543121338, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.551296591758728, - "step": 587, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9979034662246704 - }, - { - "episode": 9424, - "epoch": 0.16939281734191322, - "loss/policy_avg": 0.15018180012702942, - "lr": 9.624233128834357e-06, - "objective/entropy": -151.0711669921875, - "objective/kl": 36.06580352783203, - "objective/non_score_reward": -3.6065807342529297, - "objective/rlhf_reward": -13.084686806708007, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 24.849681854248047, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5903939008712769, - "step": 588, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9969112873077393 - }, - { - "episode": 9440, - "epoch": 0.16968041125930186, - "loss/policy_avg": 0.6633468866348267, - "lr": 9.623594069529654e-06, - "objective/entropy": -88.4188232421875, - "objective/kl": 41.29294204711914, - "objective/non_score_reward": -4.129294395446777, - "objective/rlhf_reward": -15.175541928320555, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 53.11321258544922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7691140174865723, - "step": 589, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9956674575805664 - }, - { - "episode": 9456, - "epoch": 0.1699680051766905, - "loss/policy_avg": -0.025976277887821198, - "lr": 9.62295501022495e-06, - "objective/entropy": -23.04672622680664, - "objective/kl": 42.318519592285156, - "objective/non_score_reward": -4.231852054595947, - "objective/rlhf_reward": -15.371149151530815, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 17.313936233520508, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4834129214286804, - "step": 590, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979733228683472 - }, - { - "episode": 9472, - "epoch": 0.17025559909407917, - "loss/policy_avg": 0.35992100834846497, - "lr": 9.622315950920246e-06, - "objective/entropy": -135.56903076171875, - "objective/kl": 37.865943908691406, - "objective/non_score_reward": -3.7865943908691406, - "objective/rlhf_reward": -13.787127458785456, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 34.8250732421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6128636598587036, - "step": 591, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9998369216918945 - }, - { - "episode": 9488, - "epoch": 0.1705431930114678, - "loss/policy_avg": -0.050314195454120636, - "lr": 9.621676891615543e-06, - "objective/entropy": -198.44839477539062, - "objective/kl": 38.32909393310547, - "objective/non_score_reward": -3.832909345626831, - "objective/rlhf_reward": -13.972387277816217, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.4614109992980957, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7091498374938965, - "step": 592, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986573457717896 - }, - { - "episode": 9504, - "epoch": 0.17083078692885645, - "loss/policy_avg": -0.00115779263433069, - "lr": 9.621037832310838e-06, - "objective/entropy": -143.01028442382812, - "objective/kl": 32.83100509643555, - "objective/non_score_reward": -3.2831003665924072, - "objective/rlhf_reward": -11.184990237431464, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 8.594361305236816, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6112456321716309, - "step": 593, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005526542663574 - }, - { - "episode": 9520, - "epoch": 0.1711183808462451, - "loss/policy_avg": 0.5914216041564941, - "lr": 9.620398773006135e-06, - "objective/entropy": -15.847023010253906, - "objective/kl": 41.543052673339844, - "objective/non_score_reward": -4.154305458068848, - "objective/rlhf_reward": -15.193389613826838, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 26.619815826416016, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5229488611221313, - "step": 594, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999138355255127 - }, - { - "episode": 9536, - "epoch": 0.17140597476363376, - "loss/policy_avg": 0.48162180185317993, - "lr": 9.619759713701432e-06, - "objective/entropy": 49.441688537597656, - "objective/kl": 41.245628356933594, - "objective/non_score_reward": -4.124563217163086, - "objective/rlhf_reward": -15.11965022334228, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 119.17011260986328, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4364345073699951, - "step": 595, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999006986618042 - }, - { - "episode": 9552, - "epoch": 0.1716935686810224, - "loss/policy_avg": 0.13220281898975372, - "lr": 9.619120654396729e-06, - "objective/entropy": -217.9917755126953, - "objective/kl": 41.454307556152344, - "objective/non_score_reward": -4.145431041717529, - "objective/rlhf_reward": -15.100772025997998, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 26.718765258789062, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6766424775123596, - "step": 596, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9994851350784302 - }, - { - "episode": 9568, - "epoch": 0.17198116259841104, - "loss/policy_avg": 0.39218467473983765, - "lr": 9.618481595092026e-06, - "objective/entropy": -321.5009460449219, - "objective/kl": 36.74188232421875, - "objective/non_score_reward": -3.6741881370544434, - "objective/rlhf_reward": -13.034893279493438, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 27.68082046508789, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5078242421150208, - "step": 597, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999809980392456 - }, - { - "episode": 9584, - "epoch": 0.1722687565157997, - "loss/policy_avg": 0.1808081567287445, - "lr": 9.617842535787323e-06, - "objective/entropy": -172.8281707763672, - "objective/kl": 40.136253356933594, - "objective/non_score_reward": -4.013625144958496, - "objective/rlhf_reward": -14.603903393359527, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 13.015371322631836, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5869624614715576, - "step": 598, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999362587928772 - }, - { - "episode": 9600, - "epoch": 0.17255635043318834, - "loss/policy_avg": -0.12051941454410553, - "lr": 9.617203476482618e-06, - "objective/entropy": -113.57295989990234, - "objective/kl": 37.42548370361328, - "objective/non_score_reward": -3.7425484657287598, - "objective/rlhf_reward": -13.022782753186164, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 117.42220306396484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7584841847419739, - "step": 599, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973876476287842 - }, - { - "episode": 9616, - "epoch": 0.17284394435057698, - "loss/policy_avg": 1.5483704805374146, - "lr": 9.616564417177915e-06, - "objective/entropy": -355.0560302734375, - "objective/kl": 39.736637115478516, - "objective/non_score_reward": -3.973663568496704, - "objective/rlhf_reward": -14.378882491382296, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 12.028496742248535, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7178500294685364, - "step": 600, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.000492572784424 - }, - { - "episode": 9632, - "epoch": 0.17313153826796562, - "loss/policy_avg": 0.6728031039237976, - "lr": 9.615925357873211e-06, - "objective/entropy": -193.89974975585938, - "objective/kl": 32.3430290222168, - "objective/non_score_reward": -3.2343029975891113, - "objective/rlhf_reward": -11.275351767957794, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 27.227508544921875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5896027088165283, - "step": 601, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998093605041504 - }, - { - "episode": 9648, - "epoch": 0.1734191321853543, - "loss/policy_avg": 2.2606358528137207, - "lr": 9.615286298568508e-06, - "objective/entropy": -240.98487854003906, - "objective/kl": 43.463775634765625, - "objective/non_score_reward": -4.346377372741699, - "objective/rlhf_reward": -15.723651414335357, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 39.062442779541016, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5951015949249268, - "step": 602, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998093843460083 - }, - { - "episode": 9664, - "epoch": 0.17370672610274293, - "loss/policy_avg": 1.1621661186218262, - "lr": 9.614647239263805e-06, - "objective/entropy": -128.38201904296875, - "objective/kl": 49.477264404296875, - "objective/non_score_reward": -4.947726249694824, - "objective/rlhf_reward": -18.44927029898706, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 32.63263702392578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6886211633682251, - "step": 603, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9999189376831055 - }, - { - "episode": 9680, - "epoch": 0.17399432002013157, - "loss/policy_avg": 0.4888812303543091, - "lr": 9.6140081799591e-06, - "objective/entropy": -90.85795593261719, - "objective/kl": 46.02019500732422, - "objective/non_score_reward": -4.602019786834717, - "objective/rlhf_reward": -16.85182079574163, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 18.202045440673828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6015419960021973, - "step": 604, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99855637550354 - }, - { - "episode": 9696, - "epoch": 0.1742819139375202, - "loss/policy_avg": 0.5101502537727356, - "lr": 9.613369120654397e-06, - "objective/entropy": -405.04302978515625, - "objective/kl": 39.584861755371094, - "objective/non_score_reward": -3.958486557006836, - "objective/rlhf_reward": -12.910226498485777, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.033137083053589, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6735005378723145, - "step": 605, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.001615524291992 - }, - { - "episode": 9712, - "epoch": 0.17456950785490888, - "loss/policy_avg": 1.0287859439849854, - "lr": 9.612730061349694e-06, - "objective/entropy": -313.8016662597656, - "objective/kl": 44.508392333984375, - "objective/non_score_reward": -4.450839042663574, - "objective/rlhf_reward": -14.879638825298521, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 46.738216400146484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47219163179397583, - "step": 606, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9983954429626465 - }, - { - "episode": 9728, - "epoch": 0.17485710177229752, - "loss/policy_avg": 0.0703946053981781, - "lr": 9.612091002044991e-06, - "objective/entropy": -399.7001037597656, - "objective/kl": 28.662792205810547, - "objective/non_score_reward": -2.866279363632202, - "objective/rlhf_reward": -10.105867826674862, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 54.66145706176758, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5760586261749268, - "step": 607, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9973869323730469 - }, - { - "episode": 9744, - "epoch": 0.17514469568968616, - "loss/policy_avg": -0.015791811048984528, - "lr": 9.611451942740288e-06, - "objective/entropy": -378.9763488769531, - "objective/kl": 36.73283386230469, - "objective/non_score_reward": -3.673283576965332, - "objective/rlhf_reward": -13.089013609949667, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 12.10586929321289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6941601037979126, - "step": 608, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 16, - "val/ratio": 1.9997576475143433 - }, - { - "episode": 9760, - "epoch": 0.1754322896070748, - "loss/policy_avg": 0.15788066387176514, - "lr": 9.610812883435585e-06, - "objective/entropy": -255.90493774414062, - "objective/kl": 48.99408721923828, - "objective/non_score_reward": -4.899409294128418, - "objective/rlhf_reward": -17.86430288950602, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 5.658967971801758, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6300114393234253, - "step": 609, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0000362396240234 - }, - { - "episode": 9776, - "epoch": 0.17571988352446347, - "loss/policy_avg": 1.4745373725891113, - "lr": 9.61017382413088e-06, - "objective/entropy": -92.97071838378906, - "objective/kl": 54.66108322143555, - "objective/non_score_reward": -5.466108322143555, - "objective/rlhf_reward": -19.917022059636054, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 8.048752784729004, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5211347341537476, - "step": 610, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9973746538162231 - }, - { - "episode": 9792, - "epoch": 0.1760074774418521, - "loss/policy_avg": -0.23048746585845947, - "lr": 9.609534764826177e-06, - "objective/entropy": -110.38803100585938, - "objective/kl": 28.35976219177246, - "objective/non_score_reward": -2.8359761238098145, - "objective/rlhf_reward": -11.343904733657837, - "objective/scores": 0.0, - "policy/approxkl_avg": 10.715461730957031, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.7749781608581543, - "step": 611, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001636505126953 - }, - { - "episode": 9808, - "epoch": 0.17629507135924075, - "loss/policy_avg": 2.0556282997131348, - "lr": 9.608895705521472e-06, - "objective/entropy": -86.34033203125, - "objective/kl": 38.10757064819336, - "objective/non_score_reward": -3.8107571601867676, - "objective/rlhf_reward": -13.638909462753851, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 79.96404266357422, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.47780632972717285, - "step": 612, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981083869934082 - }, - { - "episode": 9824, - "epoch": 0.17658266527662939, - "loss/policy_avg": 2.2824478149414062, - "lr": 9.608256646216769e-06, - "objective/entropy": -346.4453125, - "objective/kl": 51.050071716308594, - "objective/non_score_reward": -5.105007171630859, - "objective/rlhf_reward": -16.020029401779176, - "objective/scores": 1.1, - "policy/approxkl_avg": 11.854001998901367, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6633209586143494, - "step": 613, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.998619556427002 - }, - { - "episode": 9840, - "epoch": 0.17687025919401805, - "loss/policy_avg": 0.2608579397201538, - "lr": 9.607617586912066e-06, - "objective/entropy": -75.5977783203125, - "objective/kl": 30.53676986694336, - "objective/non_score_reward": -3.0536770820617676, - "objective/rlhf_reward": -10.552848582685577, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 82.93180847167969, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7450053691864014, - "step": 614, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989418983459473 - }, - { - "episode": 9856, - "epoch": 0.1771578531114067, - "loss/policy_avg": 0.5107153654098511, - "lr": 9.606978527607363e-06, - "objective/entropy": -230.27703857421875, - "objective/kl": 40.28911590576172, - "objective/non_score_reward": -4.028911590576172, - "objective/rlhf_reward": -14.665047745318756, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 18.19654083251953, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7694085836410522, - "step": 615, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9983909130096436 - }, - { - "episode": 9872, - "epoch": 0.17744544702879533, - "loss/policy_avg": 0.14565017819404602, - "lr": 9.60633946830266e-06, - "objective/entropy": -214.1361083984375, - "objective/kl": 16.23416519165039, - "objective/non_score_reward": -1.6234164237976074, - "objective/rlhf_reward": -3.5699468001138897, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.1363525390625, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.7368258237838745, - "step": 616, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9986224174499512 - }, - { - "episode": 9888, - "epoch": 0.177733040946184, - "loss/policy_avg": 0.916619598865509, - "lr": 9.605700408997955e-06, - "objective/entropy": 65.34817504882812, - "objective/kl": 39.758148193359375, - "objective/non_score_reward": -3.9758148193359375, - "objective/rlhf_reward": -14.561623623877196, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 49.81817626953125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6418388485908508, - "step": 617, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9961199760437012 - }, - { - "episode": 9904, - "epoch": 0.17802063486357264, - "loss/policy_avg": 0.43867984414100647, - "lr": 9.605061349693252e-06, - "objective/entropy": -291.1097106933594, - "objective/kl": 30.660541534423828, - "objective/non_score_reward": -3.066054105758667, - "objective/rlhf_reward": -7.8642161846160885, - "objective/scores": 1.1, - "policy/approxkl_avg": 38.47760009765625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8005675077438354, - "step": 618, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0004940032958984 - }, - { - "episode": 9920, - "epoch": 0.17830822878096128, - "loss/policy_avg": 0.48867934942245483, - "lr": 9.604422290388548e-06, - "objective/entropy": -219.26034545898438, - "objective/kl": 40.27912902832031, - "objective/non_score_reward": -4.0279130935668945, - "objective/rlhf_reward": -14.378319756189981, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 17.053512573242188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5421030521392822, - "step": 619, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990657567977905 - }, - { - "episode": 9936, - "epoch": 0.17859582269834992, - "loss/policy_avg": -0.16388291120529175, - "lr": 9.603783231083845e-06, - "objective/entropy": -5.59771728515625, - "objective/kl": 49.777130126953125, - "objective/non_score_reward": -4.977713584899902, - "objective/rlhf_reward": -18.551603042815607, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.792423248291016, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7284325957298279, - "step": 620, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0001120567321777 - }, - { - "episode": 9952, - "epoch": 0.1788834166157386, - "loss/policy_avg": 0.22561949491500854, - "lr": 9.603144171779142e-06, - "objective/entropy": -119.03425598144531, - "objective/kl": 39.67613220214844, - "objective/non_score_reward": -3.967613697052002, - "objective/rlhf_reward": -14.528818419485717, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 15.579143524169922, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.46927276253700256, - "step": 621, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986939430236816 - }, - { - "episode": 9968, - "epoch": 0.17917101053312723, - "loss/policy_avg": -0.07936866581439972, - "lr": 9.602505112474439e-06, - "objective/entropy": -312.1820983886719, - "objective/kl": 38.546356201171875, - "objective/non_score_reward": -3.85463547706604, - "objective/rlhf_reward": -13.814421925608237, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.652659893035889, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6830805540084839, - "step": 622, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9996051788330078 - }, - { - "episode": 9984, - "epoch": 0.17945860445051587, - "loss/policy_avg": 0.13592973351478577, - "lr": 9.601866053169734e-06, - "objective/entropy": -120.9598388671875, - "objective/kl": 29.522968292236328, - "objective/non_score_reward": -2.952296733856201, - "objective/rlhf_reward": -10.409186935424804, - "objective/scores": 0.35, - "policy/approxkl_avg": 67.76509094238281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5022489428520203, - "step": 623, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998583197593689 - }, - { - "episode": 10000, - "epoch": 0.1797461983679045, - "loss/policy_avg": 0.12349797785282135, - "lr": 9.601226993865031e-06, - "objective/entropy": -97.76485443115234, - "objective/kl": 31.539573669433594, - "objective/non_score_reward": -3.1539573669433594, - "objective/rlhf_reward": -11.215829467773437, - "objective/scores": 0.35, - "policy/approxkl_avg": 20.97466278076172, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7436941862106323, - "step": 624, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988412857055664 - }, - { - "episode": 10016, - "epoch": 0.18003379228529318, - "loss/policy_avg": 0.24854499101638794, - "lr": 9.600587934560328e-06, - "objective/entropy": -142.84173583984375, - "objective/kl": 37.62577819824219, - "objective/non_score_reward": -3.762577533721924, - "objective/rlhf_reward": -13.569357517178418, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 1.546608567237854, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.48624539375305176, - "step": 625, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000833749771118 - }, - { - "episode": 10032, - "epoch": 0.18032138620268182, - "loss/policy_avg": 1.1254993677139282, - "lr": 9.599948875255625e-06, - "objective/entropy": -217.01876831054688, - "objective/kl": 50.34384536743164, - "objective/non_score_reward": -5.034384727478027, - "objective/rlhf_reward": -18.758937218276362, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 56.71213150024414, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6278942823410034, - "step": 626, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 13, - "val/ratio": 1.9996230602264404 - }, - { - "episode": 10048, - "epoch": 0.18060898012007046, - "loss/policy_avg": 0.9575183391571045, - "lr": 9.599309815950922e-06, - "objective/entropy": -334.89495849609375, - "objective/kl": 38.62886047363281, - "objective/non_score_reward": -3.8628854751586914, - "objective/rlhf_reward": -13.935770833285982, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 46.06591033935547, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6787221431732178, - "step": 627, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9963583946228027 - }, - { - "episode": 10064, - "epoch": 0.1808965740374591, - "loss/policy_avg": 0.37166520953178406, - "lr": 9.598670756646217e-06, - "objective/entropy": -253.23953247070312, - "objective/kl": 29.572723388671875, - "objective/non_score_reward": -2.9572722911834717, - "objective/rlhf_reward": -10.313317143710789, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 29.606956481933594, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5231032371520996, - "step": 628, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9965605735778809 - }, - { - "episode": 10080, - "epoch": 0.18118416795484776, - "loss/policy_avg": 1.5333852767944336, - "lr": 9.598031697341514e-06, - "objective/entropy": -217.50408935546875, - "objective/kl": 35.96586227416992, - "objective/non_score_reward": -3.596586227416992, - "objective/rlhf_reward": -9.986344790458679, - "objective/scores": 1.1, - "policy/approxkl_avg": 48.386070251464844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7713555097579956, - "step": 629, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9963762760162354 - }, - { - "episode": 10096, - "epoch": 0.1814717618722364, - "loss/policy_avg": 0.10991726070642471, - "lr": 9.59739263803681e-06, - "objective/entropy": -114.67947387695312, - "objective/kl": 38.67190933227539, - "objective/non_score_reward": -3.8671910762786865, - "objective/rlhf_reward": -13.987811687405467, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 64.13754272460938, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6923480033874512, - "step": 630, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9989780187606812 - }, - { - "episode": 10112, - "epoch": 0.18175935578962504, - "loss/policy_avg": 0.595374584197998, - "lr": 9.596753578732108e-06, - "objective/entropy": -72.69680786132812, - "objective/kl": 44.85560607910156, - "objective/non_score_reward": -4.485560894012451, - "objective/rlhf_reward": -16.49164567431961, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 164.61390686035156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.42736077308654785, - "step": 631, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984698295593262 - }, - { - "episode": 10128, - "epoch": 0.18204694970701368, - "loss/policy_avg": -0.3946479856967926, - "lr": 9.596114519427405e-06, - "objective/entropy": 51.04241943359375, - "objective/kl": 40.956275939941406, - "objective/non_score_reward": -4.095627784729004, - "objective/rlhf_reward": -14.9825101852417, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.049509525299072, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6018968224525452, - "step": 632, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0019989013671875 - }, - { - "episode": 10144, - "epoch": 0.18233454362440235, - "loss/policy_avg": 0.049627840518951416, - "lr": 9.595475460122701e-06, - "objective/entropy": -355.97625732421875, - "objective/kl": 14.994010925292969, - "objective/non_score_reward": -1.4994010925292969, - "objective/rlhf_reward": -4.4413450648456365, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 37.16582489013672, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8317785263061523, - "step": 633, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9989538192749023 - }, - { - "episode": 10160, - "epoch": 0.182622137541791, - "loss/policy_avg": 0.5741980075836182, - "lr": 9.594836400817997e-06, - "objective/entropy": -140.58338928222656, - "objective/kl": 38.065673828125, - "objective/non_score_reward": -3.806567668914795, - "objective/rlhf_reward": -13.62214973932894, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 11.47018814086914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5414795875549316, - "step": 634, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9968845844268799 - }, - { - "episode": 10176, - "epoch": 0.18290973145917963, - "loss/policy_avg": 0.23386423289775848, - "lr": 9.594197341513293e-06, - "objective/entropy": -104.3201675415039, - "objective/kl": 38.72356414794922, - "objective/non_score_reward": -3.872356414794922, - "objective/rlhf_reward": -15.489425420761108, - "objective/scores": 0.0, - "policy/approxkl_avg": 44.49272155761719, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5448688864707947, - "step": 635, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986062049865723 - }, - { - "episode": 10192, - "epoch": 0.1831973253765683, - "loss/policy_avg": -0.018415771424770355, - "lr": 9.593558282208589e-06, - "objective/entropy": -323.28582763671875, - "objective/kl": 35.251190185546875, - "objective/non_score_reward": -3.525118827819824, - "objective/rlhf_reward": -12.36714245478312, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 9.481925010681152, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6414960622787476, - "step": 636, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9988579750061035 - }, - { - "episode": 10208, - "epoch": 0.18348491929395694, - "loss/policy_avg": 0.43244630098342896, - "lr": 9.592919222903886e-06, - "objective/entropy": 79.17412567138672, - "objective/kl": 46.785804748535156, - "objective/non_score_reward": -4.678580284118652, - "objective/rlhf_reward": -14.314322090148927, - "objective/scores": 1.1, - "policy/approxkl_avg": 13.969576835632324, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5658503770828247, - "step": 637, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0009024143218994 - }, - { - "episode": 10224, - "epoch": 0.18377251321134558, - "loss/policy_avg": 0.343896746635437, - "lr": 9.592280163599182e-06, - "objective/entropy": -136.99612426757812, - "objective/kl": 37.76481628417969, - "objective/non_score_reward": -3.7764816284179688, - "objective/rlhf_reward": -13.705926513671876, - "objective/scores": 0.35, - "policy/approxkl_avg": 22.39147186279297, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6087017059326172, - "step": 638, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0004689693450928 - }, - { - "episode": 10240, - "epoch": 0.18406010712873422, - "loss/policy_avg": -0.11058389395475388, - "lr": 9.59164110429448e-06, - "objective/entropy": -101.61506652832031, - "objective/kl": 42.183929443359375, - "objective/non_score_reward": -4.218392372131348, - "objective/rlhf_reward": -14.9261596900987, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 14.685033798217773, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6736252903938293, - "step": 639, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9999792575836182 - }, - { - "episode": 10256, - "epoch": 0.18434770104612289, - "loss/policy_avg": 0.0898745208978653, - "lr": 9.591002044989776e-06, - "objective/entropy": -113.99971008300781, - "objective/kl": 40.50453186035156, - "objective/non_score_reward": -4.050453186035156, - "objective/rlhf_reward": -14.876299891501588, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 36.957977294921875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5056084990501404, - "step": 640, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983000755310059 - }, - { - "episode": 10272, - "epoch": 0.18463529496351153, - "loss/policy_avg": 0.5849778652191162, - "lr": 9.590362985685071e-06, - "objective/entropy": -274.3189392089844, - "objective/kl": 40.9473876953125, - "objective/non_score_reward": -4.094738960266113, - "objective/rlhf_reward": -14.2562493703523, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 1.2600057125091553, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7059886455535889, - "step": 641, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9991240501403809 - }, - { - "episode": 10288, - "epoch": 0.18492288888090017, - "loss/policy_avg": 0.23213040828704834, - "lr": 9.589723926380368e-06, - "objective/entropy": -247.97145080566406, - "objective/kl": 26.106571197509766, - "objective/non_score_reward": -2.610656976699829, - "objective/rlhf_reward": -9.018796045978632, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 64.95057678222656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6558966636657715, - "step": 642, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.001206398010254 - }, - { - "episode": 10304, - "epoch": 0.1852104827982888, - "loss/policy_avg": 11.370098114013672, - "lr": 9.589084867075665e-06, - "objective/entropy": -46.817996978759766, - "objective/kl": 38.347511291503906, - "objective/non_score_reward": -3.8347513675689697, - "objective/rlhf_reward": -15.339005589485168, - "objective/scores": 0.0, - "policy/approxkl_avg": 5.018136978149414, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7419272661209106, - "step": 643, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.000868558883667 - }, - { - "episode": 10320, - "epoch": 0.18549807671567747, - "loss/policy_avg": 0.19895608723163605, - "lr": 9.588445807770962e-06, - "objective/entropy": -103.27410125732422, - "objective/kl": 37.708839416503906, - "objective/non_score_reward": -3.7708842754364014, - "objective/rlhf_reward": -13.659704764087763, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 2.654179334640503, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.38327789306640625, - "step": 644, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996483325958252 - }, - { - "episode": 10336, - "epoch": 0.1857856706330661, - "loss/policy_avg": 0.46165961027145386, - "lr": 9.587806748466259e-06, - "objective/entropy": -277.8352355957031, - "objective/kl": 36.63224792480469, - "objective/non_score_reward": -3.663224697113037, - "objective/rlhf_reward": -13.096639483180596, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 18.761638641357422, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5858588218688965, - "step": 645, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9983824491500854 - }, - { - "episode": 10352, - "epoch": 0.18607326455045475, - "loss/policy_avg": 0.7416555881500244, - "lr": 9.587167689161556e-06, - "objective/entropy": -125.7095718383789, - "objective/kl": 40.707427978515625, - "objective/non_score_reward": -4.070743083953857, - "objective/rlhf_reward": -14.904369452086787, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 79.23682403564453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5754390358924866, - "step": 646, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9980629682540894 - }, - { - "episode": 10368, - "epoch": 0.1863608584678434, - "loss/policy_avg": 0.17042623460292816, - "lr": 9.586528629856851e-06, - "objective/entropy": -272.11273193359375, - "objective/kl": 37.0462646484375, - "objective/non_score_reward": -3.7046265602111816, - "objective/rlhf_reward": -13.394673903186884, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 27.754566192626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7012656331062317, - "step": 647, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9996850490570068 - }, - { - "episode": 10384, - "epoch": 0.18664845238523206, - "loss/policy_avg": 0.7964584827423096, - "lr": 9.585889570552148e-06, - "objective/entropy": -128.65829467773438, - "objective/kl": 36.44789123535156, - "objective/non_score_reward": -3.6447887420654297, - "objective/rlhf_reward": -13.25364259245984, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 22.820283889770508, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6445922255516052, - "step": 648, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0019288063049316 - }, - { - "episode": 10400, - "epoch": 0.1869360463026207, - "loss/policy_avg": 0.39208611845970154, - "lr": 9.585250511247445e-06, - "objective/entropy": -168.140625, - "objective/kl": 40.124786376953125, - "objective/non_score_reward": -4.012479305267334, - "objective/rlhf_reward": -14.690666639541071, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 8.317047119140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6378756761550903, - "step": 649, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9976195096969604 - }, - { - "episode": 10416, - "epoch": 0.18722364022000934, - "loss/policy_avg": 0.10013342648744583, - "lr": 9.584611451942742e-06, - "objective/entropy": -109.35111236572266, - "objective/kl": 34.334266662597656, - "objective/non_score_reward": -3.4334263801574707, - "objective/rlhf_reward": -12.392069867163329, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 8.64936351776123, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5027997493743896, - "step": 650, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9990345239639282 - }, - { - "episode": 10432, - "epoch": 0.18751123413739798, - "loss/policy_avg": -0.09684228897094727, - "lr": 9.583972392638038e-06, - "objective/entropy": -120.9965591430664, - "objective/kl": 42.267295837402344, - "objective/non_score_reward": -4.2267303466796875, - "objective/rlhf_reward": -15.082091207775186, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 3.862962245941162, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5842040777206421, - "step": 651, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994006156921387 - }, - { - "episode": 10448, - "epoch": 0.18779882805478665, - "loss/policy_avg": 0.7214713096618652, - "lr": 9.583333333333335e-06, - "objective/entropy": 134.17503356933594, - "objective/kl": 42.244041442871094, - "objective/non_score_reward": -4.224404335021973, - "objective/rlhf_reward": -15.072786922725747, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 37.499027252197266, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.571992039680481, - "step": 652, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9959886074066162 - }, - { - "episode": 10464, - "epoch": 0.1880864219721753, - "loss/policy_avg": 1.098000407218933, - "lr": 9.58269427402863e-06, - "objective/entropy": 41.24781799316406, - "objective/kl": 44.69629669189453, - "objective/non_score_reward": -4.469630241394043, - "objective/rlhf_reward": -14.954801474453184, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 9.596094131469727, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9781997799873352, - "step": 653, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000553607940674 - }, - { - "episode": 10480, - "epoch": 0.18837401588956393, - "loss/policy_avg": 0.009640902280807495, - "lr": 9.582055214723927e-06, - "objective/entropy": -158.4319305419922, - "objective/kl": 39.49668502807617, - "objective/non_score_reward": -3.9496688842773438, - "objective/rlhf_reward": -14.194554600779135, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 5.894138336181641, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7409219741821289, - "step": 654, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000908851623535 - }, - { - "episode": 10496, - "epoch": 0.1886616098069526, - "loss/policy_avg": 1.1138458251953125, - "lr": 9.581416155419224e-06, - "objective/entropy": -360.32135009765625, - "objective/kl": 37.8345947265625, - "objective/non_score_reward": -3.7834596633911133, - "objective/rlhf_reward": -13.683239798159942, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 22.492328643798828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6220200657844543, - "step": 655, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9974379539489746 - }, - { - "episode": 10512, - "epoch": 0.18894920372434124, - "loss/policy_avg": 0.20075130462646484, - "lr": 9.58077709611452e-06, - "objective/entropy": -133.64358520507812, - "objective/kl": 45.42504119873047, - "objective/non_score_reward": -4.54250431060791, - "objective/rlhf_reward": -15.770017242431642, - "objective/scores": 0.6, - "policy/approxkl_avg": 11.763971328735352, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.645634651184082, - "step": 656, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999332308769226 - }, - { - "episode": 10528, - "epoch": 0.18923679764172988, - "loss/policy_avg": -0.052210867404937744, - "lr": 9.580138036809816e-06, - "objective/entropy": -62.48654556274414, - "objective/kl": 38.83781433105469, - "objective/non_score_reward": -3.883781671524048, - "objective/rlhf_reward": -15.535126209259033, - "objective/scores": 0.0, - "policy/approxkl_avg": 57.294769287109375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5782778263092041, - "step": 657, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9967520236968994 - }, - { - "episode": 10544, - "epoch": 0.18952439155911852, - "loss/policy_avg": 0.23513492941856384, - "lr": 9.579498977505113e-06, - "objective/entropy": -327.19793701171875, - "objective/kl": 38.20623779296875, - "objective/non_score_reward": -3.8206238746643066, - "objective/rlhf_reward": -13.858663637836543, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 53.93671417236328, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6590805053710938, - "step": 658, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.999345302581787 - }, - { - "episode": 10560, - "epoch": 0.18981198547650718, - "loss/policy_avg": -0.06993924081325531, - "lr": 9.57885991820041e-06, - "objective/entropy": -264.03887939453125, - "objective/kl": 37.34988784790039, - "objective/non_score_reward": -3.7349889278411865, - "objective/rlhf_reward": -14.939955472946167, - "objective/scores": 0.0, - "policy/approxkl_avg": 0.5475348234176636, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5164666175842285, - "step": 659, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000227928161621 - }, - { - "episode": 10576, - "epoch": 0.19009957939389582, - "loss/policy_avg": 0.4225189685821533, - "lr": 9.578220858895705e-06, - "objective/entropy": 19.757404327392578, - "objective/kl": 28.303627014160156, - "objective/non_score_reward": -2.830362558364868, - "objective/rlhf_reward": -8.921450233459472, - "objective/scores": 0.6, - "policy/approxkl_avg": 5.871613502502441, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5146780014038086, - "step": 660, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9982116222381592 - }, - { - "episode": 10592, - "epoch": 0.19038717331128446, - "loss/policy_avg": 0.08937665820121765, - "lr": 9.577581799591002e-06, - "objective/entropy": 44.000144958496094, - "objective/kl": 42.06709671020508, - "objective/non_score_reward": -4.206709861755371, - "objective/rlhf_reward": -15.164979224622833, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 4.199661731719971, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5933263301849365, - "step": 661, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983890056610107 - }, - { - "episode": 10608, - "epoch": 0.1906747672286731, - "loss/policy_avg": 0.3704705834388733, - "lr": 9.576942740286299e-06, - "objective/entropy": -61.501338958740234, - "objective/kl": 31.85788345336914, - "objective/non_score_reward": -3.185788154602051, - "objective/rlhf_reward": -10.343152618408203, - "objective/scores": 0.6, - "policy/approxkl_avg": 26.145584106445312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7544271945953369, - "step": 662, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998536229133606 - }, - { - "episode": 10624, - "epoch": 0.19096236114606177, - "loss/policy_avg": 1.3078722953796387, - "lr": 9.576303680981596e-06, - "objective/entropy": 34.678199768066406, - "objective/kl": 49.809627532958984, - "objective/non_score_reward": -4.980962753295898, - "objective/rlhf_reward": -18.523851490020753, - "objective/scores": 0.35, - "policy/approxkl_avg": 52.354454040527344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6402466297149658, - "step": 663, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968035221099854 - }, - { - "episode": 10640, - "epoch": 0.1912499550634504, - "loss/policy_avg": 0.7618193626403809, - "lr": 9.575664621676893e-06, - "objective/entropy": 29.856212615966797, - "objective/kl": 23.130603790283203, - "objective/non_score_reward": -2.3130602836608887, - "objective/rlhf_reward": -7.892991387580318, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.8696446418762207, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.759651243686676, - "step": 664, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994723796844482 - }, - { - "episode": 10656, - "epoch": 0.19153754898083905, - "loss/policy_avg": 0.08811396360397339, - "lr": 9.57502556237219e-06, - "objective/entropy": -262.10101318359375, - "objective/kl": 41.6727294921875, - "objective/non_score_reward": -4.16727352142334, - "objective/rlhf_reward": -14.269093132019044, - "objective/scores": 0.6, - "policy/approxkl_avg": 8.957071304321289, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6705787181854248, - "step": 665, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.0116848945617676 - }, - { - "episode": 10672, - "epoch": 0.1918251428982277, - "loss/policy_avg": 1.2480721473693848, - "lr": 9.574386503067485e-06, - "objective/entropy": -60.014404296875, - "objective/kl": 38.03213119506836, - "objective/non_score_reward": -3.803213119506836, - "objective/rlhf_reward": -13.47951890627543, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 58.63201904296875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6512277722358704, - "step": 666, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998128414154053 - }, - { - "episode": 10688, - "epoch": 0.19211273681561636, - "loss/policy_avg": 0.5962315201759338, - "lr": 9.573747443762782e-06, - "objective/entropy": 33.21426010131836, - "objective/kl": 37.888919830322266, - "objective/non_score_reward": -3.7888917922973633, - "objective/rlhf_reward": -13.813931754141478, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 41.1786994934082, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8934241533279419, - "step": 667, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00087571144104 - }, - { - "episode": 10704, - "epoch": 0.192400330733005, - "loss/policy_avg": 1.1563293933868408, - "lr": 9.573108384458079e-06, - "objective/entropy": -12.93045425415039, - "objective/kl": 53.34501647949219, - "objective/non_score_reward": -5.334501266479492, - "objective/rlhf_reward": -19.73388603693636, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 4.820253372192383, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7993291616439819, - "step": 668, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9991778135299683 - }, - { - "episode": 10720, - "epoch": 0.19268792465039364, - "loss/policy_avg": 0.4342407286167145, - "lr": 9.572469325153375e-06, - "objective/entropy": 140.0662078857422, - "objective/kl": 49.41921615600586, - "objective/non_score_reward": -4.941922187805176, - "objective/rlhf_reward": -18.034354225794473, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 17.247631072998047, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6062232255935669, - "step": 669, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0003952980041504 - }, - { - "episode": 10736, - "epoch": 0.19297551856778228, - "loss/policy_avg": 0.5148516893386841, - "lr": 9.571830265848672e-06, - "objective/entropy": 155.82278442382812, - "objective/kl": 30.684419631958008, - "objective/non_score_reward": -3.068441867828369, - "objective/rlhf_reward": -10.326356480793889, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 4.213561534881592, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6449806690216064, - "step": 670, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0085437297821045 - }, - { - "episode": 10752, - "epoch": 0.19326311248517095, - "loss/policy_avg": 0.2710059881210327, - "lr": 9.571191206543968e-06, - "objective/entropy": -380.40130615234375, - "objective/kl": 27.177127838134766, - "objective/non_score_reward": -2.717712879180908, - "objective/rlhf_reward": -9.355079734119114, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 25.084197998046875, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6170598268508911, - "step": 671, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9980518817901611 - }, - { - "episode": 10768, - "epoch": 0.19355070640255959, - "loss/policy_avg": 0.028690431267023087, - "lr": 9.570552147239264e-06, - "objective/entropy": 32.0269775390625, - "objective/kl": 41.27011489868164, - "objective/non_score_reward": -4.127011299133301, - "objective/rlhf_reward": -12.108045673370361, - "objective/scores": 1.1, - "policy/approxkl_avg": 26.040042877197266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5785200595855713, - "step": 672, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978328943252563 - }, - { - "episode": 10784, - "epoch": 0.19383830031994823, - "loss/policy_avg": 0.27589982748031616, - "lr": 9.569913087934561e-06, - "objective/entropy": -252.5802001953125, - "objective/kl": 35.50873565673828, - "objective/non_score_reward": -3.5508739948272705, - "objective/rlhf_reward": -12.877982649832887, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 122.07984924316406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8820939064025879, - "step": 673, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9997135400772095 - }, - { - "episode": 10800, - "epoch": 0.1941258942373369, - "loss/policy_avg": 1.3692753314971924, - "lr": 9.569274028629858e-06, - "objective/entropy": -356.3477783203125, - "objective/kl": 35.032447814941406, - "objective/non_score_reward": -3.5032448768615723, - "objective/rlhf_reward": -12.065568755345282, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 7.098365783691406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6300245523452759, - "step": 674, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.0019798278808594 - }, - { - "episode": 10816, - "epoch": 0.19441348815472553, - "loss/policy_avg": 1.615112543106079, - "lr": 9.568634969325155e-06, - "objective/entropy": 15.047992706298828, - "objective/kl": 31.80112075805664, - "objective/non_score_reward": -3.180111885070801, - "objective/rlhf_reward": -11.239495160992504, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 16.252872467041016, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4491235613822937, - "step": 675, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999570608139038 - }, - { - "episode": 10832, - "epoch": 0.19470108207211417, - "loss/policy_avg": -0.033027857542037964, - "lr": 9.567995910020452e-06, - "objective/entropy": -1.499664306640625, - "objective/kl": 41.21196746826172, - "objective/non_score_reward": -4.12119722366333, - "objective/rlhf_reward": -15.03419003924881, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 2.4209108352661133, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7600862383842468, - "step": 676, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9999175071716309 - }, - { - "episode": 10848, - "epoch": 0.1949886759895028, - "loss/policy_avg": 0.8857518434524536, - "lr": 9.567356850715747e-06, - "objective/entropy": -144.43075561523438, - "objective/kl": 27.657447814941406, - "objective/non_score_reward": -2.765744924545288, - "objective/rlhf_reward": -9.684377172080378, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 104.82466125488281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7131980061531067, - "step": 677, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0003392696380615 - }, - { - "episode": 10864, - "epoch": 0.19527626990689148, - "loss/policy_avg": 1.639385461807251, - "lr": 9.566717791411044e-06, - "objective/entropy": 48.725257873535156, - "objective/kl": 38.95005798339844, - "objective/non_score_reward": -3.895005702972412, - "objective/rlhf_reward": -14.023764221873833, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 42.96604537963867, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8440141081809998, - "step": 678, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9974513053894043 - }, - { - "episode": 10880, - "epoch": 0.19556386382428012, - "loss/policy_avg": 0.4775700569152832, - "lr": 9.56607873210634e-06, - "objective/entropy": -59.389835357666016, - "objective/kl": 39.82011032104492, - "objective/non_score_reward": -3.982011318206787, - "objective/rlhf_reward": -13.528044319152833, - "objective/scores": 0.6, - "policy/approxkl_avg": 7.1817216873168945, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5918980836868286, - "step": 679, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9976534843444824 - }, - { - "episode": 10896, - "epoch": 0.19585145774166876, - "loss/policy_avg": 0.2459838092327118, - "lr": 9.565439672801636e-06, - "objective/entropy": -161.0146484375, - "objective/kl": 49.84676742553711, - "objective/non_score_reward": -4.984676361083984, - "objective/rlhf_reward": -18.27684689086734, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 58.91631317138672, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.611931324005127, - "step": 680, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9979312419891357 - }, - { - "episode": 10912, - "epoch": 0.1961390516590574, - "loss/policy_avg": 0.01540219783782959, - "lr": 9.564800613496933e-06, - "objective/entropy": -148.67662048339844, - "objective/kl": 43.636253356933594, - "objective/non_score_reward": -4.363625526428223, - "objective/rlhf_reward": -16.075899221984248, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 23.949655532836914, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.730331301689148, - "step": 681, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982978105545044 - }, - { - "episode": 10928, - "epoch": 0.19642664557644607, - "loss/policy_avg": 0.015965640544891357, - "lr": 9.56416155419223e-06, - "objective/entropy": -276.25225830078125, - "objective/kl": 38.398780822753906, - "objective/non_score_reward": -3.8398780822753906, - "objective/rlhf_reward": -13.959512329101564, - "objective/scores": 0.35, - "policy/approxkl_avg": 1.795326590538025, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.776302695274353, - "step": 682, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.002492904663086 - }, - { - "episode": 10944, - "epoch": 0.1967142394938347, - "loss/policy_avg": 2.442565679550171, - "lr": 9.563522494887527e-06, - "objective/entropy": 58.142906188964844, - "objective/kl": 34.78215789794922, - "objective/non_score_reward": -3.4782156944274902, - "objective/rlhf_reward": -12.489030678470698, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 43.62590026855469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4453713893890381, - "step": 683, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0012001991271973 - }, - { - "episode": 10960, - "epoch": 0.19700183341122335, - "loss/policy_avg": 0.5201736092567444, - "lr": 9.562883435582822e-06, - "objective/entropy": 145.70559692382812, - "objective/kl": 38.99374771118164, - "objective/non_score_reward": -3.8993749618530273, - "objective/rlhf_reward": -14.041240303721978, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 8.502677917480469, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6028515100479126, - "step": 684, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0018045902252197 - }, - { - "episode": 10976, - "epoch": 0.197289427328612, - "loss/policy_avg": 0.7856461405754089, - "lr": 9.562244376278119e-06, - "objective/entropy": 62.90361785888672, - "objective/kl": 44.264869689941406, - "objective/non_score_reward": -4.426486492156982, - "objective/rlhf_reward": -13.305945968627931, - "objective/scores": 1.1, - "policy/approxkl_avg": 5.448941230773926, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4748787581920624, - "step": 685, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0009329319000244 - }, - { - "episode": 10992, - "epoch": 0.19757702124600066, - "loss/policy_avg": 0.051717519760131836, - "lr": 9.561605316973416e-06, - "objective/entropy": 18.32666015625, - "objective/kl": 49.36090087890625, - "objective/non_score_reward": -4.936090469360352, - "objective/rlhf_reward": -18.385112726424616, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.29433822631836, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41640418767929077, - "step": 686, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9988088607788086 - }, - { - "episode": 11008, - "epoch": 0.1978646151633893, - "loss/policy_avg": 0.41617196798324585, - "lr": 9.560966257668713e-06, - "objective/entropy": -196.81280517578125, - "objective/kl": 33.58589553833008, - "objective/non_score_reward": -3.3585898876190186, - "objective/rlhf_reward": -11.77250004333316, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 13.112488746643066, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5552275776863098, - "step": 687, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998762607574463 - }, - { - "episode": 11024, - "epoch": 0.19815220908077794, - "loss/policy_avg": 1.3362700939178467, - "lr": 9.56032719836401e-06, - "objective/entropy": -52.72002029418945, - "objective/kl": 51.95423889160156, - "objective/non_score_reward": -5.195423603057861, - "objective/rlhf_reward": -19.456181798010988, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 25.351802825927734, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5189340710639954, - "step": 688, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9992269277572632 - }, - { - "episode": 11040, - "epoch": 0.19843980299816658, - "loss/policy_avg": 0.5213384628295898, - "lr": 9.559688139059306e-06, - "objective/entropy": -131.5908660888672, - "objective/kl": 40.5286865234375, - "objective/non_score_reward": -4.052868843078613, - "objective/rlhf_reward": -14.264063189701972, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 6.9836015701293945, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7457672357559204, - "step": 689, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9976084232330322 - }, - { - "episode": 11056, - "epoch": 0.19872739691555524, - "loss/policy_avg": 0.3092210590839386, - "lr": 9.559049079754601e-06, - "objective/entropy": -199.13714599609375, - "objective/kl": 43.78075408935547, - "objective/non_score_reward": -4.378075122833252, - "objective/rlhf_reward": -15.687471742900918, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 9.315277099609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5846556425094604, - "step": 690, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987812042236328 - }, - { - "episode": 11072, - "epoch": 0.19901499083294388, - "loss/policy_avg": 0.19393323361873627, - "lr": 9.558410020449898e-06, - "objective/entropy": -96.8709716796875, - "objective/kl": 47.84043502807617, - "objective/non_score_reward": -4.784043312072754, - "objective/rlhf_reward": -17.311345453533242, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 25.493038177490234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6347634792327881, - "step": 691, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000093936920166 - }, - { - "episode": 11088, - "epoch": 0.19930258475033252, - "loss/policy_avg": 0.37496620416641235, - "lr": 9.557770961145195e-06, - "objective/entropy": -55.891319274902344, - "objective/kl": 49.187400817871094, - "objective/non_score_reward": -4.9187397956848145, - "objective/rlhf_reward": -17.727547953801093, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 36.81108856201172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47560715675354004, - "step": 692, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989240169525146 - }, - { - "episode": 11104, - "epoch": 0.1995901786677212, - "loss/policy_avg": 0.10556544363498688, - "lr": 9.557131901840492e-06, - "objective/entropy": 51.882606506347656, - "objective/kl": 43.57653045654297, - "objective/non_score_reward": -4.357653617858887, - "objective/rlhf_reward": -17.43061327934265, - "objective/scores": 0.0, - "policy/approxkl_avg": 109.87631225585938, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5153118968009949, - "step": 693, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997976541519165 - }, - { - "episode": 11120, - "epoch": 0.19987777258510983, - "loss/policy_avg": 0.5423169136047363, - "lr": 9.556492842535789e-06, - "objective/entropy": -157.76409912109375, - "objective/kl": 35.16328430175781, - "objective/non_score_reward": -3.5163283348083496, - "objective/rlhf_reward": -12.54954107979172, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 33.95176696777344, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4616202712059021, - "step": 694, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994275569915771 - }, - { - "episode": 11136, - "epoch": 0.20016536650249847, - "loss/policy_avg": 0.22838959097862244, - "lr": 9.555853783231084e-06, - "objective/entropy": -330.77996826171875, - "objective/kl": 36.184104919433594, - "objective/non_score_reward": -3.618410587310791, - "objective/rlhf_reward": -13.049810250003901, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 4.890125274658203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6670582294464111, - "step": 695, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9984445571899414 - }, - { - "episode": 11152, - "epoch": 0.2004529604198871, - "loss/policy_avg": 0.32133978605270386, - "lr": 9.555214723926381e-06, - "objective/entropy": -302.5234375, - "objective/kl": 42.94685745239258, - "objective/non_score_reward": -4.2946858406066895, - "objective/rlhf_reward": -15.622484176364495, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 5.337358474731445, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7116554975509644, - "step": 696, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999316692352295 - }, - { - "episode": 11168, - "epoch": 0.20074055433727578, - "loss/policy_avg": 0.7934830188751221, - "lr": 9.554575664621678e-06, - "objective/entropy": 174.83148193359375, - "objective/kl": 46.936683654785156, - "objective/non_score_reward": -4.693668365478516, - "objective/rlhf_reward": -17.218413441386772, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 9.96051025390625, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4638591408729553, - "step": 697, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000460624694824 - }, - { - "episode": 11184, - "epoch": 0.20102814825466442, - "loss/policy_avg": 1.1395208835601807, - "lr": 9.553936605316975e-06, - "objective/entropy": -101.63008880615234, - "objective/kl": 44.84364318847656, - "objective/non_score_reward": -4.4843645095825195, - "objective/rlhf_reward": -17.93745756149292, - "objective/scores": 0.0, - "policy/approxkl_avg": 35.57691192626953, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8001389503479004, - "step": 698, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9991685152053833 - }, - { - "episode": 11200, - "epoch": 0.20131574217205306, - "loss/policy_avg": 0.9136269092559814, - "lr": 9.553297546012272e-06, - "objective/entropy": 75.33238220214844, - "objective/kl": 40.499298095703125, - "objective/non_score_reward": -4.049930095672607, - "objective/rlhf_reward": -14.466386334101358, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 3.3566324710845947, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5737951993942261, - "step": 699, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9991101026535034 - }, - { - "episode": 11216, - "epoch": 0.2016033360894417, - "loss/policy_avg": 0.15398727357387543, - "lr": 9.552658486707569e-06, - "objective/entropy": -117.01931762695312, - "objective/kl": 45.11551284790039, - "objective/non_score_reward": -4.511551856994629, - "objective/rlhf_reward": -16.667604305831293, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 4.609723091125488, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5330109596252441, - "step": 700, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9992060661315918 - }, - { - "episode": 11232, - "epoch": 0.20189093000683037, - "loss/policy_avg": 0.9212744235992432, - "lr": 9.552019427402864e-06, - "objective/entropy": -53.3884391784668, - "objective/kl": 37.0269660949707, - "objective/non_score_reward": -3.7026968002319336, - "objective/rlhf_reward": -13.48527387145154, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 13.100292205810547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3935832977294922, - "step": 701, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9988267421722412 - }, - { - "episode": 11248, - "epoch": 0.202178523924219, - "loss/policy_avg": 0.03283894807100296, - "lr": 9.55138036809816e-06, - "objective/entropy": -55.63243865966797, - "objective/kl": 42.750938415527344, - "objective/non_score_reward": -4.275094032287598, - "objective/rlhf_reward": -14.176657830120298, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 40.57060241699219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5732653141021729, - "step": 702, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987488985061646 - }, - { - "episode": 11264, - "epoch": 0.20246611784160765, - "loss/policy_avg": 0.33969682455062866, - "lr": 9.550741308793456e-06, - "objective/entropy": -148.54368591308594, - "objective/kl": 44.687744140625, - "objective/non_score_reward": -4.468774795532227, - "objective/rlhf_reward": -17.87509775161743, - "objective/scores": 0.0, - "policy/approxkl_avg": 39.72189712524414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8945959806442261, - "step": 703, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9970632791519165 - }, - { - "episode": 11280, - "epoch": 0.20275371175899629, - "loss/policy_avg": 1.481357455253601, - "lr": 9.550102249488753e-06, - "objective/entropy": 104.76945495605469, - "objective/kl": 51.556732177734375, - "objective/non_score_reward": -5.155673503875732, - "objective/rlhf_reward": -19.066434710231377, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 52.006526947021484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6107035875320435, - "step": 704, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9983046054840088 - }, - { - "episode": 11296, - "epoch": 0.20304130567638495, - "loss/policy_avg": 0.9144167900085449, - "lr": 9.54946319018405e-06, - "objective/entropy": 123.02681732177734, - "objective/kl": 30.473539352416992, - "objective/non_score_reward": -3.047353982925415, - "objective/rlhf_reward": -10.765583594043818, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 23.10242462158203, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8819725513458252, - "step": 705, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0043578147888184 - }, - { - "episode": 11312, - "epoch": 0.2033288995937736, - "loss/policy_avg": 0.05131208896636963, - "lr": 9.548824130879346e-06, - "objective/entropy": 10.1641845703125, - "objective/kl": 39.38353729248047, - "objective/non_score_reward": -3.938354015350342, - "objective/rlhf_reward": -14.427902731925172, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 8.995450019836426, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8428833484649658, - "step": 706, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0014190673828125 - }, - { - "episode": 11328, - "epoch": 0.20361649351116223, - "loss/policy_avg": -0.4441620409488678, - "lr": 9.548185071574643e-06, - "objective/entropy": -176.51174926757812, - "objective/kl": 46.960792541503906, - "objective/non_score_reward": -4.696079254150391, - "objective/rlhf_reward": -17.360484202106562, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 0.7873663306236267, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7117265462875366, - "step": 707, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002187490463257 - }, - { - "episode": 11344, - "epoch": 0.20390408742855087, - "loss/policy_avg": 1.6518831253051758, - "lr": 9.547546012269938e-06, - "objective/entropy": -109.38287353515625, - "objective/kl": 40.430946350097656, - "objective/non_score_reward": -4.043094635009766, - "objective/rlhf_reward": -13.772377824783327, - "objective/scores": 0.6, - "policy/approxkl_avg": 37.53660583496094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5296775102615356, - "step": 708, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9980195760726929 - }, - { - "episode": 11360, - "epoch": 0.20419168134593954, - "loss/policy_avg": 0.0021621547639369965, - "lr": 9.546906952965235e-06, - "objective/entropy": -221.2525177001953, - "objective/kl": 41.13963317871094, - "objective/non_score_reward": -4.113963603973389, - "objective/rlhf_reward": -14.722521559397379, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 14.289579391479492, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5872384309768677, - "step": 709, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0001235008239746 - }, - { - "episode": 11376, - "epoch": 0.20447927526332818, - "loss/policy_avg": -0.4720730483531952, - "lr": 9.546267893660532e-06, - "objective/entropy": -227.78741455078125, - "objective/kl": 31.431976318359375, - "objective/non_score_reward": -3.143197774887085, - "objective/rlhf_reward": -10.910931830824005, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 5.8426594734191895, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.8071937561035156, - "step": 710, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0169754028320312 - }, - { - "episode": 11392, - "epoch": 0.20476686918071682, - "loss/policy_avg": 1.3163057565689087, - "lr": 9.545628834355829e-06, - "objective/entropy": 129.99642944335938, - "objective/kl": 35.20526885986328, - "objective/non_score_reward": -3.5205271244049072, - "objective/rlhf_reward": -14.08210825920105, - "objective/scores": 0.0, - "policy/approxkl_avg": 16.019872665405273, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49222731590270996, - "step": 711, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999373435974121 - }, - { - "episode": 11408, - "epoch": 0.2050544630981055, - "loss/policy_avg": 0.7439494132995605, - "lr": 9.544989775051126e-06, - "objective/entropy": -157.7192840576172, - "objective/kl": 38.970420837402344, - "objective/non_score_reward": -3.8970417976379395, - "objective/rlhf_reward": -14.107215288098217, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 12.206151962280273, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5406340956687927, - "step": 712, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999123215675354 - }, - { - "episode": 11424, - "epoch": 0.20534205701549413, - "loss/policy_avg": -0.07283779978752136, - "lr": 9.544350715746423e-06, - "objective/entropy": -228.50950622558594, - "objective/kl": 39.152793884277344, - "objective/non_score_reward": -3.9152798652648926, - "objective/rlhf_reward": -14.3194828539187, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 37.80413818359375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6945517659187317, - "step": 713, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.001655101776123 - }, - { - "episode": 11440, - "epoch": 0.20562965093288277, - "loss/policy_avg": -0.2732037901878357, - "lr": 9.543711656441718e-06, - "objective/entropy": -12.787384033203125, - "objective/kl": 35.518089294433594, - "objective/non_score_reward": -3.5518088340759277, - "objective/rlhf_reward": -14.207236051559448, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.075851440429688, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.48810243606567383, - "step": 714, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0166308879852295 - }, - { - "episode": 11456, - "epoch": 0.2059172448502714, - "loss/policy_avg": 0.43229636549949646, - "lr": 9.543072597137015e-06, - "objective/entropy": -66.50861358642578, - "objective/kl": 45.58790969848633, - "objective/non_score_reward": -4.558791160583496, - "objective/rlhf_reward": -15.311445151210997, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 3.3843135833740234, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6485980749130249, - "step": 715, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 12, - "val/ratio": 2.0008926391601562 - }, - { - "episode": 11472, - "epoch": 0.20620483876766008, - "loss/policy_avg": 0.8147934079170227, - "lr": 9.542433537832312e-06, - "objective/entropy": -161.17880249023438, - "objective/kl": 46.83830261230469, - "objective/non_score_reward": -4.683830738067627, - "objective/rlhf_reward": -16.612616243139776, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 18.456661224365234, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5038250684738159, - "step": 716, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985175132751465 - }, - { - "episode": 11488, - "epoch": 0.20649243268504872, - "loss/policy_avg": 0.2672369182109833, - "lr": 9.541794478527609e-06, - "objective/entropy": -305.15264892578125, - "objective/kl": 38.918190002441406, - "objective/non_score_reward": -3.8918187618255615, - "objective/rlhf_reward": -13.90541625541507, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 23.703933715820312, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6772634983062744, - "step": 717, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990991353988647 - }, - { - "episode": 11504, - "epoch": 0.20678002660243736, - "loss/policy_avg": 0.6788185238838196, - "lr": 9.541155419222906e-06, - "objective/entropy": -22.62152862548828, - "objective/kl": 48.63094711303711, - "objective/non_score_reward": -4.863094329833984, - "objective/rlhf_reward": -18.052378273010255, - "objective/scores": 0.35, - "policy/approxkl_avg": 66.53858184814453, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5422554016113281, - "step": 718, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9975666999816895 - }, - { - "episode": 11520, - "epoch": 0.207067620519826, - "loss/policy_avg": 1.2419401407241821, - "lr": 9.5405163599182e-06, - "objective/entropy": -44.057044982910156, - "objective/kl": 47.82793426513672, - "objective/non_score_reward": -4.7827935218811035, - "objective/rlhf_reward": -17.70734198828515, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 63.91344451904297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4672902226448059, - "step": 719, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9976657629013062 - }, - { - "episode": 11536, - "epoch": 0.20735521443721466, - "loss/policy_avg": 0.23812846839427948, - "lr": 9.539877300613498e-06, - "objective/entropy": 14.730127334594727, - "objective/kl": 36.89497375488281, - "objective/non_score_reward": -3.689497470855713, - "objective/rlhf_reward": -13.15386942392977, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 64.03750610351562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5559313297271729, - "step": 720, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9973509311676025 - }, - { - "episode": 11552, - "epoch": 0.2076428083546033, - "loss/policy_avg": 2.798316478729248, - "lr": 9.539238241308795e-06, - "objective/entropy": -49.72710418701172, - "objective/kl": 45.801639556884766, - "objective/non_score_reward": -4.580163955688477, - "objective/rlhf_reward": -16.920656538009645, - "objective/scores": 0.35, - "policy/approxkl_avg": 15.340538024902344, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6497080326080322, - "step": 721, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989633560180664 - }, - { - "episode": 11568, - "epoch": 0.20793040227199194, - "loss/policy_avg": 0.8752083778381348, - "lr": 9.538599182004091e-06, - "objective/entropy": -125.80049133300781, - "objective/kl": 32.64875793457031, - "objective/non_score_reward": -3.264875888824463, - "objective/rlhf_reward": -11.733989987403078, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 67.2957534790039, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5858502984046936, - "step": 722, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998004674911499 - }, - { - "episode": 11584, - "epoch": 0.20821799618938058, - "loss/policy_avg": -0.8605256080627441, - "lr": 9.537960122699387e-06, - "objective/entropy": 32.59730529785156, - "objective/kl": 46.67155075073242, - "objective/non_score_reward": -4.667154788970947, - "objective/rlhf_reward": -18.66861915588379, - "objective/scores": 0.0, - "policy/approxkl_avg": 139.18896484375, - "policy/clipfrac_avg": 2.0, - "policy/entropy_avg": 0.39150160551071167, - "step": 723, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0014235973358154 - }, - { - "episode": 11600, - "epoch": 0.20850559010676925, - "loss/policy_avg": 1.036819577217102, - "lr": 9.537321063394683e-06, - "objective/entropy": -182.33963012695312, - "objective/kl": 40.85021209716797, - "objective/non_score_reward": -4.085021018981934, - "objective/rlhf_reward": -14.99844866087976, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 7.50022554397583, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5750303268432617, - "step": 724, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982917308807373 - }, - { - "episode": 11616, - "epoch": 0.2087931840241579, - "loss/policy_avg": 0.3512716591358185, - "lr": 9.53668200408998e-06, - "objective/entropy": -271.77294921875, - "objective/kl": 46.54193878173828, - "objective/non_score_reward": -4.654193878173828, - "objective/rlhf_reward": -15.693056498409483, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 35.207305908203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5233364105224609, - "step": 725, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9976844787597656 - }, - { - "episode": 11632, - "epoch": 0.20908077794154653, - "loss/policy_avg": -0.24283993244171143, - "lr": 9.536042944785277e-06, - "objective/entropy": 59.02741241455078, - "objective/kl": 31.218732833862305, - "objective/non_score_reward": -3.121873378753662, - "objective/rlhf_reward": -9.56377426231024, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 13.003084182739258, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9299765825271606, - "step": 726, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0005626678466797 - }, - { - "episode": 11648, - "epoch": 0.20936837185893517, - "loss/policy_avg": 1.312835931777954, - "lr": 9.535403885480572e-06, - "objective/entropy": -175.0522918701172, - "objective/kl": 53.506343841552734, - "objective/non_score_reward": -5.3506340980529785, - "objective/rlhf_reward": -19.95193825206314, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 26.03130340576172, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7993600368499756, - "step": 727, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9979145526885986 - }, - { - "episode": 11664, - "epoch": 0.20965596577632384, - "loss/policy_avg": 2.228332042694092, - "lr": 9.53476482617587e-06, - "objective/entropy": -13.038238525390625, - "objective/kl": 29.668231964111328, - "objective/non_score_reward": -2.9668235778808594, - "objective/rlhf_reward": -8.943575863481733, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 58.691162109375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5734894275665283, - "step": 728, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984681606292725 - }, - { - "episode": 11680, - "epoch": 0.20994355969371248, - "loss/policy_avg": 1.2234901189804077, - "lr": 9.534125766871166e-06, - "objective/entropy": -52.32987594604492, - "objective/kl": 42.713897705078125, - "objective/non_score_reward": -4.271389961242676, - "objective/rlhf_reward": -15.481440577570517, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 105.13751220703125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6829936504364014, - "step": 729, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984365701675415 - }, - { - "episode": 11696, - "epoch": 0.21023115361110112, - "loss/policy_avg": 0.6054737567901611, - "lr": 9.533486707566463e-06, - "objective/entropy": -4.811004638671875, - "objective/kl": 38.744468688964844, - "objective/non_score_reward": -3.8744468688964844, - "objective/rlhf_reward": -13.893667016092856, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 13.6554536819458, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5291285514831543, - "step": 730, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9962334632873535 - }, - { - "episode": 11712, - "epoch": 0.21051874752848979, - "loss/policy_avg": 0.2636476159095764, - "lr": 9.53284764826176e-06, - "objective/entropy": -290.16064453125, - "objective/kl": 34.78122329711914, - "objective/non_score_reward": -3.4781219959259033, - "objective/rlhf_reward": -12.431535365994336, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 5.398682594299316, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.40512436628341675, - "step": 731, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987027645111084 - }, - { - "episode": 11728, - "epoch": 0.21080634144587843, - "loss/policy_avg": 0.08485618978738785, - "lr": 9.532208588957055e-06, - "objective/entropy": 4.361198425292969, - "objective/kl": 48.85911178588867, - "objective/non_score_reward": -4.88591194152832, - "objective/rlhf_reward": -18.06269371789253, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 24.168426513671875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7394464015960693, - "step": 732, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998630166053772 - }, - { - "episode": 11744, - "epoch": 0.21109393536326707, - "loss/policy_avg": 0.6081631183624268, - "lr": 9.531569529652352e-06, - "objective/entropy": -193.95896911621094, - "objective/kl": 41.25489044189453, - "objective/non_score_reward": -4.125488758087158, - "objective/rlhf_reward": -14.101954555511476, - "objective/scores": 0.6, - "policy/approxkl_avg": 10.96660041809082, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4481828808784485, - "step": 733, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9984350204467773 - }, - { - "episode": 11760, - "epoch": 0.2113815292806557, - "loss/policy_avg": 0.1706855297088623, - "lr": 9.530930470347649e-06, - "objective/entropy": -283.3249816894531, - "objective/kl": 36.63468933105469, - "objective/non_score_reward": -3.663468599319458, - "objective/rlhf_reward": -13.275272228804928, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.8939387798309326, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6289054155349731, - "step": 734, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9998714923858643 - }, - { - "episode": 11776, - "epoch": 0.21166912319804437, - "loss/policy_avg": 0.006064563989639282, - "lr": 9.530291411042946e-06, - "objective/entropy": -281.0845947265625, - "objective/kl": 40.41436767578125, - "objective/non_score_reward": -4.041437149047852, - "objective/rlhf_reward": -14.340918655666421, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.309089660644531, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9609556794166565, - "step": 735, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 12, - "val/ratio": 2.001737594604492 - }, - { - "episode": 11792, - "epoch": 0.211956717115433, - "loss/policy_avg": 0.9804132580757141, - "lr": 9.529652351738243e-06, - "objective/entropy": -149.67555236816406, - "objective/kl": 33.81299591064453, - "objective/non_score_reward": -3.3812994956970215, - "objective/rlhf_reward": -12.009426915439303, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 18.63791275024414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8183978199958801, - "step": 736, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9984934329986572 - }, - { - "episode": 11808, - "epoch": 0.21224431103282165, - "loss/policy_avg": 0.8149501085281372, - "lr": 9.52901329243354e-06, - "objective/entropy": -135.65036010742188, - "objective/kl": 38.22229766845703, - "objective/non_score_reward": -3.8222296237945557, - "objective/rlhf_reward": -13.94728236487451, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 56.22487258911133, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.49448803067207336, - "step": 737, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000056743621826 - }, - { - "episode": 11824, - "epoch": 0.2125319049502103, - "loss/policy_avg": 0.9543494582176208, - "lr": 9.528374233128835e-06, - "objective/entropy": -111.98321533203125, - "objective/kl": 40.26172637939453, - "objective/non_score_reward": -4.026172637939453, - "objective/rlhf_reward": -14.680858810146418, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 11.283975601196289, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5840786695480347, - "step": 738, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9993596076965332 - }, - { - "episode": 11840, - "epoch": 0.21281949886759896, - "loss/policy_avg": -0.20723594725131989, - "lr": 9.527735173824132e-06, - "objective/entropy": 9.0609130859375, - "objective/kl": 34.74554443359375, - "objective/non_score_reward": -3.4745540618896484, - "objective/rlhf_reward": -12.447618345828399, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.519084930419922, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7199011445045471, - "step": 739, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001832962036133 - }, - { - "episode": 11856, - "epoch": 0.2131070927849876, - "loss/policy_avg": -0.20900213718414307, - "lr": 9.527096114519428e-06, - "objective/entropy": -103.92984008789062, - "objective/kl": 39.744930267333984, - "objective/non_score_reward": -3.9744927883148193, - "objective/rlhf_reward": -14.447373251529083, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 32.00751495361328, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.520116925239563, - "step": 740, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.0052452087402344 - }, - { - "episode": 11872, - "epoch": 0.21339468670237624, - "loss/policy_avg": 0.3564949631690979, - "lr": 9.526457055214725e-06, - "objective/entropy": -143.9356689453125, - "objective/kl": 39.302146911621094, - "objective/non_score_reward": -3.9302148818969727, - "objective/rlhf_reward": -13.896030183109353, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 41.78466033935547, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.752386748790741, - "step": 741, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0004146099090576 - }, - { - "episode": 11888, - "epoch": 0.21368228061976488, - "loss/policy_avg": 0.565791130065918, - "lr": 9.525817995910022e-06, - "objective/entropy": -147.33612060546875, - "objective/kl": 42.210853576660156, - "objective/non_score_reward": -4.221085548400879, - "objective/rlhf_reward": -15.222481732786285, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 32.17230224609375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7177502512931824, - "step": 742, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9988508224487305 - }, - { - "episode": 11904, - "epoch": 0.21396987453715355, - "loss/policy_avg": 0.9356632232666016, - "lr": 9.525178936605317e-06, - "objective/entropy": -138.57948303222656, - "objective/kl": 25.448646545410156, - "objective/non_score_reward": -2.5448646545410156, - "objective/rlhf_reward": -8.800856449691159, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 29.832378387451172, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8654073476791382, - "step": 743, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.99726140499115 - }, - { - "episode": 11920, - "epoch": 0.2142574684545422, - "loss/policy_avg": 1.411858081817627, - "lr": 9.524539877300614e-06, - "objective/entropy": -45.33397674560547, - "objective/kl": 47.14906311035156, - "objective/non_score_reward": -4.714906692504883, - "objective/rlhf_reward": -14.459626293182374, - "objective/scores": 1.1, - "policy/approxkl_avg": 48.37995529174805, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4117211699485779, - "step": 744, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9974520206451416 - }, - { - "episode": 11936, - "epoch": 0.21454506237193083, - "loss/policy_avg": 0.2626647353172302, - "lr": 9.52390081799591e-06, - "objective/entropy": -90.43316650390625, - "objective/kl": 45.553260803222656, - "objective/non_score_reward": -4.555326461791992, - "objective/rlhf_reward": -15.297585879207823, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 4.135089874267578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5962470173835754, - "step": 745, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.000563383102417 - }, - { - "episode": 11952, - "epoch": 0.21483265628931947, - "loss/policy_avg": 0.709905743598938, - "lr": 9.523261758691206e-06, - "objective/entropy": -42.35087203979492, - "objective/kl": 41.68510055541992, - "objective/non_score_reward": -4.168510437011719, - "objective/rlhf_reward": -12.274040555953981, - "objective/scores": 1.1, - "policy/approxkl_avg": 26.280433654785156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8072315454483032, - "step": 746, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989625215530396 - }, - { - "episode": 11968, - "epoch": 0.21512025020670814, - "loss/policy_avg": -0.6861017942428589, - "lr": 9.522622699386503e-06, - "objective/entropy": 90.19775390625, - "objective/kl": 53.37855911254883, - "objective/non_score_reward": -5.337856292724609, - "objective/rlhf_reward": -19.97282347926269, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 2.6925954818725586, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4838978350162506, - "step": 747, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0017528533935547 - }, - { - "episode": 11984, - "epoch": 0.21540784412409678, - "loss/policy_avg": 1.1363091468811035, - "lr": 9.5219836400818e-06, - "objective/entropy": -8.687484741210938, - "objective/kl": 51.46209716796875, - "objective/non_score_reward": -5.146209716796875, - "objective/rlhf_reward": -19.22558971617071, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 25.820655822753906, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.8138981461524963, - "step": 748, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985830783843994 - }, - { - "episode": 12000, - "epoch": 0.21569543804148542, - "loss/policy_avg": 0.04534798115491867, - "lr": 9.521344580777097e-06, - "objective/entropy": -280.7192687988281, - "objective/kl": 37.07057189941406, - "objective/non_score_reward": -3.7070577144622803, - "objective/rlhf_reward": -13.404398520191279, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 47.663856506347656, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9239341020584106, - "step": 749, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996497631072998 - }, - { - "episode": 12016, - "epoch": 0.21598303195887408, - "loss/policy_avg": 0.0637999027967453, - "lr": 9.520705521472394e-06, - "objective/entropy": 149.28018188476562, - "objective/kl": 54.167938232421875, - "objective/non_score_reward": -5.4167938232421875, - "objective/rlhf_reward": -20.307925426696222, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 45.464962005615234, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7254760265350342, - "step": 750, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9995579719543457 - }, - { - "episode": 12032, - "epoch": 0.21627062587626272, - "loss/policy_avg": 1.039564609527588, - "lr": 9.520066462167689e-06, - "objective/entropy": -83.41825866699219, - "objective/kl": 43.81858825683594, - "objective/non_score_reward": -4.381858825683594, - "objective/rlhf_reward": -16.011662566455538, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 29.998737335205078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6821025609970093, - "step": 751, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9987900257110596 - }, - { - "episode": 12048, - "epoch": 0.21655821979365136, - "loss/policy_avg": 0.37849336862564087, - "lr": 9.519427402862986e-06, - "objective/entropy": 14.265073776245117, - "objective/kl": 56.944114685058594, - "objective/non_score_reward": -5.694411277770996, - "objective/rlhf_reward": -20.830234358982977, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 2.6588432788848877, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6893448233604431, - "step": 752, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000504493713379 - }, - { - "episode": 12064, - "epoch": 0.21684581371104, - "loss/policy_avg": 2.0812063217163086, - "lr": 9.518788343558283e-06, - "objective/entropy": 12.610939025878906, - "objective/kl": 42.907318115234375, - "objective/non_score_reward": -4.290732383728027, - "objective/rlhf_reward": -15.837415728598756, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 65.95945739746094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.38551682233810425, - "step": 753, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.998692274093628 - }, - { - "episode": 12080, - "epoch": 0.21713340762842867, - "loss/policy_avg": 1.4952073097229004, - "lr": 9.51814928425358e-06, - "objective/entropy": 51.34489440917969, - "objective/kl": 35.404083251953125, - "objective/non_score_reward": -3.540408134460449, - "objective/rlhf_reward": -12.783030369368891, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 9.171011924743652, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8167620301246643, - "step": 754, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.997989535331726 - }, - { - "episode": 12096, - "epoch": 0.2174210015458173, - "loss/policy_avg": 0.1273561418056488, - "lr": 9.517510224948877e-06, - "objective/entropy": -277.29986572265625, - "objective/kl": 29.701295852661133, - "objective/non_score_reward": -2.9701294898986816, - "objective/rlhf_reward": -10.538882306128173, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 26.458465576171875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5248850584030151, - "step": 755, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987549781799316 - }, - { - "episode": 12112, - "epoch": 0.21770859546320595, - "loss/policy_avg": 0.4003949761390686, - "lr": 9.516871165644172e-06, - "objective/entropy": -68.40557861328125, - "objective/kl": 39.301795959472656, - "objective/non_score_reward": -3.9301795959472656, - "objective/rlhf_reward": -14.320718860626222, - "objective/scores": 0.35, - "policy/approxkl_avg": 2.6001267433166504, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5238065719604492, - "step": 756, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9997713565826416 - }, - { - "episode": 12128, - "epoch": 0.2179961893805946, - "loss/policy_avg": 0.18117927014827728, - "lr": 9.516232106339469e-06, - "objective/entropy": -258.4045715332031, - "objective/kl": 43.05773162841797, - "objective/non_score_reward": -4.3057732582092285, - "objective/rlhf_reward": -15.707321727069552, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 51.23970031738281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4997139275074005, - "step": 757, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0000267028808594 - }, - { - "episode": 12144, - "epoch": 0.21828378329798326, - "loss/policy_avg": 0.421768456697464, - "lr": 9.515593047034765e-06, - "objective/entropy": -100.52609252929688, - "objective/kl": 46.79566192626953, - "objective/non_score_reward": -4.679566383361816, - "objective/rlhf_reward": -16.89343583134086, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 170.1147003173828, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3974299132823944, - "step": 758, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999988317489624 - }, - { - "episode": 12160, - "epoch": 0.2185713772153719, - "loss/policy_avg": 0.6913712024688721, - "lr": 9.514953987730062e-06, - "objective/entropy": 156.790283203125, - "objective/kl": 52.020015716552734, - "objective/non_score_reward": -5.202001571655273, - "objective/rlhf_reward": -17.884287272335264, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 12.406476974487305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6548702716827393, - "step": 759, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.998046636581421 - }, - { - "episode": 12176, - "epoch": 0.21885897113276054, - "loss/policy_avg": 1.3135895729064941, - "lr": 9.51431492842536e-06, - "objective/entropy": -228.68115234375, - "objective/kl": 26.70990562438965, - "objective/non_score_reward": -2.6709907054901123, - "objective/rlhf_reward": -9.342326930075316, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 11.945259094238281, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6857779026031494, - "step": 760, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0025386810302734 - }, - { - "episode": 12192, - "epoch": 0.21914656505014918, - "loss/policy_avg": 2.0080392360687256, - "lr": 9.513675869120656e-06, - "objective/entropy": -90.80921936035156, - "objective/kl": 38.32233428955078, - "objective/non_score_reward": -3.832233190536499, - "objective/rlhf_reward": -13.969682180617731, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 11.5086669921875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6320334076881409, - "step": 761, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999190330505371 - }, - { - "episode": 12208, - "epoch": 0.21943415896753785, - "loss/policy_avg": -0.11610303819179535, - "lr": 9.513036809815951e-06, - "objective/entropy": -58.6525764465332, - "objective/kl": 38.49602508544922, - "objective/non_score_reward": -3.849602222442627, - "objective/rlhf_reward": -14.056773713141112, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 1.3055446147918701, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4994346499443054, - "step": 762, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9999125003814697 - }, - { - "episode": 12224, - "epoch": 0.21972175288492649, - "loss/policy_avg": 1.3697398900985718, - "lr": 9.512397750511248e-06, - "objective/entropy": -144.068359375, - "objective/kl": 47.98112106323242, - "objective/non_score_reward": -4.798112392425537, - "objective/rlhf_reward": -17.069743337408575, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 17.99888801574707, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6316829919815063, - "step": 763, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998345971107483 - }, - { - "episode": 12240, - "epoch": 0.22000934680231513, - "loss/policy_avg": 0.1900498867034912, - "lr": 9.511758691206545e-06, - "objective/entropy": 36.60948181152344, - "objective/kl": 38.48204803466797, - "objective/non_score_reward": -3.8482046127319336, - "objective/rlhf_reward": -10.99281940460205, - "objective/scores": 1.1, - "policy/approxkl_avg": 25.719863891601562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7167200446128845, - "step": 764, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.998880386352539 - }, - { - "episode": 12256, - "epoch": 0.22029694071970377, - "loss/policy_avg": 0.38484030961990356, - "lr": 9.511119631901842e-06, - "objective/entropy": -45.415122985839844, - "objective/kl": 43.59566879272461, - "objective/non_score_reward": -4.359567165374756, - "objective/rlhf_reward": -16.11275533202283, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.580883026123047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4097760319709778, - "step": 765, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9987237453460693 - }, - { - "episode": 12272, - "epoch": 0.22058453463709243, - "loss/policy_avg": 0.376749187707901, - "lr": 9.510480572597139e-06, - "objective/entropy": -264.5468444824219, - "objective/kl": 40.1629638671875, - "objective/non_score_reward": -4.01629638671875, - "objective/rlhf_reward": -14.66518578529358, - "objective/scores": 0.35, - "policy/approxkl_avg": 10.952293395996094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.788668155670166, - "step": 766, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9980183839797974 - }, - { - "episode": 12288, - "epoch": 0.22087212855448107, - "loss/policy_avg": 0.8714499473571777, - "lr": 9.509841513292434e-06, - "objective/entropy": -187.86923217773438, - "objective/kl": 65.75520324707031, - "objective/non_score_reward": -6.5755205154418945, - "objective/rlhf_reward": -24.942833626006525, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 13.150325775146484, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5890084505081177, - "step": 767, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9972370862960815 - }, - { - "episode": 12304, - "epoch": 0.2211597224718697, - "loss/policy_avg": 0.38202425837516785, - "lr": 9.509202453987731e-06, - "objective/entropy": -204.2275848388672, - "objective/kl": 45.88398742675781, - "objective/non_score_reward": -4.5883989334106445, - "objective/rlhf_reward": -15.953595256805421, - "objective/scores": 0.6, - "policy/approxkl_avg": 98.07376098632812, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7801451086997986, - "step": 768, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9973288774490356 - }, - { - "episode": 12320, - "epoch": 0.22144731638925838, - "loss/policy_avg": 0.6291148066520691, - "lr": 9.508563394683026e-06, - "objective/entropy": -30.971599578857422, - "objective/kl": 47.970767974853516, - "objective/non_score_reward": -4.797077178955078, - "objective/rlhf_reward": -17.672535979541475, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 19.533828735351562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6176853775978088, - "step": 769, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9993345737457275 - }, - { - "episode": 12336, - "epoch": 0.22173491030664702, - "loss/policy_avg": 0.48620525002479553, - "lr": 9.507924335378323e-06, - "objective/entropy": -275.8260803222656, - "objective/kl": 35.916908264160156, - "objective/non_score_reward": -3.591691017150879, - "objective/rlhf_reward": -12.76264480120333, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 11.286559104919434, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6079502105712891, - "step": 770, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9984924793243408 - }, - { - "episode": 12352, - "epoch": 0.22202250422403566, - "loss/policy_avg": 1.610249638557434, - "lr": 9.50728527607362e-06, - "objective/entropy": 54.72114562988281, - "objective/kl": 56.84210205078125, - "objective/non_score_reward": -5.684210300445557, - "objective/rlhf_reward": -21.41132882598035, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 37.109683990478516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6080986857414246, - "step": 771, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999401569366455 - }, - { - "episode": 12368, - "epoch": 0.2223100981414243, - "loss/policy_avg": -0.243692547082901, - "lr": 9.506646216768917e-06, - "objective/entropy": -64.7589340209961, - "objective/kl": 49.16008758544922, - "objective/non_score_reward": -4.916008949279785, - "objective/rlhf_reward": -18.285433867064814, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 144.7684326171875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5061931610107422, - "step": 772, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9989681243896484 - }, - { - "episode": 12384, - "epoch": 0.22259769205881297, - "loss/policy_avg": 0.5371442437171936, - "lr": 9.506007157464214e-06, - "objective/entropy": -145.7498779296875, - "objective/kl": 36.19651412963867, - "objective/non_score_reward": -3.6196513175964355, - "objective/rlhf_reward": -12.962834203036959, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 8.433561325073242, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7705793380737305, - "step": 773, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.997005581855774 - }, - { - "episode": 12400, - "epoch": 0.2228852859762016, - "loss/policy_avg": 0.8214170336723328, - "lr": 9.50536809815951e-06, - "objective/entropy": -230.78439331054688, - "objective/kl": 45.08775329589844, - "objective/non_score_reward": -4.50877571105957, - "objective/rlhf_reward": -16.210273618969033, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 45.08744812011719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6854566335678101, - "step": 774, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9984679222106934 - }, - { - "episode": 12416, - "epoch": 0.22317287989359025, - "loss/policy_avg": 0.32185256481170654, - "lr": 9.504729038854806e-06, - "objective/entropy": 17.611534118652344, - "objective/kl": 39.12417984008789, - "objective/non_score_reward": -3.9124178886413574, - "objective/rlhf_reward": -13.824842984947274, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 23.315509796142578, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.719410240650177, - "step": 775, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994924068450928 - }, - { - "episode": 12432, - "epoch": 0.2234604738109789, - "loss/policy_avg": 0.8856257200241089, - "lr": 9.504089979550103e-06, - "objective/entropy": -192.63497924804688, - "objective/kl": 46.621360778808594, - "objective/non_score_reward": -4.662136077880859, - "objective/rlhf_reward": -18.648544788360596, - "objective/scores": 0.0, - "policy/approxkl_avg": 7.01698637008667, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.47388628125190735, - "step": 776, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9977457523345947 - }, - { - "episode": 12448, - "epoch": 0.22374806772836756, - "loss/policy_avg": -0.10767285525798798, - "lr": 9.5034509202454e-06, - "objective/entropy": -169.90748596191406, - "objective/kl": 40.091983795166016, - "objective/non_score_reward": -4.009198188781738, - "objective/rlhf_reward": -14.432673726145346, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.653290271759033, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4810905158519745, - "step": 777, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0010921955108643 - }, - { - "episode": 12464, - "epoch": 0.2240356616457562, - "loss/policy_avg": 0.5116205215454102, - "lr": 9.502811860940696e-06, - "objective/entropy": 17.38312530517578, - "objective/kl": 46.06462097167969, - "objective/non_score_reward": -4.606462478637695, - "objective/rlhf_reward": -17.066599333022516, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 26.975656509399414, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3496581017971039, - "step": 778, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9981449842453003 - }, - { - "episode": 12480, - "epoch": 0.22432325556314484, - "loss/policy_avg": -0.053467996418476105, - "lr": 9.502172801635993e-06, - "objective/entropy": -109.83203125, - "objective/kl": 53.02067565917969, - "objective/non_score_reward": -5.302067756652832, - "objective/rlhf_reward": -19.88275865080945, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 2.783937454223633, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5046157836914062, - "step": 779, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.00018310546875 - }, - { - "episode": 12496, - "epoch": 0.22461084948053348, - "loss/policy_avg": 0.20323413610458374, - "lr": 9.50153374233129e-06, - "objective/entropy": -75.36346435546875, - "objective/kl": 52.81346893310547, - "objective/non_score_reward": -5.281346797943115, - "objective/rlhf_reward": -16.725387191772462, - "objective/scores": 1.1, - "policy/approxkl_avg": 22.060535430908203, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.367484986782074, - "step": 780, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0009407997131348 - }, - { - "episode": 12512, - "epoch": 0.22489844339792214, - "loss/policy_avg": 0.4564368724822998, - "lr": 9.500894683026585e-06, - "objective/entropy": -37.82079315185547, - "objective/kl": 40.027137756347656, - "objective/non_score_reward": -4.002713203430176, - "objective/rlhf_reward": -13.61085424423218, - "objective/scores": 0.6, - "policy/approxkl_avg": 1.6412748098373413, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6870338320732117, - "step": 781, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000164270401001 - }, - { - "episode": 12528, - "epoch": 0.22518603731531078, - "loss/policy_avg": 0.059905484318733215, - "lr": 9.500255623721882e-06, - "objective/entropy": -47.8739013671875, - "objective/kl": 48.443641662597656, - "objective/non_score_reward": -4.844364166259766, - "objective/rlhf_reward": -17.430046866612372, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 14.049509048461914, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6184900999069214, - "step": 782, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0001115798950195 - }, - { - "episode": 12544, - "epoch": 0.22547363123269942, - "loss/policy_avg": -0.06788864731788635, - "lr": 9.499616564417179e-06, - "objective/entropy": -196.17083740234375, - "objective/kl": 36.355308532714844, - "objective/non_score_reward": -3.635531187057495, - "objective/rlhf_reward": -13.216611895590944, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 22.939311981201172, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6597442626953125, - "step": 783, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9999268054962158 - }, - { - "episode": 12560, - "epoch": 0.22576122515008806, - "loss/policy_avg": 0.66241455078125, - "lr": 9.498977505112476e-06, - "objective/entropy": -202.2138671875, - "objective/kl": 51.065711975097656, - "objective/non_score_reward": -5.106571197509766, - "objective/rlhf_reward": -19.002451498706904, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 1.2135827541351318, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4113037586212158, - "step": 784, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.001005172729492 - }, - { - "episode": 12576, - "epoch": 0.22604881906747673, - "loss/policy_avg": 1.2668356895446777, - "lr": 9.498338445807773e-06, - "objective/entropy": 59.70518493652344, - "objective/kl": 40.02257537841797, - "objective/non_score_reward": -4.002257823944092, - "objective/rlhf_reward": -14.585199673374262, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 21.518054962158203, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.600081205368042, - "step": 785, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000067949295044 - }, - { - "episode": 12592, - "epoch": 0.22633641298486537, - "loss/policy_avg": 0.017837971448898315, - "lr": 9.497699386503068e-06, - "objective/entropy": -106.43355560302734, - "objective/kl": 45.42462921142578, - "objective/non_score_reward": -4.542463302612305, - "objective/rlhf_reward": -16.828217556982665, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 1.8331397771835327, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4324771463871002, - "step": 786, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0006937980651855 - }, - { - "episode": 12608, - "epoch": 0.226624006902254, - "loss/policy_avg": 1.1101531982421875, - "lr": 9.497060327198365e-06, - "objective/entropy": -100.28545379638672, - "objective/kl": 38.377403259277344, - "objective/non_score_reward": -3.837740898132324, - "objective/rlhf_reward": -14.009327462225585, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 21.72365951538086, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8042199611663818, - "step": 787, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9995393753051758 - }, - { - "episode": 12624, - "epoch": 0.22691160081964268, - "loss/policy_avg": 0.9154256582260132, - "lr": 9.496421267893662e-06, - "objective/entropy": -13.352066040039062, - "objective/kl": 41.567588806152344, - "objective/non_score_reward": -4.156759262084961, - "objective/rlhf_reward": -15.176439146609649, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 3.859703540802002, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.7200378775596619, - "step": 788, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981780052185059 - }, - { - "episode": 12640, - "epoch": 0.22719919473703132, - "loss/policy_avg": 4.065809726715088, - "lr": 9.495782208588959e-06, - "objective/entropy": -88.02486419677734, - "objective/kl": 41.876869201660156, - "objective/non_score_reward": -4.187687397003174, - "objective/rlhf_reward": -15.409113696127562, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 10.830172538757324, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.42192888259887695, - "step": 789, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001798152923584 - }, - { - "episode": 12656, - "epoch": 0.22748678865441996, - "loss/policy_avg": 1.4616761207580566, - "lr": 9.495143149284254e-06, - "objective/entropy": 12.189279556274414, - "objective/kl": 44.4468994140625, - "objective/non_score_reward": -4.444689750671387, - "objective/rlhf_reward": -16.453245434790773, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 83.95680236816406, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7362450957298279, - "step": 790, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9999191761016846 - }, - { - "episode": 12672, - "epoch": 0.2277743825718086, - "loss/policy_avg": 0.05288725346326828, - "lr": 9.49450408997955e-06, - "objective/entropy": 31.454437255859375, - "objective/kl": 40.4713020324707, - "objective/non_score_reward": -4.047130107879639, - "objective/rlhf_reward": -13.264801417232725, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 4.5849409103393555, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.9175825119018555, - "step": 791, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000244140625 - }, - { - "episode": 12688, - "epoch": 0.22806197648919727, - "loss/policy_avg": 0.20884034037590027, - "lr": 9.493865030674848e-06, - "objective/entropy": 129.760986328125, - "objective/kl": 42.595794677734375, - "objective/non_score_reward": -4.259579658508301, - "objective/rlhf_reward": -15.30498506228129, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 18.363981246948242, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.688461184501648, - "step": 792, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999577522277832 - }, - { - "episode": 12704, - "epoch": 0.2283495704065859, - "loss/policy_avg": 0.10811804234981537, - "lr": 9.493225971370144e-06, - "objective/entropy": -88.04064178466797, - "objective/kl": 41.53803253173828, - "objective/non_score_reward": -4.153803825378418, - "objective/rlhf_reward": -15.215214347839357, - "objective/scores": 0.35, - "policy/approxkl_avg": 79.52337646484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7973237037658691, - "step": 793, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9963350296020508 - }, - { - "episode": 12720, - "epoch": 0.22863716432397455, - "loss/policy_avg": 1.1484191417694092, - "lr": 9.49258691206544e-06, - "objective/entropy": 142.89540100097656, - "objective/kl": 47.12466049194336, - "objective/non_score_reward": -4.712466239929199, - "objective/rlhf_reward": -17.49061485502569, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 23.96436882019043, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.38031205534935, - "step": 794, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9986920356750488 - }, - { - "episode": 12736, - "epoch": 0.22892475824136319, - "loss/policy_avg": -0.1955292522907257, - "lr": 9.491947852760736e-06, - "objective/entropy": -43.12257385253906, - "objective/kl": 50.86421203613281, - "objective/non_score_reward": -5.086421012878418, - "objective/rlhf_reward": -18.398272822575507, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 0.7069367170333862, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.529199481010437, - "step": 795, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0014419555664062 - }, - { - "episode": 12752, - "epoch": 0.22921235215875185, - "loss/policy_avg": 0.12587346136569977, - "lr": 9.491308793456033e-06, - "objective/entropy": -115.25009155273438, - "objective/kl": 24.33888053894043, - "objective/non_score_reward": -2.4338879585266113, - "objective/rlhf_reward": -8.284953455539092, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 0.4508776366710663, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6505329012870789, - "step": 796, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0010581016540527 - }, - { - "episode": 12768, - "epoch": 0.2294999460761405, - "loss/policy_avg": 0.8157011270523071, - "lr": 9.49066973415133e-06, - "objective/entropy": -63.3074951171875, - "objective/kl": 36.60755920410156, - "objective/non_score_reward": -3.6607556343078613, - "objective/rlhf_reward": -13.283773624633234, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 144.46054077148438, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7697768211364746, - "step": 797, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990262985229492 - }, - { - "episode": 12784, - "epoch": 0.22978753999352913, - "loss/policy_avg": 0.22765851020812988, - "lr": 9.490030674846627e-06, - "objective/entropy": -187.6090545654297, - "objective/kl": 37.852291107177734, - "objective/non_score_reward": -3.7852296829223633, - "objective/rlhf_reward": -13.40758492151896, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 24.931396484375, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6455093622207642, - "step": 798, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9983055591583252 - }, - { - "episode": 12800, - "epoch": 0.23007513391091777, - "loss/policy_avg": 0.5693266987800598, - "lr": 9.489391615541922e-06, - "objective/entropy": -236.22152709960938, - "objective/kl": 38.294769287109375, - "objective/non_score_reward": -3.829477310180664, - "objective/rlhf_reward": -13.836956623013378, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 6.2298102378845215, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6318610906600952, - "step": 799, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0006356239318848 - }, - { - "episode": 12816, - "epoch": 0.23036272782830644, - "loss/policy_avg": 0.7033164501190186, - "lr": 9.488752556237219e-06, - "objective/entropy": 144.00018310546875, - "objective/kl": 39.03528594970703, - "objective/non_score_reward": -3.9035286903381348, - "objective/rlhf_reward": -13.789286489757608, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 4.471881866455078, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6523069143295288, - "step": 800, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.000467300415039 - }, - { - "episode": 12832, - "epoch": 0.23065032174569508, - "loss/policy_avg": 3.3897581100463867, - "lr": 9.488113496932516e-06, - "objective/entropy": 65.07791137695312, - "objective/kl": 33.08277130126953, - "objective/non_score_reward": -3.3082773685455322, - "objective/rlhf_reward": -11.717337691577608, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 5.002783298492432, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.46239709854125977, - "step": 801, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0012996196746826 - }, - { - "episode": 12848, - "epoch": 0.23093791566308372, - "loss/policy_avg": 0.06800729036331177, - "lr": 9.487474437627813e-06, - "objective/entropy": -189.33834838867188, - "objective/kl": 43.8226432800293, - "objective/non_score_reward": -4.382264614105225, - "objective/rlhf_reward": -16.1290584564209, - "objective/scores": 0.35, - "policy/approxkl_avg": 41.669837951660156, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6085605621337891, - "step": 802, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0008633136749268 - }, - { - "episode": 12864, - "epoch": 0.23122550958047236, - "loss/policy_avg": 0.5624558329582214, - "lr": 9.48683537832311e-06, - "objective/entropy": -188.01849365234375, - "objective/kl": 42.200706481933594, - "objective/non_score_reward": -4.220070838928223, - "objective/rlhf_reward": -15.42968450030838, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 14.247980117797852, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6492480635643005, - "step": 803, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999314308166504 - }, - { - "episode": 12880, - "epoch": 0.23151310349786103, - "loss/policy_avg": 1.8033558130264282, - "lr": 9.486196319018407e-06, - "objective/entropy": -126.62744140625, - "objective/kl": 41.638519287109375, - "objective/non_score_reward": -4.163851737976074, - "objective/rlhf_reward": -14.993547921598541, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 3.053473711013794, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5263950824737549, - "step": 804, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9994040727615356 - }, - { - "episode": 12896, - "epoch": 0.23180069741524967, - "loss/policy_avg": 0.5185251235961914, - "lr": 9.485557259713702e-06, - "objective/entropy": 25.416759490966797, - "objective/kl": 47.11408615112305, - "objective/non_score_reward": -4.711408615112305, - "objective/rlhf_reward": -17.32986172417038, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 9.99197769165039, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7477856874465942, - "step": 805, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9990828037261963 - }, - { - "episode": 12912, - "epoch": 0.2320882913326383, - "loss/policy_avg": 0.3774372935295105, - "lr": 9.484918200408999e-06, - "objective/entropy": 182.41983032226562, - "objective/kl": 56.3293342590332, - "objective/non_score_reward": -5.632933616638184, - "objective/rlhf_reward": -21.01596125343674, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 109.34550476074219, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6443891525268555, - "step": 806, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9989750385284424 - }, - { - "episode": 12928, - "epoch": 0.23237588525002698, - "loss/policy_avg": 0.4774811267852783, - "lr": 9.484279141104296e-06, - "objective/entropy": 19.027976989746094, - "objective/kl": 35.1290168762207, - "objective/non_score_reward": -3.512901782989502, - "objective/rlhf_reward": -12.495347588267876, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 3.8011960983276367, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.4730362296104431, - "step": 807, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.999955415725708 - }, - { - "episode": 12944, - "epoch": 0.23266347916741562, - "loss/policy_avg": 0.3621135354042053, - "lr": 9.483640081799592e-06, - "objective/entropy": 42.96700668334961, - "objective/kl": 50.60865020751953, - "objective/non_score_reward": -5.0608649253845215, - "objective/rlhf_reward": -18.792861084552154, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 4.702620029449463, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6127219796180725, - "step": 808, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9987151622772217 - }, - { - "episode": 12960, - "epoch": 0.23295107308480426, - "loss/policy_avg": 0.24576711654663086, - "lr": 9.48300102249489e-06, - "objective/entropy": -266.4676513671875, - "objective/kl": 42.15456771850586, - "objective/non_score_reward": -4.215456962585449, - "objective/rlhf_reward": -15.46182737350464, - "objective/scores": 0.35, - "policy/approxkl_avg": 1.908921718597412, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6165835857391357, - "step": 809, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9997169971466064 - }, - { - "episode": 12976, - "epoch": 0.2332386670021929, - "loss/policy_avg": 0.4992474317550659, - "lr": 9.482361963190185e-06, - "objective/entropy": 35.967933654785156, - "objective/kl": 50.87047576904297, - "objective/non_score_reward": -5.087048053741455, - "objective/rlhf_reward": -18.74407175547274, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 6.219294548034668, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6987083554267883, - "step": 810, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0008749961853027 - }, - { - "episode": 12992, - "epoch": 0.23352626091958156, - "loss/policy_avg": 0.47408565878868103, - "lr": 9.481722903885481e-06, - "objective/entropy": 42.86602783203125, - "objective/kl": 46.878440856933594, - "objective/non_score_reward": -4.687844276428223, - "objective/rlhf_reward": -14.35137782096863, - "objective/scores": 1.1, - "policy/approxkl_avg": 35.51860046386719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3832439184188843, - "step": 811, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0014867782592773 - }, - { - "episode": 13008, - "epoch": 0.2338138548369702, - "loss/policy_avg": 0.1151239275932312, - "lr": 9.481083844580777e-06, - "objective/entropy": -227.46157836914062, - "objective/kl": 39.24964904785156, - "objective/non_score_reward": -3.924964666366577, - "objective/rlhf_reward": -14.321256973830561, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 21.682876586914062, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5061876773834229, - "step": 812, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995512962341309 - }, - { - "episode": 13024, - "epoch": 0.23410144875435884, - "loss/policy_avg": 1.2807607650756836, - "lr": 9.480444785276073e-06, - "objective/entropy": 219.10873413085938, - "objective/kl": 60.6810188293457, - "objective/non_score_reward": -6.06810188293457, - "objective/rlhf_reward": -22.756634318622286, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 107.69674682617188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.541488528251648, - "step": 813, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9991263151168823 - }, - { - "episode": 13040, - "epoch": 0.23438904267174748, - "loss/policy_avg": 0.5082242488861084, - "lr": 9.47980572597137e-06, - "objective/entropy": -262.63189697265625, - "objective/kl": 41.65293884277344, - "objective/non_score_reward": -4.1652936935424805, - "objective/rlhf_reward": -15.180222394879223, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 41.77598571777344, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4324526786804199, - "step": 814, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.003247022628784 - }, - { - "episode": 13056, - "epoch": 0.23467663658913615, - "loss/policy_avg": -0.10035756230354309, - "lr": 9.479166666666667e-06, - "objective/entropy": 134.06137084960938, - "objective/kl": 44.254085540771484, - "objective/non_score_reward": -4.425408363342285, - "objective/rlhf_reward": -15.876805897029946, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 23.891380310058594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.621482253074646, - "step": 815, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0046777725219727 - }, - { - "episode": 13072, - "epoch": 0.2349642305065248, - "loss/policy_avg": 0.27624276280403137, - "lr": 9.478527607361964e-06, - "objective/entropy": -135.5484619140625, - "objective/kl": 44.93092346191406, - "objective/non_score_reward": -4.4930925369262695, - "objective/rlhf_reward": -16.147539730342935, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 1.5283265113830566, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6181260347366333, - "step": 816, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000763416290283 - }, - { - "episode": 13088, - "epoch": 0.23525182442391343, - "loss/policy_avg": 0.5251176357269287, - "lr": 9.477888548057261e-06, - "objective/entropy": -254.23443603515625, - "objective/kl": 47.79570770263672, - "objective/non_score_reward": -4.779571533203125, - "objective/rlhf_reward": -17.739683725921015, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 1.8940521478652954, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5922014713287354, - "step": 817, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.001004934310913 - }, - { - "episode": 13104, - "epoch": 0.23553941834130207, - "loss/policy_avg": 0.9870522022247314, - "lr": 9.477249488752556e-06, - "objective/entropy": 18.714473724365234, - "objective/kl": 32.423614501953125, - "objective/non_score_reward": -3.242361545562744, - "objective/rlhf_reward": -11.545614500244227, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 83.56348419189453, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4189128279685974, - "step": 818, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0020997524261475 - }, - { - "episode": 13120, - "epoch": 0.23582701225869074, - "loss/policy_avg": 0.4610193073749542, - "lr": 9.476610429447853e-06, - "objective/entropy": -88.04772186279297, - "objective/kl": 39.214500427246094, - "objective/non_score_reward": -3.921450138092041, - "objective/rlhf_reward": -14.285800552368164, - "objective/scores": 0.35, - "policy/approxkl_avg": 13.349451065063477, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5404248237609863, - "step": 819, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9971152544021606 - }, - { - "episode": 13136, - "epoch": 0.23611460617607938, - "loss/policy_avg": 0.6160886287689209, - "lr": 9.47597137014315e-06, - "objective/entropy": -271.970458984375, - "objective/kl": 37.36911392211914, - "objective/non_score_reward": -3.7369112968444824, - "objective/rlhf_reward": -13.523813803394404, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 18.014442443847656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6574091911315918, - "step": 820, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9968414306640625 - }, - { - "episode": 13152, - "epoch": 0.23640220009346802, - "loss/policy_avg": 4.127373695373535, - "lr": 9.475332310838447e-06, - "objective/entropy": 22.463993072509766, - "objective/kl": 45.64144515991211, - "objective/non_score_reward": -4.564144611358643, - "objective/rlhf_reward": -16.523244873682657, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 48.066158294677734, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.48151886463165283, - "step": 821, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0003304481506348 - }, - { - "episode": 13168, - "epoch": 0.23668979401085666, - "loss/policy_avg": 0.785558819770813, - "lr": 9.474693251533744e-06, - "objective/entropy": -93.47967529296875, - "objective/kl": 45.30268096923828, - "objective/non_score_reward": -4.530268669128418, - "objective/rlhf_reward": -16.79556087020032, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 50.31500244140625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4456283450126648, - "step": 822, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997105360031128 - }, - { - "episode": 13184, - "epoch": 0.23697738792824533, - "loss/policy_avg": 1.3671177625656128, - "lr": 9.474054192229039e-06, - "objective/entropy": -68.2393798828125, - "objective/kl": 52.268829345703125, - "objective/non_score_reward": -5.2268829345703125, - "objective/rlhf_reward": -18.507531261444093, - "objective/scores": 0.6, - "policy/approxkl_avg": 21.187942504882812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5112272500991821, - "step": 823, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0015244483947754 - }, - { - "episode": 13200, - "epoch": 0.23726498184563397, - "loss/policy_avg": 1.411192536354065, - "lr": 9.473415132924336e-06, - "objective/entropy": 139.8941192626953, - "objective/kl": 52.27011489868164, - "objective/non_score_reward": -5.227011680603027, - "objective/rlhf_reward": -19.50804648399353, - "objective/scores": 0.35, - "policy/approxkl_avg": 10.510537147521973, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.588684618473053, - "step": 824, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999821424484253 - }, - { - "episode": 13216, - "epoch": 0.2375525757630226, - "loss/policy_avg": 2.240239143371582, - "lr": 9.472776073619633e-06, - "objective/entropy": 27.470577239990234, - "objective/kl": 59.331642150878906, - "objective/non_score_reward": -5.933164119720459, - "objective/rlhf_reward": -20.808937226177427, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 7.889102458953857, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.567642331123352, - "step": 825, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986729621887207 - }, - { - "episode": 13232, - "epoch": 0.23784016968041127, - "loss/policy_avg": 0.21238625049591064, - "lr": 9.47213701431493e-06, - "objective/entropy": -97.9186782836914, - "objective/kl": 42.333152770996094, - "objective/non_score_reward": -4.2333149909973145, - "objective/rlhf_reward": -15.482662062259063, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.206239700317383, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5994842052459717, - "step": 826, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0001232624053955 - }, - { - "episode": 13248, - "epoch": 0.2381277635977999, - "loss/policy_avg": 0.1261046975851059, - "lr": 9.471497955010226e-06, - "objective/entropy": -33.41431427001953, - "objective/kl": 44.24114990234375, - "objective/non_score_reward": -4.42411470413208, - "objective/rlhf_reward": -14.772739563823912, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 17.684558868408203, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6327311396598816, - "step": 827, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.000532865524292 - }, - { - "episode": 13264, - "epoch": 0.23841535751518855, - "loss/policy_avg": 0.5212262868881226, - "lr": 9.470858895705523e-06, - "objective/entropy": -101.5107192993164, - "objective/kl": 46.10678482055664, - "objective/non_score_reward": -4.610678672790527, - "objective/rlhf_reward": -16.838595185343344, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 15.35481071472168, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5887008905410767, - "step": 828, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9996135234832764 - }, - { - "episode": 13280, - "epoch": 0.2387029514325772, - "loss/policy_avg": 0.6297311782836914, - "lr": 9.470219836400818e-06, - "objective/entropy": 17.642807006835938, - "objective/kl": 45.446571350097656, - "objective/non_score_reward": -4.544657230377197, - "objective/rlhf_reward": -16.445295349756876, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 25.065134048461914, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6857582330703735, - "step": 829, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.999565839767456 - }, - { - "episode": 13296, - "epoch": 0.23899054534996586, - "loss/policy_avg": 1.2870714664459229, - "lr": 9.469580777096115e-06, - "objective/entropy": -112.94920349121094, - "objective/kl": 41.829681396484375, - "objective/non_score_reward": -4.182968616485596, - "objective/rlhf_reward": -14.331874227523805, - "objective/scores": 0.6, - "policy/approxkl_avg": 43.81898498535156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5229564905166626, - "step": 830, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994326829910278 - }, - { - "episode": 13312, - "epoch": 0.2392781392673545, - "loss/policy_avg": 1.0273782014846802, - "lr": 9.468941717791412e-06, - "objective/entropy": -7.699493408203125, - "objective/kl": 37.84484100341797, - "objective/non_score_reward": -3.7844836711883545, - "objective/rlhf_reward": -13.015229167715582, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 17.57620620727539, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4471469223499298, - "step": 831, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9978184700012207 - }, - { - "episode": 13328, - "epoch": 0.23956573318474314, - "loss/policy_avg": 0.7235412001609802, - "lr": 9.468302658486709e-06, - "objective/entropy": 106.56259155273438, - "objective/kl": 60.04669189453125, - "objective/non_score_reward": -6.004669189453125, - "objective/rlhf_reward": -22.640074827758173, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 64.07603454589844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5924360752105713, - "step": 832, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.997698426246643 - }, - { - "episode": 13344, - "epoch": 0.23985332710213178, - "loss/policy_avg": 0.26122790575027466, - "lr": 9.467663599182006e-06, - "objective/entropy": -15.060523986816406, - "objective/kl": 56.78717803955078, - "objective/non_score_reward": -5.678718090057373, - "objective/rlhf_reward": -21.373235753088622, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 123.58401489257812, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.44901490211486816, - "step": 833, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9980285167694092 - }, - { - "episode": 13360, - "epoch": 0.24014092101952045, - "loss/policy_avg": -0.059971150010824203, - "lr": 9.467024539877301e-06, - "objective/entropy": -209.9939422607422, - "objective/kl": 39.02555847167969, - "objective/non_score_reward": -3.9025564193725586, - "objective/rlhf_reward": -13.948365693510162, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.0669580698013306, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6912230253219604, - "step": 834, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0006463527679443 - }, - { - "episode": 13376, - "epoch": 0.2404285149369091, - "loss/policy_avg": 0.13007503747940063, - "lr": 9.466385480572598e-06, - "objective/entropy": 166.1580352783203, - "objective/kl": 43.581153869628906, - "objective/non_score_reward": -4.358116149902344, - "objective/rlhf_reward": -13.03246364593506, - "objective/scores": 1.1, - "policy/approxkl_avg": 8.686347007751465, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7475668787956238, - "step": 835, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001697540283203 - }, - { - "episode": 13392, - "epoch": 0.24071610885429773, - "loss/policy_avg": 0.07029886543750763, - "lr": 9.465746421267893e-06, - "objective/entropy": -160.5312957763672, - "objective/kl": 39.0033073425293, - "objective/non_score_reward": -3.9003307819366455, - "objective/rlhf_reward": -14.120369794781567, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 38.296531677246094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3903728723526001, - "step": 836, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9994573593139648 - }, - { - "episode": 13408, - "epoch": 0.24100370277168637, - "loss/policy_avg": -0.24450919032096863, - "lr": 9.46510736196319e-06, - "objective/entropy": 225.25355529785156, - "objective/kl": 36.30988693237305, - "objective/non_score_reward": -3.630988597869873, - "objective/rlhf_reward": -13.043001535351634, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 33.346378326416016, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5566978454589844, - "step": 837, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 2.000542163848877 - }, - { - "episode": 13424, - "epoch": 0.24129129668907504, - "loss/policy_avg": 0.28000321984291077, - "lr": 9.464468302658487e-06, - "objective/entropy": -151.39920043945312, - "objective/kl": 36.01958465576172, - "objective/non_score_reward": -3.6019582748413086, - "objective/rlhf_reward": -12.674499527613321, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 6.0946550369262695, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.2960823178291321, - "step": 838, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0004286766052246 - }, - { - "episode": 13440, - "epoch": 0.24157889060646368, - "loss/policy_avg": 1.0798143148422241, - "lr": 9.463829243353784e-06, - "objective/entropy": -37.02276611328125, - "objective/kl": 38.723487854003906, - "objective/non_score_reward": -3.8723487854003906, - "objective/rlhf_reward": -14.0655630423623, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 69.36178588867188, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5190849304199219, - "step": 839, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9978101253509521 - }, - { - "episode": 13456, - "epoch": 0.24186648452385232, - "loss/policy_avg": 0.8364774584770203, - "lr": 9.46319018404908e-06, - "objective/entropy": 62.59022521972656, - "objective/kl": 39.46584701538086, - "objective/non_score_reward": -3.946584939956665, - "objective/rlhf_reward": -14.444703629522948, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 19.215335845947266, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.44274285435676575, - "step": 840, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9990289211273193 - }, - { - "episode": 13472, - "epoch": 0.24215407844124096, - "loss/policy_avg": 0.9193323254585266, - "lr": 9.462551124744378e-06, - "objective/entropy": 101.40837097167969, - "objective/kl": 41.96873474121094, - "objective/non_score_reward": -4.196873664855957, - "objective/rlhf_reward": -15.336895804019317, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 9.205939292907715, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.28277623653411865, - "step": 841, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 0, - "val/ratio": 1.9998894929885864 - }, - { - "episode": 13488, - "epoch": 0.24244167235862962, - "loss/policy_avg": 0.03237959370017052, - "lr": 9.461912065439673e-06, - "objective/entropy": -146.46066284179688, - "objective/kl": 36.374385833740234, - "objective/non_score_reward": -3.6374387741088867, - "objective/rlhf_reward": -10.149754858016967, - "objective/scores": 1.1, - "policy/approxkl_avg": 0.5134851336479187, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3451271057128906, - "step": 842, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.002326726913452 - }, - { - "episode": 13504, - "epoch": 0.24272926627601826, - "loss/policy_avg": 0.12626682221889496, - "lr": 9.46127300613497e-06, - "objective/entropy": -148.53372192382812, - "objective/kl": 43.185630798339844, - "objective/non_score_reward": -4.318563461303711, - "objective/rlhf_reward": -14.87425241470337, - "objective/scores": 0.6, - "policy/approxkl_avg": 5.643270969390869, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6915034651756287, - "step": 843, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.01680326461792 - }, - { - "episode": 13520, - "epoch": 0.2430168601934069, - "loss/policy_avg": -0.19212225079536438, - "lr": 9.460633946830267e-06, - "objective/entropy": -70.89179229736328, - "objective/kl": 49.2342529296875, - "objective/non_score_reward": -4.923425197601318, - "objective/rlhf_reward": -15.293701267242433, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.8346972465515137, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.40477171540260315, - "step": 844, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0019445419311523 - }, - { - "episode": 13536, - "epoch": 0.24330445411079557, - "loss/policy_avg": 2.627497434616089, - "lr": 9.459994887525563e-06, - "objective/entropy": 129.54180908203125, - "objective/kl": 53.16524887084961, - "objective/non_score_reward": -5.316524505615234, - "objective/rlhf_reward": -18.866099452972414, - "objective/scores": 0.6, - "policy/approxkl_avg": 23.311412811279297, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5597689747810364, - "step": 845, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9994431734085083 - }, - { - "episode": 13552, - "epoch": 0.2435920480281842, - "loss/policy_avg": 0.1320989578962326, - "lr": 9.45935582822086e-06, - "objective/entropy": 152.18173217773438, - "objective/kl": 43.19120788574219, - "objective/non_score_reward": -4.319120407104492, - "objective/rlhf_reward": -15.451653595241616, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 2.6365280151367188, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.70622318983078, - "step": 846, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.003366470336914 - }, - { - "episode": 13568, - "epoch": 0.24387964194557285, - "loss/policy_avg": -0.20065978169441223, - "lr": 9.458716768916156e-06, - "objective/entropy": -52.614356994628906, - "objective/kl": 30.15610694885254, - "objective/non_score_reward": -3.015610694885254, - "objective/rlhf_reward": -10.736930403739137, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.9257869720458984, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.4547704756259918, - "step": 847, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0025835037231445 - }, - { - "episode": 13584, - "epoch": 0.2441672358629615, - "loss/policy_avg": 2.7808499336242676, - "lr": 9.458077709611452e-06, - "objective/entropy": -88.72569274902344, - "objective/kl": 41.056114196777344, - "objective/non_score_reward": -4.105611324310303, - "objective/rlhf_reward": -14.99861272116479, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 13.171248435974121, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.636700451374054, - "step": 848, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9982078075408936 - }, - { - "episode": 13600, - "epoch": 0.24445482978035016, - "loss/policy_avg": 0.17049476504325867, - "lr": 9.45743865030675e-06, - "objective/entropy": -14.615028381347656, - "objective/kl": 44.74107360839844, - "objective/non_score_reward": -4.47410774230957, - "objective/rlhf_reward": -16.380659901889498, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 23.5351619720459, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.42227408289909363, - "step": 849, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9997400045394897 - }, - { - "episode": 13616, - "epoch": 0.2447424236977388, - "loss/policy_avg": -0.5285428166389465, - "lr": 9.456799591002046e-06, - "objective/entropy": -304.7969970703125, - "objective/kl": 40.3505973815918, - "objective/non_score_reward": -4.035059928894043, - "objective/rlhf_reward": -14.814726147681398, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 14.866556167602539, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.5371728539466858, - "step": 850, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0025596618652344 - }, - { - "episode": 13632, - "epoch": 0.24503001761512744, - "loss/policy_avg": 0.7969543933868408, - "lr": 9.456160531697343e-06, - "objective/entropy": 73.470947265625, - "objective/kl": 38.98177719116211, - "objective/non_score_reward": -3.8981776237487793, - "objective/rlhf_reward": -13.859377400080362, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 9.109411239624023, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5251634120941162, - "step": 851, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0007219314575195 - }, - { - "episode": 13648, - "epoch": 0.24531761153251608, - "loss/policy_avg": 1.060788631439209, - "lr": 9.45552147239264e-06, - "objective/entropy": -220.52978515625, - "objective/kl": 39.64278793334961, - "objective/non_score_reward": -3.96427845954895, - "objective/rlhf_reward": -11.457114553451538, - "objective/scores": 1.1, - "policy/approxkl_avg": 20.378562927246094, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5584784150123596, - "step": 852, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.998639702796936 - }, - { - "episode": 13664, - "epoch": 0.24560520544990475, - "loss/policy_avg": 1.2480220794677734, - "lr": 9.454882413087935e-06, - "objective/entropy": 107.21774291992188, - "objective/kl": 45.63555908203125, - "objective/non_score_reward": -4.563555717468262, - "objective/rlhf_reward": -16.429394836696694, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 19.441165924072266, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.575886607170105, - "step": 853, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9971050024032593 - }, - { - "episode": 13680, - "epoch": 0.24589279936729339, - "loss/policy_avg": 0.4997670352458954, - "lr": 9.454243353783232e-06, - "objective/entropy": 55.699459075927734, - "objective/kl": 50.389007568359375, - "objective/non_score_reward": -5.038900375366211, - "objective/rlhf_reward": -18.63983019569748, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 0.9880640506744385, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5993074178695679, - "step": 854, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.001983880996704 - }, - { - "episode": 13696, - "epoch": 0.24618039328468203, - "loss/policy_avg": 0.6334704756736755, - "lr": 9.453604294478529e-06, - "objective/entropy": -116.13612365722656, - "objective/kl": 49.5648193359375, - "objective/non_score_reward": -4.956482410430908, - "objective/rlhf_reward": -18.425929164886476, - "objective/scores": 0.35, - "policy/approxkl_avg": 26.174985885620117, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7205100059509277, - "step": 855, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990193843841553 - }, - { - "episode": 13712, - "epoch": 0.24646798720207067, - "loss/policy_avg": -0.25992709398269653, - "lr": 9.452965235173824e-06, - "objective/entropy": -101.61711120605469, - "objective/kl": 60.37242889404297, - "objective/non_score_reward": -6.037242889404297, - "objective/rlhf_reward": -22.59271368285711, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 28.445724487304688, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7341337203979492, - "step": 856, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9983941316604614 - }, - { - "episode": 13728, - "epoch": 0.24675558111945933, - "loss/policy_avg": 1.5085573196411133, - "lr": 9.452326175869121e-06, - "objective/entropy": -4.551849365234375, - "objective/kl": 51.28207015991211, - "objective/non_score_reward": -5.128207206726074, - "objective/rlhf_reward": -18.95656809112127, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 35.152862548828125, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5039411187171936, - "step": 857, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9979238510131836 - }, - { - "episode": 13744, - "epoch": 0.24704317503684797, - "loss/policy_avg": 0.3509300947189331, - "lr": 9.451687116564418e-06, - "objective/entropy": -296.17901611328125, - "objective/kl": 22.41075897216797, - "objective/non_score_reward": -2.2410757541656494, - "objective/rlhf_reward": -6.841596784369026, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.681076049804688, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7068237662315369, - "step": 858, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 15, - "val/ratio": 1.9986300468444824 - }, - { - "episode": 13760, - "epoch": 0.2473307689542366, - "loss/policy_avg": 0.4679286777973175, - "lr": 9.451048057259715e-06, - "objective/entropy": -0.08905029296875, - "objective/kl": 46.414649963378906, - "objective/non_score_reward": -4.641464710235596, - "objective/rlhf_reward": -17.240346703559084, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 44.908973693847656, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.44072186946868896, - "step": 859, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.997888207435608 - }, - { - "episode": 13776, - "epoch": 0.24761836287162525, - "loss/policy_avg": 1.0680372714996338, - "lr": 9.45040899795501e-06, - "objective/entropy": -286.050537109375, - "objective/kl": 41.66375732421875, - "objective/non_score_reward": -4.166375637054443, - "objective/rlhf_reward": -16.665502786636353, - "objective/scores": 0.0, - "policy/approxkl_avg": 30.8399715423584, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6297662854194641, - "step": 860, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.997706413269043 - }, - { - "episode": 13792, - "epoch": 0.24790595678901392, - "loss/policy_avg": 0.23324424028396606, - "lr": 9.449769938650307e-06, - "objective/entropy": -387.7367858886719, - "objective/kl": 42.02001953125, - "objective/non_score_reward": -4.202002048492432, - "objective/rlhf_reward": -14.860596249775824, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 12.314942359924316, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7003234028816223, - "step": 861, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9979217052459717 - }, - { - "episode": 13808, - "epoch": 0.24819355070640256, - "loss/policy_avg": 0.7436450719833374, - "lr": 9.449130879345604e-06, - "objective/entropy": -354.4407958984375, - "objective/kl": 36.00145721435547, - "objective/non_score_reward": -3.6001460552215576, - "objective/rlhf_reward": -12.919631603176953, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 3.0306484699249268, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5300300717353821, - "step": 862, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0001163482666016 - }, - { - "episode": 13824, - "epoch": 0.2484811446237912, - "loss/policy_avg": 0.6046145558357239, - "lr": 9.4484918200409e-06, - "objective/entropy": -235.1295166015625, - "objective/kl": 38.761863708496094, - "objective/non_score_reward": -3.8761868476867676, - "objective/rlhf_reward": -13.557335923390326, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 30.976497650146484, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6000299453735352, - "step": 863, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9970474243164062 - }, - { - "episode": 13840, - "epoch": 0.24876873854117984, - "loss/policy_avg": 0.21184206008911133, - "lr": 9.447852760736197e-06, - "objective/entropy": -39.172943115234375, - "objective/kl": 42.02351379394531, - "objective/non_score_reward": -4.2023515701293945, - "objective/rlhf_reward": -15.385573704441157, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 19.583112716674805, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5341065526008606, - "step": 864, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9986238479614258 - }, - { - "episode": 13856, - "epoch": 0.2490563324585685, - "loss/policy_avg": 0.4282435178756714, - "lr": 9.447213701431494e-06, - "objective/entropy": -285.52001953125, - "objective/kl": 42.929405212402344, - "objective/non_score_reward": -4.292940139770508, - "objective/rlhf_reward": -15.771761751174928, - "objective/scores": 0.35, - "policy/approxkl_avg": 3.497749090194702, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7426670789718628, - "step": 865, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9992247819900513 - }, - { - "episode": 13872, - "epoch": 0.24934392637595715, - "loss/policy_avg": 0.29745015501976013, - "lr": 9.44657464212679e-06, - "objective/entropy": 46.30607604980469, - "objective/kl": 45.067955017089844, - "objective/non_score_reward": -4.506795406341553, - "objective/rlhf_reward": -16.701668295890016, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 41.62073516845703, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.501271665096283, - "step": 866, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9972460269927979 - }, - { - "episode": 13888, - "epoch": 0.2496315202933458, - "loss/policy_avg": 0.5706069469451904, - "lr": 9.445935582822086e-06, - "objective/entropy": 61.499298095703125, - "objective/kl": 44.0521240234375, - "objective/non_score_reward": -4.405212879180908, - "objective/rlhf_reward": -16.064591257777764, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 14.744949340820312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4016445279121399, - "step": 867, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9990034103393555 - }, - { - "episode": 13904, - "epoch": 0.24991911421073446, - "loss/policy_avg": 1.6848845481872559, - "lr": 9.445296523517383e-06, - "objective/entropy": -261.97088623046875, - "objective/kl": 37.381866455078125, - "objective/non_score_reward": -3.738186836242676, - "objective/rlhf_reward": -13.127918358120034, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 22.18919563293457, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4207575023174286, - "step": 868, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9996562004089355 - }, - { - "episode": 13920, - "epoch": 0.25020670812812307, - "loss/policy_avg": 1.0744154453277588, - "lr": 9.44465746421268e-06, - "objective/entropy": -269.3877258300781, - "objective/kl": 41.17782974243164, - "objective/non_score_reward": -4.117783069610596, - "objective/rlhf_reward": -14.990179660733105, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 44.93556213378906, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.567723274230957, - "step": 869, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.998097538948059 - }, - { - "episode": 13936, - "epoch": 0.25049430204551176, - "loss/policy_avg": -0.518259584903717, - "lr": 9.444018404907977e-06, - "objective/entropy": -99.76119995117188, - "objective/kl": 58.50865936279297, - "objective/non_score_reward": -5.850865840911865, - "objective/rlhf_reward": -21.456051657872138, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 17.84606170654297, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.45481520891189575, - "step": 870, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9988923072814941 - }, - { - "episode": 13952, - "epoch": 0.2507818959629004, - "loss/policy_avg": -0.3090100586414337, - "lr": 9.443379345603272e-06, - "objective/entropy": -71.82542419433594, - "objective/kl": 40.61528778076172, - "objective/non_score_reward": -4.06152868270874, - "objective/rlhf_reward": -14.641994628969748, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 4.262547492980957, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.408467173576355, - "step": 871, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0034618377685547 - }, - { - "episode": 13968, - "epoch": 0.25106948988028904, - "loss/policy_avg": 0.9268704652786255, - "lr": 9.442740286298569e-06, - "objective/entropy": -257.0228271484375, - "objective/kl": 41.92462158203125, - "objective/non_score_reward": -4.192461967468262, - "objective/rlhf_reward": -15.036515728632608, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 27.330509185791016, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.49810606241226196, - "step": 872, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9987335205078125 - }, - { - "episode": 13984, - "epoch": 0.2513570837976777, - "loss/policy_avg": 0.6296570301055908, - "lr": 9.442101226993866e-06, - "objective/entropy": -389.610595703125, - "objective/kl": 35.68389892578125, - "objective/non_score_reward": -3.568390130996704, - "objective/rlhf_reward": -12.448731775554727, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 16.5224609375, - "policy/clipfrac_avg": 0.25, - "policy/entropy_avg": 0.7835655212402344, - "step": 873, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999846339225769 - }, - { - "episode": 14000, - "epoch": 0.2516446777150663, - "loss/policy_avg": 0.4277964234352112, - "lr": 9.441462167689163e-06, - "objective/entropy": -170.8277587890625, - "objective/kl": 38.57947540283203, - "objective/non_score_reward": -3.85794734954834, - "objective/rlhf_reward": -13.827669653956015, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 44.01490783691406, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6223859786987305, - "step": 874, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000417470932007 - }, - { - "episode": 14016, - "epoch": 0.25193227163245496, - "loss/policy_avg": 0.8091301918029785, - "lr": 9.44082310838446e-06, - "objective/entropy": -39.32908630371094, - "objective/kl": 39.68909454345703, - "objective/non_score_reward": -3.968909740447998, - "objective/rlhf_reward": -13.752931775824102, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 28.2178897857666, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6742968559265137, - "step": 875, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9996860027313232 - }, - { - "episode": 14032, - "epoch": 0.2522198655498436, - "loss/policy_avg": 0.743851900100708, - "lr": 9.440184049079757e-06, - "objective/entropy": -351.21734619140625, - "objective/kl": 39.85813903808594, - "objective/non_score_reward": -3.985814094543457, - "objective/rlhf_reward": -14.209922806421915, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 59.21538543701172, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5436392426490784, - "step": 876, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.997950792312622 - }, - { - "episode": 14048, - "epoch": 0.25250745946723224, - "loss/policy_avg": 1.5350085496902466, - "lr": 9.439544989775052e-06, - "objective/entropy": -65.44673156738281, - "objective/kl": 50.07268524169922, - "objective/non_score_reward": -5.00726842880249, - "objective/rlhf_reward": -17.906367959753545, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 29.94268226623535, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7696713209152222, - "step": 877, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0026378631591797 - }, - { - "episode": 14064, - "epoch": 0.25279505338462094, - "loss/policy_avg": 3.7316019535064697, - "lr": 9.438905930470349e-06, - "objective/entropy": -307.8963623046875, - "objective/kl": 38.285011291503906, - "objective/non_score_reward": -3.828500747680664, - "objective/rlhf_reward": -13.652143721998321, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 28.130430221557617, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8377779722213745, - "step": 878, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.0074470043182373 - }, - { - "episode": 14080, - "epoch": 0.2530826473020096, - "loss/policy_avg": 0.38605859875679016, - "lr": 9.438266871165644e-06, - "objective/entropy": -106.09515380859375, - "objective/kl": 36.02558517456055, - "objective/non_score_reward": -3.6025586128234863, - "objective/rlhf_reward": -12.46282274551862, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 7.791407585144043, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.48136669397354126, - "step": 879, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9982140064239502 - }, - { - "episode": 14096, - "epoch": 0.2533702412193982, - "loss/policy_avg": -0.01890498399734497, - "lr": 9.43762781186094e-06, - "objective/entropy": -181.742431640625, - "objective/kl": 42.437530517578125, - "objective/non_score_reward": -4.243752956390381, - "objective/rlhf_reward": -15.649499688178224, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 15.317750930786133, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.511642575263977, - "step": 880, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0014102458953857 - }, - { - "episode": 14112, - "epoch": 0.25365783513678686, - "loss/policy_avg": 0.030795343220233917, - "lr": 9.436988752556238e-06, - "objective/entropy": -230.95361328125, - "objective/kl": 30.112140655517578, - "objective/non_score_reward": -3.011213779449463, - "objective/rlhf_reward": -9.12113658034918, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 12.038021087646484, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5518423914909363, - "step": 881, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984687566757202 - }, - { - "episode": 14128, - "epoch": 0.2539454290541755, - "loss/policy_avg": 0.4279516935348511, - "lr": 9.436349693251534e-06, - "objective/entropy": -245.98117065429688, - "objective/kl": 43.55865478515625, - "objective/non_score_reward": -4.355865478515625, - "objective/rlhf_reward": -15.023462629318239, - "objective/scores": 0.6, - "policy/approxkl_avg": 2.3968582153320312, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7229801416397095, - "step": 882, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0013785362243652 - }, - { - "episode": 14144, - "epoch": 0.25423302297156414, - "loss/policy_avg": 0.7990102767944336, - "lr": 9.435710633946831e-06, - "objective/entropy": -141.9425506591797, - "objective/kl": 36.01177215576172, - "objective/non_score_reward": -3.60117769241333, - "objective/rlhf_reward": -12.95411191424881, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 35.52224349975586, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6370751857757568, - "step": 883, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9986944198608398 - }, - { - "episode": 14160, - "epoch": 0.2545206168889528, - "loss/policy_avg": 1.1767592430114746, - "lr": 9.435071574642126e-06, - "objective/entropy": -3.1976966857910156, - "objective/kl": 52.58678436279297, - "objective/non_score_reward": -5.258677959442139, - "objective/rlhf_reward": -18.11099377715704, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 64.22396850585938, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5332536697387695, - "step": 884, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9978070259094238 - }, - { - "episode": 14176, - "epoch": 0.2548082108063415, - "loss/policy_avg": 0.09251243621110916, - "lr": 9.434432515337423e-06, - "objective/entropy": -235.44993591308594, - "objective/kl": 47.086631774902344, - "objective/non_score_reward": -4.708662986755371, - "objective/rlhf_reward": -17.10131909052531, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 12.522943496704102, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6954100131988525, - "step": 885, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9970701932907104 - }, - { - "episode": 14192, - "epoch": 0.2550958047237301, - "loss/policy_avg": 0.7762466073036194, - "lr": 9.43379345603272e-06, - "objective/entropy": -65.08514404296875, - "objective/kl": 34.25334930419922, - "objective/non_score_reward": -3.425334930419922, - "objective/rlhf_reward": -12.37582734587781, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 10.137767791748047, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6340222358703613, - "step": 886, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.997950792312622 - }, - { - "episode": 14208, - "epoch": 0.25538339864111875, - "loss/policy_avg": 0.4137943387031555, - "lr": 9.433154396728017e-06, - "objective/entropy": -208.6219940185547, - "objective/kl": 48.540523529052734, - "objective/non_score_reward": -4.854052543640137, - "objective/rlhf_reward": -19.41620969772339, - "objective/scores": 0.0, - "policy/approxkl_avg": 6.9052228927612305, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6014610528945923, - "step": 887, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9984641075134277 - }, - { - "episode": 14224, - "epoch": 0.2556709925585074, - "loss/policy_avg": 0.041199300438165665, - "lr": 9.432515337423314e-06, - "objective/entropy": 46.358150482177734, - "objective/kl": 47.68404769897461, - "objective/non_score_reward": -4.768404960632324, - "objective/rlhf_reward": -16.14990035140631, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 10.196189880371094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6130638122558594, - "step": 888, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0053634643554688 - }, - { - "episode": 14240, - "epoch": 0.25595858647589603, - "loss/policy_avg": 0.9656521081924438, - "lr": 9.431876278118611e-06, - "objective/entropy": -300.8107604980469, - "objective/kl": 40.519744873046875, - "objective/non_score_reward": -4.051974296569824, - "objective/rlhf_reward": -14.829295017806391, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 155.98867797851562, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6544891595840454, - "step": 889, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 13, - "val/ratio": 1.9983346462249756 - }, - { - "episode": 14256, - "epoch": 0.2562461803932847, - "loss/policy_avg": 1.5469386577606201, - "lr": 9.431237218813906e-06, - "objective/entropy": -53.928260803222656, - "objective/kl": 51.69367599487305, - "objective/non_score_reward": -5.169367790222168, - "objective/rlhf_reward": -19.318220340941828, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 15.555410385131836, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5645780563354492, - "step": 890, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9977774620056152 - }, - { - "episode": 14272, - "epoch": 0.2565337743106733, - "loss/policy_avg": 1.4335881471633911, - "lr": 9.430598159509203e-06, - "objective/entropy": -98.00782775878906, - "objective/kl": 39.847450256347656, - "objective/non_score_reward": -3.9847452640533447, - "objective/rlhf_reward": -14.334861550394614, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 28.548376083374023, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.697210431098938, - "step": 891, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9980844259262085 - }, - { - "episode": 14288, - "epoch": 0.25682136822806195, - "loss/policy_avg": 1.1344060897827148, - "lr": 9.4299591002045e-06, - "objective/entropy": -38.29204177856445, - "objective/kl": 40.507850646972656, - "objective/non_score_reward": -4.050785064697266, - "objective/rlhf_reward": -14.541280036390411, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 48.63218307495117, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5049742460250854, - "step": 892, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9978201389312744 - }, - { - "episode": 14304, - "epoch": 0.25710896214545065, - "loss/policy_avg": -0.35933157801628113, - "lr": 9.429320040899797e-06, - "objective/entropy": -117.44267272949219, - "objective/kl": 30.698312759399414, - "objective/non_score_reward": -3.069831371307373, - "objective/rlhf_reward": -10.617466216505157, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 1.3441765308380127, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3006622791290283, - "step": 893, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0008156299591064 - }, - { - "episode": 14320, - "epoch": 0.2573965560628393, - "loss/policy_avg": 0.8382015824317932, - "lr": 9.428680981595094e-06, - "objective/entropy": -200.59832763671875, - "objective/kl": 43.460121154785156, - "objective/non_score_reward": -4.346012115478516, - "objective/rlhf_reward": -15.868276679309542, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 12.426078796386719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5876235961914062, - "step": 894, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994069337844849 - }, - { - "episode": 14336, - "epoch": 0.25768414998022793, - "loss/policy_avg": 0.687677264213562, - "lr": 9.42804192229039e-06, - "objective/entropy": 203.2930908203125, - "objective/kl": 59.16074752807617, - "objective/non_score_reward": -5.916074752807617, - "objective/rlhf_reward": -21.839470739635537, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 170.25958251953125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6621222496032715, - "step": 895, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9983516931533813 - }, - { - "episode": 14352, - "epoch": 0.25797174389761657, - "loss/policy_avg": -0.23158738017082214, - "lr": 9.427402862985686e-06, - "objective/entropy": -174.0965118408203, - "objective/kl": 45.058555603027344, - "objective/non_score_reward": -4.505855560302734, - "objective/rlhf_reward": -16.697908911734743, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 19.966630935668945, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6067174673080444, - "step": 896, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.999699592590332 - }, - { - "episode": 14368, - "epoch": 0.2582593378150052, - "loss/policy_avg": -0.5135414600372314, - "lr": 9.426763803680982e-06, - "objective/entropy": 179.40419006347656, - "objective/kl": 59.1436882019043, - "objective/non_score_reward": -5.914369106292725, - "objective/rlhf_reward": -22.233643849094477, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 36.81080627441406, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 1.0215977430343628, - "step": 897, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994231462478638 - }, - { - "episode": 14384, - "epoch": 0.25854693173239385, - "loss/policy_avg": 0.39005035161972046, - "lr": 9.42612474437628e-06, - "objective/entropy": -332.71807861328125, - "objective/kl": 42.75315856933594, - "objective/non_score_reward": -4.27531623840332, - "objective/rlhf_reward": -12.70126543045044, - "objective/scores": 1.1, - "policy/approxkl_avg": 27.193714141845703, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46848946809768677, - "step": 898, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9977561235427856 - }, - { - "episode": 14400, - "epoch": 0.2588345256497825, - "loss/policy_avg": -0.6668556928634644, - "lr": 9.425485685071576e-06, - "objective/entropy": -235.47409057617188, - "objective/kl": 35.54128646850586, - "objective/non_score_reward": -3.554128646850586, - "objective/rlhf_reward": -12.735561969693066, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 42.73648452758789, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5194951295852661, - "step": 899, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.0026841163635254 - }, - { - "episode": 14416, - "epoch": 0.2591221195671711, - "loss/policy_avg": 0.5247483849525452, - "lr": 9.424846625766873e-06, - "objective/entropy": -204.9716339111328, - "objective/kl": 38.4921989440918, - "objective/non_score_reward": -3.849219799041748, - "objective/rlhf_reward": -13.663545624415079, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 50.06997299194336, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6830487251281738, - "step": 900, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986555576324463 - }, - { - "episode": 14432, - "epoch": 0.2594097134845598, - "loss/policy_avg": 0.6800730228424072, - "lr": 9.424207566462168e-06, - "objective/entropy": -32.36799240112305, - "objective/kl": 47.80828857421875, - "objective/non_score_reward": -4.780828952789307, - "objective/rlhf_reward": -17.519195351664145, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 79.11212921142578, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6230674982070923, - "step": 901, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.99847412109375 - }, - { - "episode": 14448, - "epoch": 0.25969730740194846, - "loss/policy_avg": 0.290429025888443, - "lr": 9.423568507157465e-06, - "objective/entropy": -182.06155395507812, - "objective/kl": 50.57691955566406, - "objective/non_score_reward": -5.057692050933838, - "objective/rlhf_reward": -18.780170063586578, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 78.39713287353516, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7395067811012268, - "step": 902, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9985625743865967 - }, - { - "episode": 14464, - "epoch": 0.2599849013193371, - "loss/policy_avg": 1.0716335773468018, - "lr": 9.42292944785276e-06, - "objective/entropy": -12.325759887695312, - "objective/kl": 50.18354415893555, - "objective/non_score_reward": -5.018354415893555, - "objective/rlhf_reward": -18.622819285006866, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 11.29092025756836, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5627316236495972, - "step": 903, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9985690116882324 - }, - { - "episode": 14480, - "epoch": 0.26027249523672574, - "loss/policy_avg": 0.7096864581108093, - "lr": 9.422290388548057e-06, - "objective/entropy": 128.26751708984375, - "objective/kl": 47.753265380859375, - "objective/non_score_reward": -4.775326728820801, - "objective/rlhf_reward": -17.58553596714371, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 14.658417701721191, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7661948204040527, - "step": 904, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9982168674468994 - }, - { - "episode": 14496, - "epoch": 0.2605600891541144, - "loss/policy_avg": 0.7332016229629517, - "lr": 9.421651329243354e-06, - "objective/entropy": -9.548530578613281, - "objective/kl": 35.575138092041016, - "objective/non_score_reward": -3.557513952255249, - "objective/rlhf_reward": -12.405227299007485, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 9.975175857543945, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7684756517410278, - "step": 905, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9979183673858643 - }, - { - "episode": 14512, - "epoch": 0.260847683071503, - "loss/policy_avg": 0.16122013330459595, - "lr": 9.421012269938651e-06, - "objective/entropy": 57.9202880859375, - "objective/kl": 40.8089599609375, - "objective/non_score_reward": -4.080896377563477, - "objective/rlhf_reward": -14.719464097086508, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 33.88508605957031, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.3608490824699402, - "step": 906, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9993462562561035 - }, - { - "episode": 14528, - "epoch": 0.26113527698889166, - "loss/policy_avg": 2.275667190551758, - "lr": 9.420373210633948e-06, - "objective/entropy": -2.799551010131836, - "objective/kl": 57.71617126464844, - "objective/non_score_reward": -5.7716169357299805, - "objective/rlhf_reward": -18.686467742919923, - "objective/scores": 1.1, - "policy/approxkl_avg": 110.55852508544922, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5820337533950806, - "step": 907, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9986871480941772 - }, - { - "episode": 14544, - "epoch": 0.26142287090628036, - "loss/policy_avg": 0.9861481785774231, - "lr": 9.419734151329245e-06, - "objective/entropy": 91.0115737915039, - "objective/kl": 45.30067825317383, - "objective/non_score_reward": -4.5300679206848145, - "objective/rlhf_reward": -16.741669514266352, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 6.376153469085693, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.39600497484207153, - "step": 908, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 2.0001068115234375 - }, - { - "episode": 14560, - "epoch": 0.261710464823669, - "loss/policy_avg": 1.1333844661712646, - "lr": 9.41909509202454e-06, - "objective/entropy": -235.7226104736328, - "objective/kl": 30.271202087402344, - "objective/non_score_reward": -3.027120351791382, - "objective/rlhf_reward": -10.70848128795624, - "objective/scores": 0.35, - "policy/approxkl_avg": 32.180904388427734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.667456865310669, - "step": 909, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9989008903503418 - }, - { - "episode": 14576, - "epoch": 0.26199805874105764, - "loss/policy_avg": 0.17538058757781982, - "lr": 9.418456032719837e-06, - "objective/entropy": 134.0635986328125, - "objective/kl": 42.076332092285156, - "objective/non_score_reward": -4.2076334953308105, - "objective/rlhf_reward": -15.48889785101953, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 32.24264144897461, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.513163685798645, - "step": 910, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9992715120315552 - }, - { - "episode": 14592, - "epoch": 0.2622856526584463, - "loss/policy_avg": -0.15949714183807373, - "lr": 9.417816973415134e-06, - "objective/entropy": -46.453277587890625, - "objective/kl": 38.9816780090332, - "objective/non_score_reward": -3.8981680870056152, - "objective/rlhf_reward": -13.645260284619269, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 21.381032943725586, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7189059257507324, - "step": 911, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.00034761428833 - }, - { - "episode": 14608, - "epoch": 0.2625732465758349, - "loss/policy_avg": 0.20729732513427734, - "lr": 9.41717791411043e-06, - "objective/entropy": -49.87384033203125, - "objective/kl": 42.82794952392578, - "objective/non_score_reward": -4.282794952392578, - "objective/rlhf_reward": -15.527060303751547, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 1.6806117296218872, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6336564421653748, - "step": 912, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0001604557037354 - }, - { - "episode": 14624, - "epoch": 0.26286084049322356, - "loss/policy_avg": -0.13027964532375336, - "lr": 9.416538854805727e-06, - "objective/entropy": -235.79547119140625, - "objective/kl": 46.248512268066406, - "objective/non_score_reward": -4.624851226806641, - "objective/rlhf_reward": -17.14015623304693, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 54.118743896484375, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5618636012077332, - "step": 913, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9989900588989258 - }, - { - "episode": 14640, - "epoch": 0.2631484344106122, - "loss/policy_avg": 0.32277536392211914, - "lr": 9.415899795501023e-06, - "objective/entropy": 57.70044708251953, - "objective/kl": 59.526153564453125, - "objective/non_score_reward": -5.952615737915039, - "objective/rlhf_reward": -22.468827298193602, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 20.54634666442871, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7771313190460205, - "step": 914, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9986742734909058 - }, - { - "episode": 14656, - "epoch": 0.26343602832800084, - "loss/policy_avg": 0.6588897705078125, - "lr": 9.41526073619632e-06, - "objective/entropy": -144.39913940429688, - "objective/kl": 53.891265869140625, - "objective/non_score_reward": -5.389126777648926, - "objective/rlhf_reward": -20.230992827445192, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 29.271865844726562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.46916764974594116, - "step": 915, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9994072914123535 - }, - { - "episode": 14672, - "epoch": 0.26372362224538953, - "loss/policy_avg": 0.040606118738651276, - "lr": 9.414621676891616e-06, - "objective/entropy": -236.26051330566406, - "objective/kl": 35.742431640625, - "objective/non_score_reward": -3.5742433071136475, - "objective/rlhf_reward": -12.781201684268648, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 4.916508674621582, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5409867167472839, - "step": 916, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002603769302368 - }, - { - "episode": 14688, - "epoch": 0.2640112161627782, - "loss/policy_avg": 0.4728504419326782, - "lr": 9.413982617586913e-06, - "objective/entropy": -3.219512939453125, - "objective/kl": 52.13475036621094, - "objective/non_score_reward": -5.213474750518799, - "objective/rlhf_reward": -19.47529683360229, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 11.293510437011719, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7324811220169067, - "step": 917, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9995346069335938 - }, - { - "episode": 14704, - "epoch": 0.2642988100801668, - "loss/policy_avg": 0.7524695992469788, - "lr": 9.41334355828221e-06, - "objective/entropy": 69.59600830078125, - "objective/kl": 40.76435852050781, - "objective/non_score_reward": -4.0764360427856445, - "objective/rlhf_reward": -14.358331988530097, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 22.529842376708984, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7941198348999023, - "step": 918, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9969267845153809 - }, - { - "episode": 14720, - "epoch": 0.26458640399755545, - "loss/policy_avg": 1.3578912019729614, - "lr": 9.412704498977507e-06, - "objective/entropy": -154.57412719726562, - "objective/kl": 49.238441467285156, - "objective/non_score_reward": -4.923844337463379, - "objective/rlhf_reward": -19.695377826690674, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.7411012649536133, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6235677003860474, - "step": 919, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.004084348678589 - }, - { - "episode": 14736, - "epoch": 0.2648739979149441, - "loss/policy_avg": 0.38867104053497314, - "lr": 9.412065439672802e-06, - "objective/entropy": -79.98590850830078, - "objective/kl": 41.547691345214844, - "objective/non_score_reward": -4.1547698974609375, - "objective/rlhf_reward": -14.88574506441752, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 137.96481323242188, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.2868984043598175, - "step": 920, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.999016523361206 - }, - { - "episode": 14752, - "epoch": 0.26516159183233273, - "loss/policy_avg": 0.6230953931808472, - "lr": 9.411426380368099e-06, - "objective/entropy": -143.03982543945312, - "objective/kl": 49.02768325805664, - "objective/non_score_reward": -4.902768611907959, - "objective/rlhf_reward": -17.488367976919683, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 4.535545349121094, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5886020660400391, - "step": 921, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.999190330505371 - }, - { - "episode": 14768, - "epoch": 0.2654491857497214, - "loss/policy_avg": -0.35152608156204224, - "lr": 9.410787321063396e-06, - "objective/entropy": -254.19827270507812, - "objective/kl": 39.34025573730469, - "objective/non_score_reward": -3.934025764465332, - "objective/rlhf_reward": -13.788691113667426, - "objective/scores": 0.4868528072345416, - "policy/approxkl_avg": 18.294979095458984, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6331444978713989, - "step": 922, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.0006368160247803 - }, - { - "episode": 14784, - "epoch": 0.26573677966711007, - "loss/policy_avg": 1.660903811454773, - "lr": 9.410148261758691e-06, - "objective/entropy": 51.40530014038086, - "objective/kl": 54.351776123046875, - "objective/non_score_reward": -5.435177326202393, - "objective/rlhf_reward": -20.224937999042208, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 30.1667423248291, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.40519797801971436, - "step": 923, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9972550868988037 - }, - { - "episode": 14800, - "epoch": 0.2660243735844987, - "loss/policy_avg": -0.4044073820114136, - "lr": 9.409509202453988e-06, - "objective/entropy": 100.78778839111328, - "objective/kl": 55.377384185791016, - "objective/non_score_reward": -5.537738800048828, - "objective/rlhf_reward": -20.77235255488525, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 0.32742178440093994, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5611143708229065, - "step": 924, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.0023300647735596 - }, - { - "episode": 14816, - "epoch": 0.26631196750188735, - "loss/policy_avg": 0.9976423382759094, - "lr": 9.408870143149285e-06, - "objective/entropy": -75.03382873535156, - "objective/kl": 41.136863708496094, - "objective/non_score_reward": -4.113686561584473, - "objective/rlhf_reward": -15.129233155280275, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 12.370454788208008, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3627585470676422, - "step": 925, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000096559524536 - }, - { - "episode": 14832, - "epoch": 0.266599561419276, - "loss/policy_avg": 0.8356518745422363, - "lr": 9.408231083844582e-06, - "objective/entropy": -184.39404296875, - "objective/kl": 42.4769401550293, - "objective/non_score_reward": -4.24769401550293, - "objective/rlhf_reward": -15.386656556192953, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 18.93747329711914, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.49698585271835327, - "step": 926, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984275102615356 - }, - { - "episode": 14848, - "epoch": 0.2668871553366646, - "loss/policy_avg": 0.9323163628578186, - "lr": 9.407592024539877e-06, - "objective/entropy": -184.23748779296875, - "objective/kl": 45.12583923339844, - "objective/non_score_reward": -4.512584209442139, - "objective/rlhf_reward": -18.050336837768555, - "objective/scores": 0.0, - "policy/approxkl_avg": 3.957935333251953, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6059246063232422, - "step": 927, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9981895685195923 - }, - { - "episode": 14864, - "epoch": 0.26717474925405327, - "loss/policy_avg": 1.5501456260681152, - "lr": 9.406952965235174e-06, - "objective/entropy": -222.2357635498047, - "objective/kl": 45.133087158203125, - "objective/non_score_reward": -4.513309001922607, - "objective/rlhf_reward": -16.602638105960235, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 5.245251655578613, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.663169264793396, - "step": 928, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.002445697784424 - }, - { - "episode": 14880, - "epoch": 0.2674623431714419, - "loss/policy_avg": 0.06294722855091095, - "lr": 9.40631390593047e-06, - "objective/entropy": -219.2621307373047, - "objective/kl": 35.440128326416016, - "objective/non_score_reward": -3.544013023376465, - "objective/rlhf_reward": -12.797449925032954, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 153.74978637695312, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.46772995591163635, - "step": 929, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9976612329483032 - }, - { - "episode": 14896, - "epoch": 0.26774993708883055, - "loss/policy_avg": 1.227535605430603, - "lr": 9.405674846625768e-06, - "objective/entropy": -223.289794921875, - "objective/kl": 52.25310516357422, - "objective/non_score_reward": -5.225310325622559, - "objective/rlhf_reward": -18.501241779327394, - "objective/scores": 0.6, - "policy/approxkl_avg": 27.705047607421875, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7249962091445923, - "step": 930, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.999952793121338 - }, - { - "episode": 14912, - "epoch": 0.26803753100621924, - "loss/policy_avg": 0.8206846714019775, - "lr": 9.405035787321065e-06, - "objective/entropy": -198.30308532714844, - "objective/kl": 39.47509002685547, - "objective/non_score_reward": -3.947509527206421, - "objective/rlhf_reward": -14.390037870407106, - "objective/scores": 0.35, - "policy/approxkl_avg": 59.78777313232422, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.5694193840026855, - "step": 931, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9995026588439941 - }, - { - "episode": 14928, - "epoch": 0.2683251249236079, - "loss/policy_avg": 0.9043534994125366, - "lr": 9.404396728016361e-06, - "objective/entropy": -184.1514434814453, - "objective/kl": 29.442230224609375, - "objective/non_score_reward": -2.944223165512085, - "objective/rlhf_reward": -8.853173409343931, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 5.002543926239014, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4746544361114502, - "step": 932, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0039162635803223 - }, - { - "episode": 14944, - "epoch": 0.2686127188409965, - "loss/policy_avg": 0.00976651906967163, - "lr": 9.403757668711657e-06, - "objective/entropy": -182.20498657226562, - "objective/kl": 47.182945251464844, - "objective/non_score_reward": -4.718294620513916, - "objective/rlhf_reward": -14.473178243637086, - "objective/scores": 1.1, - "policy/approxkl_avg": 5.810610294342041, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.7361587285995483, - "step": 933, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.00186824798584 - }, - { - "episode": 14960, - "epoch": 0.26890031275838516, - "loss/policy_avg": 1.3497049808502197, - "lr": 9.403118609406953e-06, - "objective/entropy": -47.7762451171875, - "objective/kl": 49.33927536010742, - "objective/non_score_reward": -4.933927536010742, - "objective/rlhf_reward": -18.41019752982251, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 4.110077857971191, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.627051830291748, - "step": 934, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 2.0001015663146973 - }, - { - "episode": 14976, - "epoch": 0.2691879066757738, - "loss/policy_avg": 0.9171154499053955, - "lr": 9.40247955010225e-06, - "objective/entropy": -59.27357482910156, - "objective/kl": 37.14779281616211, - "objective/non_score_reward": -3.7147793769836426, - "objective/rlhf_reward": -13.480515101043085, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 52.2471809387207, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5264706611633301, - "step": 935, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.000478982925415 - }, - { - "episode": 14992, - "epoch": 0.26947550059316244, - "loss/policy_avg": 0.06482569873332977, - "lr": 9.401840490797547e-06, - "objective/entropy": -391.59271240234375, - "objective/kl": 33.689430236816406, - "objective/non_score_reward": -3.368943214416504, - "objective/rlhf_reward": -12.11652275297491, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.8014378547668457, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.760464072227478, - "step": 936, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 13, - "val/ratio": 1.9995503425598145 - }, - { - "episode": 15008, - "epoch": 0.2697630945105511, - "loss/policy_avg": 0.03722277283668518, - "lr": 9.401201431492844e-06, - "objective/entropy": -255.18324279785156, - "objective/kl": 55.505332946777344, - "objective/non_score_reward": -5.550533771514893, - "objective/rlhf_reward": -20.823532440749506, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 39.2294921875, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.66856449842453, - "step": 937, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9981707334518433 - }, - { - "episode": 15024, - "epoch": 0.2700506884279397, - "loss/policy_avg": 0.4313279092311859, - "lr": 9.40056237218814e-06, - "objective/entropy": -258.69293212890625, - "objective/kl": 38.366519927978516, - "objective/non_score_reward": -3.8366520404815674, - "objective/rlhf_reward": -13.922776301105586, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 35.569087982177734, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8422372341156006, - "step": 938, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9997689723968506 - }, - { - "episode": 15040, - "epoch": 0.2703382823453284, - "loss/policy_avg": 0.5069754123687744, - "lr": 9.399923312883436e-06, - "objective/entropy": -305.361083984375, - "objective/kl": 45.78974151611328, - "objective/non_score_reward": -4.57897424697876, - "objective/rlhf_reward": -16.93729529627929, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 18.944904327392578, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.30599963665008545, - "step": 939, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.998727560043335 - }, - { - "episode": 15056, - "epoch": 0.27062587626271706, - "loss/policy_avg": 0.995228111743927, - "lr": 9.399284253578733e-06, - "objective/entropy": 184.52796936035156, - "objective/kl": 53.43628692626953, - "objective/non_score_reward": -5.343628883361816, - "objective/rlhf_reward": -16.974516487121583, - "objective/scores": 1.1, - "policy/approxkl_avg": 65.83000183105469, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4442574977874756, - "step": 940, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 1.9986300468444824 - }, - { - "episode": 15072, - "epoch": 0.2709134701801057, - "loss/policy_avg": 0.5919173955917358, - "lr": 9.39864519427403e-06, - "objective/entropy": -17.191753387451172, - "objective/kl": 56.80730438232422, - "objective/non_score_reward": -5.680730819702148, - "objective/rlhf_reward": -18.32292232513428, - "objective/scores": 1.1, - "policy/approxkl_avg": 3.7202348709106445, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.762370228767395, - "step": 941, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9992384910583496 - }, - { - "episode": 15088, - "epoch": 0.27120106409749434, - "loss/policy_avg": 1.1367782354354858, - "lr": 9.398006134969327e-06, - "objective/entropy": 24.78857421875, - "objective/kl": 50.70988464355469, - "objective/non_score_reward": -5.07098913192749, - "objective/rlhf_reward": -18.883956527709962, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.020835876464844, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6023867130279541, - "step": 942, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999258041381836 - }, - { - "episode": 15104, - "epoch": 0.271488658014883, - "loss/policy_avg": 0.28944456577301025, - "lr": 9.397367075664624e-06, - "objective/entropy": 38.79130172729492, - "objective/kl": 51.324073791503906, - "objective/non_score_reward": -5.132407188415527, - "objective/rlhf_reward": -19.20411637786023, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 18.69689178466797, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.4386768937110901, - "step": 943, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9981820583343506 - }, - { - "episode": 15120, - "epoch": 0.2717762519322716, - "loss/policy_avg": 0.04539201408624649, - "lr": 9.396728016359919e-06, - "objective/entropy": -393.70343017578125, - "objective/kl": 29.64443016052246, - "objective/non_score_reward": -2.9644432067871094, - "objective/rlhf_reward": -11.8577721118927, - "objective/scores": 0.0, - "policy/approxkl_avg": 12.188756942749023, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7838531136512756, - "step": 944, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 12, - "val/ratio": 1.9975636005401611 - }, - { - "episode": 15136, - "epoch": 0.27206384584966026, - "loss/policy_avg": -0.8645926713943481, - "lr": 9.396088957055216e-06, - "objective/entropy": -124.75596618652344, - "objective/kl": 37.55431365966797, - "objective/non_score_reward": -3.7554311752319336, - "objective/rlhf_reward": -13.41760495669039, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 30.16514778137207, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.9663676023483276, - "step": 945, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.009145498275757 - }, - { - "episode": 15152, - "epoch": 0.27235143976704895, - "loss/policy_avg": -0.9885836839675903, - "lr": 9.395449897750511e-06, - "objective/entropy": 23.736434936523438, - "objective/kl": 47.477176666259766, - "objective/non_score_reward": -4.74771785736084, - "objective/rlhf_reward": -17.329011922300445, - "objective/scores": 0.41546487678572874, - "policy/approxkl_avg": 21.30612564086914, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.6642924547195435, - "step": 946, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0021979808807373 - }, - { - "episode": 15168, - "epoch": 0.2726390336844376, - "loss/policy_avg": 0.5419988632202148, - "lr": 9.394810838445808e-06, - "objective/entropy": -163.80274963378906, - "objective/kl": 49.423614501953125, - "objective/non_score_reward": -4.942361831665039, - "objective/rlhf_reward": -18.42781191161218, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 63.652008056640625, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7244300842285156, - "step": 947, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9981995820999146 - }, - { - "episode": 15184, - "epoch": 0.27292662760182623, - "loss/policy_avg": -0.12660138309001923, - "lr": 9.394171779141105e-06, - "objective/entropy": -329.560302734375, - "objective/kl": 39.731605529785156, - "objective/non_score_reward": -3.9731602668762207, - "objective/rlhf_reward": -14.514039137450556, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 17.309505462646484, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6581008434295654, - "step": 948, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9999020099639893 - }, - { - "episode": 15200, - "epoch": 0.2732142215192149, - "loss/policy_avg": -0.06109565496444702, - "lr": 9.393532719836402e-06, - "objective/entropy": -217.1125946044922, - "objective/kl": 47.109092712402344, - "objective/non_score_reward": -4.710909366607666, - "objective/rlhf_reward": -17.46503529795776, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 93.11270141601562, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6009324193000793, - "step": 949, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9982292652130127 - }, - { - "episode": 15216, - "epoch": 0.2735018154366035, - "loss/policy_avg": 0.6483868956565857, - "lr": 9.392893660531698e-06, - "objective/entropy": -35.96177673339844, - "objective/kl": 52.633846282958984, - "objective/non_score_reward": -5.263384819030762, - "objective/rlhf_reward": -21.05353856086731, - "objective/scores": 0.0, - "policy/approxkl_avg": 8.303674697875977, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5491381883621216, - "step": 950, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0011985301971436 - }, - { - "episode": 15232, - "epoch": 0.27378940935399215, - "loss/policy_avg": 1.7503316402435303, - "lr": 9.392254601226994e-06, - "objective/entropy": -13.0999755859375, - "objective/kl": 52.63844299316406, - "objective/non_score_reward": -5.2638444900512695, - "objective/rlhf_reward": -18.932671727911504, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 6.220101833343506, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6228674650192261, - "step": 951, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9985336065292358 - }, - { - "episode": 15248, - "epoch": 0.2740770032713808, - "loss/policy_avg": -0.4814819097518921, - "lr": 9.39161554192229e-06, - "objective/entropy": -400.868408203125, - "objective/kl": 46.192840576171875, - "objective/non_score_reward": -4.619284152984619, - "objective/rlhf_reward": -17.077136611938478, - "objective/scores": 0.35, - "policy/approxkl_avg": 7.3005571365356445, - "policy/clipfrac_avg": 1.75, - "policy/entropy_avg": 0.3778308033943176, - "step": 952, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 2.002164602279663 - }, - { - "episode": 15264, - "epoch": 0.27436459718876943, - "loss/policy_avg": 0.23375429213047028, - "lr": 9.390976482617587e-06, - "objective/entropy": -265.75054931640625, - "objective/kl": 36.87724685668945, - "objective/non_score_reward": -3.6877243518829346, - "objective/rlhf_reward": -14.750897645950317, - "objective/scores": 0.0, - "policy/approxkl_avg": 1.4932501316070557, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.8672538995742798, - "step": 953, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.0001888275146484 - }, - { - "episode": 15280, - "epoch": 0.27465219110615813, - "loss/policy_avg": 0.3528614044189453, - "lr": 9.390337423312884e-06, - "objective/entropy": -80.12568664550781, - "objective/kl": 54.801692962646484, - "objective/non_score_reward": -5.480169296264648, - "objective/rlhf_reward": -20.595164570838136, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.5583480596542358, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.41291695833206177, - "step": 954, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9998396635055542 - }, - { - "episode": 15296, - "epoch": 0.27493978502354677, - "loss/policy_avg": 1.2501368522644043, - "lr": 9.389698364008181e-06, - "objective/entropy": -262.31793212890625, - "objective/kl": 40.21039581298828, - "objective/non_score_reward": -4.0210394859313965, - "objective/rlhf_reward": -13.160439883114073, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 41.2574462890625, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5917608141899109, - "step": 955, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 9, - "val/ratio": 1.9988981485366821 - }, - { - "episode": 15312, - "epoch": 0.2752273789409354, - "loss/policy_avg": 3.28550386428833, - "lr": 9.389059304703478e-06, - "objective/entropy": -228.41085815429688, - "objective/kl": 46.59629440307617, - "objective/non_score_reward": -4.6596293449401855, - "objective/rlhf_reward": -17.313004527121706, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 3.093324661254883, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5825387239456177, - "step": 956, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.00113582611084 - }, - { - "episode": 15328, - "epoch": 0.27551497285832405, - "loss/policy_avg": 0.926065981388092, - "lr": 9.388420245398773e-06, - "objective/entropy": -161.94918823242188, - "objective/kl": 40.23236083984375, - "objective/non_score_reward": -4.023235321044922, - "objective/rlhf_reward": -13.692942714691164, - "objective/scores": 0.6, - "policy/approxkl_avg": 36.19329833984375, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5999109148979187, - "step": 957, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.995762586593628 - }, - { - "episode": 15344, - "epoch": 0.2758025667757127, - "loss/policy_avg": 0.9499435424804688, - "lr": 9.38778118609407e-06, - "objective/entropy": -131.8387451171875, - "objective/kl": 52.344879150390625, - "objective/non_score_reward": -5.234487533569336, - "objective/rlhf_reward": -19.514119465549555, - "objective/scores": 0.35595802480981553, - "policy/approxkl_avg": 97.26606750488281, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5245336294174194, - "step": 958, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9986298084259033 - }, - { - "episode": 15360, - "epoch": 0.2760901606931013, - "loss/policy_avg": -0.003145521506667137, - "lr": 9.387142126789367e-06, - "objective/entropy": -165.37252807617188, - "objective/kl": 54.373069763183594, - "objective/non_score_reward": -5.437307357788086, - "objective/rlhf_reward": -18.82550874793646, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 13.774616241455078, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5953001976013184, - "step": 959, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0011730194091797 - }, - { - "episode": 15376, - "epoch": 0.27637775461048997, - "loss/policy_avg": 0.4764801859855652, - "lr": 9.386503067484664e-06, - "objective/entropy": -242.42982482910156, - "objective/kl": 56.915626525878906, - "objective/non_score_reward": -5.691562652587891, - "objective/rlhf_reward": -22.766250133514404, - "objective/scores": 0.0, - "policy/approxkl_avg": 29.58062744140625, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6144382953643799, - "step": 960, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9959168434143066 - }, - { - "episode": 15392, - "epoch": 0.27666534852787866, - "loss/policy_avg": 1.376183271408081, - "lr": 9.38586400817996e-06, - "objective/entropy": -322.4050598144531, - "objective/kl": 36.3888053894043, - "objective/non_score_reward": -3.638880729675293, - "objective/rlhf_reward": -13.213887742071776, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 44.334632873535156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.613652765750885, - "step": 961, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9986045360565186 - }, - { - "episode": 15408, - "epoch": 0.2769529424452673, - "loss/policy_avg": 1.6320432424545288, - "lr": 9.385224948875256e-06, - "objective/entropy": -79.29718017578125, - "objective/kl": 59.607242584228516, - "objective/non_score_reward": -5.960724353790283, - "objective/rlhf_reward": -22.51738360884778, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 87.03724670410156, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5909746885299683, - "step": 962, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.998765468597412 - }, - { - "episode": 15424, - "epoch": 0.27724053636265594, - "loss/policy_avg": 2.325925588607788, - "lr": 9.384585889570553e-06, - "objective/entropy": -113.67591857910156, - "objective/kl": 55.53913116455078, - "objective/non_score_reward": -5.553913116455078, - "objective/rlhf_reward": -20.482319132486978, - "objective/scores": 0.43333333333333335, - "policy/approxkl_avg": 8.882341384887695, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6297260522842407, - "step": 963, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9990952014923096 - }, - { - "episode": 15440, - "epoch": 0.2775281302800446, - "loss/policy_avg": 0.5869482159614563, - "lr": 9.38394683026585e-06, - "objective/entropy": -206.8366241455078, - "objective/kl": 48.33332824707031, - "objective/non_score_reward": -4.833332538604736, - "objective/rlhf_reward": -14.933330869674684, - "objective/scores": 1.1, - "policy/approxkl_avg": 49.717071533203125, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.7187210917472839, - "step": 964, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9980008602142334 - }, - { - "episode": 15456, - "epoch": 0.2778157241974332, - "loss/policy_avg": 1.1654845476150513, - "lr": 9.383307770961147e-06, - "objective/entropy": -379.75067138671875, - "objective/kl": 57.3199462890625, - "objective/non_score_reward": -5.73199462890625, - "objective/rlhf_reward": -20.8052737138429, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 37.82645034790039, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5788445472717285, - "step": 965, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9972472190856934 - }, - { - "episode": 15472, - "epoch": 0.27810331811482186, - "loss/policy_avg": 0.9802603721618652, - "lr": 9.382668711656443e-06, - "objective/entropy": 101.25556945800781, - "objective/kl": 61.29494857788086, - "objective/non_score_reward": -6.129494667053223, - "objective/rlhf_reward": -21.5942603691828, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 45.391502380371094, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5674448013305664, - "step": 966, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9998470544815063 - }, - { - "episode": 15488, - "epoch": 0.2783909120322105, - "loss/policy_avg": 1.8580645322799683, - "lr": 9.382029652351739e-06, - "objective/entropy": -102.23580932617188, - "objective/kl": 37.78134536743164, - "objective/non_score_reward": -3.77813458442688, - "objective/rlhf_reward": -13.787025485068483, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 1.6718518733978271, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.3454323410987854, - "step": 967, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 2, - "val/ratio": 2.001361131668091 - }, - { - "episode": 15504, - "epoch": 0.27867850594959914, - "loss/policy_avg": 0.8354310393333435, - "lr": 9.381390593047035e-06, - "objective/entropy": -306.50030517578125, - "objective/kl": 31.03956413269043, - "objective/non_score_reward": -3.103956460952759, - "objective/rlhf_reward": -10.859566538539484, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 0.7312620878219604, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.37779200077056885, - "step": 968, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.000714063644409 - }, - { - "episode": 15520, - "epoch": 0.27896609986698784, - "loss/policy_avg": 0.009227469563484192, - "lr": 9.380751533742332e-06, - "objective/entropy": -336.61578369140625, - "objective/kl": 41.42436599731445, - "objective/non_score_reward": -4.142436504364014, - "objective/rlhf_reward": -15.119147877307281, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 8.629606246948242, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.3866472542285919, - "step": 969, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.999985933303833 - }, - { - "episode": 15536, - "epoch": 0.2792536937843765, - "loss/policy_avg": -0.041605472564697266, - "lr": 9.380112474437628e-06, - "objective/entropy": -282.89971923828125, - "objective/kl": 48.89763259887695, - "objective/non_score_reward": -4.889763355255127, - "objective/rlhf_reward": -17.436346950308355, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 5.264449596405029, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6422872543334961, - "step": 970, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9998466968536377 - }, - { - "episode": 15552, - "epoch": 0.2795412877017651, - "loss/policy_avg": 0.36232057213783264, - "lr": 9.379473415132924e-06, - "objective/entropy": -300.8648681640625, - "objective/kl": 41.13935852050781, - "objective/non_score_reward": -4.113935470581055, - "objective/rlhf_reward": -15.130230460196657, - "objective/scores": 0.3313782131597591, - "policy/approxkl_avg": 50.30149841308594, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5609759092330933, - "step": 971, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985394477844238 - }, - { - "episode": 15568, - "epoch": 0.27982888161915376, - "loss/policy_avg": 0.5932891964912415, - "lr": 9.378834355828221e-06, - "objective/entropy": -174.238037109375, - "objective/kl": 50.591854095458984, - "objective/non_score_reward": -5.059185981750488, - "objective/rlhf_reward": -18.7861455484346, - "objective/scores": 0.36264953503719355, - "policy/approxkl_avg": 36.18544006347656, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.4679606854915619, - "step": 972, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 1.9989001750946045 - }, - { - "episode": 15584, - "epoch": 0.2801164755365424, - "loss/policy_avg": 1.0932896137237549, - "lr": 9.378195296523518e-06, - "objective/entropy": -144.99090576171875, - "objective/kl": 58.75678253173828, - "objective/non_score_reward": -5.875679016113281, - "objective/rlhf_reward": -22.021762016232373, - "objective/scores": 0.3702381544273198, - "policy/approxkl_avg": 71.56244659423828, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5289046764373779, - "step": 973, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 1.9984965324401855 - }, - { - "episode": 15600, - "epoch": 0.28040406945393104, - "loss/policy_avg": 0.6273190379142761, - "lr": 9.377556237218815e-06, - "objective/entropy": -122.2640151977539, - "objective/kl": 44.235450744628906, - "objective/non_score_reward": -4.423544883728027, - "objective/rlhf_reward": -17.694178819656372, - "objective/scores": 0.0, - "policy/approxkl_avg": 11.347247123718262, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.3611798584461212, - "step": 974, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.999008297920227 - }, - { - "episode": 15616, - "epoch": 0.2806916633713197, - "loss/policy_avg": 0.09601998329162598, - "lr": 9.37691717791411e-06, - "objective/entropy": -131.6696014404297, - "objective/kl": 48.32147979736328, - "objective/non_score_reward": -4.832147598266602, - "objective/rlhf_reward": -17.81282004097336, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 7.570517063140869, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5157725811004639, - "step": 975, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0004305839538574 - }, - { - "episode": 15632, - "epoch": 0.2809792572887083, - "loss/policy_avg": -0.13817770779132843, - "lr": 9.376278118609407e-06, - "objective/entropy": -255.62921142578125, - "objective/kl": 53.164588928222656, - "objective/non_score_reward": -5.316458702087402, - "objective/rlhf_reward": -19.750063502582247, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 2.5946006774902344, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.5641098618507385, - "step": 976, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0020365715026855 - }, - { - "episode": 15648, - "epoch": 0.281266851206097, - "loss/policy_avg": 1.142382264137268, - "lr": 9.375639059304704e-06, - "objective/entropy": -25.651283264160156, - "objective/kl": 67.4949722290039, - "objective/non_score_reward": -6.749497413635254, - "objective/rlhf_reward": -25.619387486068113, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 5.648348808288574, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.6707190275192261, - "step": 977, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 4, - "val/ratio": 1.9988956451416016 - }, - { - "episode": 15664, - "epoch": 0.28155444512348565, - "loss/policy_avg": 0.6753113269805908, - "lr": 9.375000000000001e-06, - "objective/entropy": -391.8768005371094, - "objective/kl": 44.86485290527344, - "objective/non_score_reward": -4.486485481262207, - "objective/rlhf_reward": -13.545940971374513, - "objective/scores": 1.1, - "policy/approxkl_avg": 6.812070846557617, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.8541865348815918, - "step": 978, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 17, - "val/ratio": 2.000732183456421 - }, - { - "episode": 15680, - "epoch": 0.2818420390408743, - "loss/policy_avg": 0.3577782213687897, - "lr": 9.374360940695298e-06, - "objective/entropy": -254.20706176757812, - "objective/kl": 47.57148742675781, - "objective/non_score_reward": -4.757148742675781, - "objective/rlhf_reward": -17.203765745433877, - "objective/scores": 0.4562071871080222, - "policy/approxkl_avg": 93.71537780761719, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6484947800636292, - "step": 979, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9985871315002441 - }, - { - "episode": 15696, - "epoch": 0.28212963295826293, - "loss/policy_avg": -0.1328345537185669, - "lr": 9.373721881390595e-06, - "objective/entropy": 28.385881423950195, - "objective/kl": 49.55193328857422, - "objective/non_score_reward": -4.955193042755127, - "objective/rlhf_reward": -18.44217047938476, - "objective/scores": 0.34465054211822604, - "policy/approxkl_avg": 10.326236724853516, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.7927839159965515, - "step": 980, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 3, - "val/ratio": 2.0010528564453125 - }, - { - "episode": 15712, - "epoch": 0.2824172268756516, - "loss/policy_avg": 0.12570828199386597, - "lr": 9.37308282208589e-06, - "objective/entropy": -141.36134338378906, - "objective/kl": 54.92476272583008, - "objective/non_score_reward": -5.492476463317871, - "objective/rlhf_reward": -20.454133355411226, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 3.49436354637146, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.5999050140380859, - "step": 981, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 2.0009312629699707 - }, - { - "episode": 15728, - "epoch": 0.2827048207930402, - "loss/policy_avg": 2.4626569747924805, - "lr": 9.372443762781187e-06, - "objective/entropy": -266.347900390625, - "objective/kl": 47.33668899536133, - "objective/non_score_reward": -4.733669281005859, - "objective/rlhf_reward": -17.330556426111777, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 3.0285089015960693, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5679370164871216, - "step": 982, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 2.0021414756774902 - }, - { - "episode": 15744, - "epoch": 0.28299241471042885, - "loss/policy_avg": 0.1242997869849205, - "lr": 9.371804703476484e-06, - "objective/entropy": -346.93878173828125, - "objective/kl": 35.437347412109375, - "objective/non_score_reward": -3.5437347888946533, - "objective/rlhf_reward": -11.774938917160034, - "objective/scores": 0.6, - "policy/approxkl_avg": 42.03936004638672, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.617620587348938, - "step": 983, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9993083477020264 - }, - { - "episode": 15760, - "epoch": 0.28328000862781755, - "loss/policy_avg": 0.14840182662010193, - "lr": 9.37116564417178e-06, - "objective/entropy": -368.4073486328125, - "objective/kl": 28.31252670288086, - "objective/non_score_reward": -2.8312525749206543, - "objective/rlhf_reward": -9.202304425016914, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 11.93632698059082, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6155978441238403, - "step": 984, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 11, - "val/ratio": 1.9987332820892334 - }, - { - "episode": 15776, - "epoch": 0.2835676025452062, - "loss/policy_avg": 0.47417038679122925, - "lr": 9.370526584867077e-06, - "objective/entropy": -141.45626831054688, - "objective/kl": 50.39579772949219, - "objective/non_score_reward": -5.039579391479492, - "objective/rlhf_reward": -18.79906793806402, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 97.74449157714844, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5000198483467102, - "step": 985, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.001434803009033 - }, - { - "episode": 15792, - "epoch": 0.2838551964625948, - "loss/policy_avg": 1.3729290962219238, - "lr": 9.369887525562373e-06, - "objective/entropy": -411.28570556640625, - "objective/kl": 35.94482421875, - "objective/non_score_reward": -3.594482421875, - "objective/rlhf_reward": -9.97793016433716, - "objective/scores": 1.1, - "policy/approxkl_avg": 10.14614486694336, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6557059288024902, - "step": 986, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 2.0011909008026123 - }, - { - "episode": 15808, - "epoch": 0.28414279037998347, - "loss/policy_avg": 3.4433889389038086, - "lr": 9.36924846625767e-06, - "objective/entropy": -201.43887329101562, - "objective/kl": 51.010643005371094, - "objective/non_score_reward": -5.1010637283325195, - "objective/rlhf_reward": -18.28155011154798, - "objective/scores": 0.5306765580733931, - "policy/approxkl_avg": 24.718257904052734, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.6495320200920105, - "step": 987, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984331130981445 - }, - { - "episode": 15824, - "epoch": 0.2844303842973721, - "loss/policy_avg": 2.152106761932373, - "lr": 9.368609406952966e-06, - "objective/entropy": -109.68426513671875, - "objective/kl": 36.905975341796875, - "objective/non_score_reward": -3.6905977725982666, - "objective/rlhf_reward": -13.206131546702935, - "objective/scores": 0.38906482631788786, - "policy/approxkl_avg": 6.616457939147949, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.632394552230835, - "step": 988, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9983766078948975 - }, - { - "episode": 15840, - "epoch": 0.28471797821476075, - "loss/policy_avg": 0.4088176488876343, - "lr": 9.367970347648263e-06, - "objective/entropy": -241.9044647216797, - "objective/kl": 47.368995666503906, - "objective/non_score_reward": -4.736899375915527, - "objective/rlhf_reward": -17.58834787580816, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 2.87666654586792, - "policy/clipfrac_avg": 0.5, - "policy/entropy_avg": 0.7490575313568115, - "step": 989, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9989919662475586 - }, - { - "episode": 15856, - "epoch": 0.2850055721321494, - "loss/policy_avg": 6.076893329620361, - "lr": 9.367331288343558e-06, - "objective/entropy": -96.00433349609375, - "objective/kl": 45.789695739746094, - "objective/non_score_reward": -4.578969955444336, - "objective/rlhf_reward": -15.39215913856146, - "objective/scores": 0.7309297535714575, - "policy/approxkl_avg": 15.434422492980957, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5452468991279602, - "step": 990, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 5, - "val/ratio": 2.0014736652374268 - }, - { - "episode": 15872, - "epoch": 0.285293166049538, - "loss/policy_avg": -0.9232574701309204, - "lr": 9.366692229038855e-06, - "objective/entropy": -144.09912109375, - "objective/kl": 46.76097869873047, - "objective/non_score_reward": -4.676098346710205, - "objective/rlhf_reward": -17.345143043731134, - "objective/scores": 0.33981246656813147, - "policy/approxkl_avg": 1.897438645362854, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6364270448684692, - "step": 991, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.0024867057800293 - }, - { - "episode": 15888, - "epoch": 0.2855807599669267, - "loss/policy_avg": 0.041624486446380615, - "lr": 9.366053169734152e-06, - "objective/entropy": -139.56771850585938, - "objective/kl": 38.63270950317383, - "objective/non_score_reward": -3.8632712364196777, - "objective/rlhf_reward": -11.05308494567871, - "objective/scores": 1.1, - "policy/approxkl_avg": 9.528964042663574, - "policy/clipfrac_avg": 1.5, - "policy/entropy_avg": 0.6615604162216187, - "step": 992, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 2.002514600753784 - }, - { - "episode": 15904, - "epoch": 0.28586835388431536, - "loss/policy_avg": 0.31544625759124756, - "lr": 9.365414110429449e-06, - "objective/entropy": -208.21640014648438, - "objective/kl": 53.08674240112305, - "objective/non_score_reward": -5.308674335479736, - "objective/rlhf_reward": -19.834697103500368, - "objective/scores": 0.35, - "policy/approxkl_avg": 6.563029766082764, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.6068991422653198, - "step": 993, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 7, - "val/ratio": 1.9984232187271118 - }, - { - "episode": 15920, - "epoch": 0.286155947801704, - "loss/policy_avg": -0.1134636178612709, - "lr": 9.364775051124744e-06, - "objective/entropy": -341.1083984375, - "objective/kl": 38.53087615966797, - "objective/non_score_reward": -3.8530876636505127, - "objective/rlhf_reward": -13.808230671946127, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 2.8780159950256348, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.47948139905929565, - "step": 994, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 2.002434015274048 - }, - { - "episode": 15936, - "epoch": 0.28644354171909264, - "loss/policy_avg": 0.7322211265563965, - "lr": 9.364135991820041e-06, - "objective/entropy": -335.6710510253906, - "objective/kl": 40.26041793823242, - "objective/non_score_reward": -4.0260419845581055, - "objective/rlhf_reward": -14.588396155627901, - "objective/scores": 0.37894294565112985, - "policy/approxkl_avg": 4.264369010925293, - "policy/clipfrac_avg": 1.0, - "policy/entropy_avg": 0.6806702017784119, - "step": 995, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 12, - "val/ratio": 2.0014522075653076 - }, - { - "episode": 15952, - "epoch": 0.2867311356364813, - "loss/policy_avg": 0.9409044981002808, - "lr": 9.363496932515338e-06, - "objective/entropy": -169.52256774902344, - "objective/kl": 33.46562576293945, - "objective/non_score_reward": -3.346562385559082, - "objective/rlhf_reward": -10.98624954223633, - "objective/scores": 0.6, - "policy/approxkl_avg": 16.522533416748047, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.552007794380188, - "step": 996, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 6, - "val/ratio": 1.9994235038757324 - }, - { - "episode": 15968, - "epoch": 0.2870187295538699, - "loss/policy_avg": 0.1874466985464096, - "lr": 9.362857873210635e-06, - "objective/entropy": 49.27545928955078, - "objective/kl": 53.142845153808594, - "objective/non_score_reward": -5.314284324645996, - "objective/rlhf_reward": -19.653018269602377, - "objective/scores": 0.40102999566398123, - "policy/approxkl_avg": 20.86511993408203, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.4496247172355652, - "step": 997, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 1, - "val/ratio": 1.9997801780700684 - }, - { - "episode": 15984, - "epoch": 0.28730632347125856, - "loss/policy_avg": 0.2505379319190979, - "lr": 9.362218813905932e-06, - "objective/entropy": -353.1224365234375, - "objective/kl": 32.73716354370117, - "objective/non_score_reward": -3.273716449737549, - "objective/rlhf_reward": -11.753230860739379, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 0.9558770656585693, - "policy/clipfrac_avg": 0.75, - "policy/entropy_avg": 0.5755044221878052, - "step": 998, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 10, - "val/ratio": 1.9991295337677002 - }, - { - "episode": 16000, - "epoch": 0.28759391738864726, - "loss/policy_avg": 0.1776025891304016, - "lr": 9.361579754601227e-06, - "objective/entropy": -239.99009704589844, - "objective/kl": 40.456748962402344, - "objective/non_score_reward": -4.045674800872803, - "objective/rlhf_reward": -14.841064026861815, - "objective/scores": 0.33540891336663825, - "policy/approxkl_avg": 104.76905822753906, - "policy/clipfrac_avg": 1.25, - "policy/entropy_avg": 0.5998741388320923, - "step": 999, - "val/clipfrac_avg": 0.0, - "val/num_eos_tokens": 8, - "val/ratio": 1.9990432262420654 - } - ], - "logging_steps": 500, - "max_steps": 7824, - "num_input_tokens_seen": 0, - "num_train_epochs": 9.0, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": true, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0, - "train_batch_size": null, - "trial_name": null, - "trial_params": null -}