# importing d4rl registers the D4RL offline-RL environments with gym
import d4rl  # noqa
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline
# sampling and value-guidance hyperparameters for the planner
config = dict(
    n_samples=64,  # candidate trajectories sampled per environment step
    horizon=32,  # planning horizon (trajectory length)
    num_inference_steps=20,  # diffusion denoising steps
    n_guide_steps=0,  # value-gradient guidance steps (0 disables guidance)
    scale_grad_by_std=True,  # rescale the guidance gradient by the posterior std
    scale=0.1,  # guidance step size
    eta=0.0,  # sampling noise scale (0.0 gives deterministic sampling)
    t_grad_cutoff=2,  # skip guidance on the last denoising steps
    device="cpu",
)


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    # load the pretrained value-guided diffusion planner from the Hub
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy: plan with the diffusion model and return the denormalized action
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            score = env.get_normalized_score(total_reward)

            # update return
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass
print(f"Total reward: {total_reward}") | |