import argparse
import os

import gym
import numpy as np
import torch
from tensorboardX import SummaryWriter

from agent.DiPo import DiPo
from agent.replay_memory import ReplayMemory, DiffusionMemory
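
# This script trains DiPo (by default a diffusion-model policy) on a MuJoCo Gym
# environment: it collects transitions online, stores them in the replay buffers,
# and logs training and evaluation returns to TensorBoard under `record/`.
#
# Example invocation (assuming this file is saved as main.py; the flag values shown
# are simply the defaults from readParser below):
#
#   python main.py --env_name Hopper-v3 --policy_type Diffusion --seed 0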


def readParser():
    parser = argparse.ArgumentParser(description='Diffusion Policy')
    parser.add_argument('--env_name', default="Hopper-v3",
                        help='Mujoco Gym environment (default: Hopper-v3)')
    parser.add_argument('--seed', type=int, default=0, metavar='N',
                        help='random seed (default: 0)')

    parser.add_argument('--num_steps', type=int, default=1000000, metavar='N',
                        help='env timesteps (default: 1000000)')

    parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                        help='batch size (default: 256)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                        help='target smoothing coefficient (τ) (default: 0.005)')
    parser.add_argument('--update_actor_target_every', type=int, default=1, metavar='N',
                        help='update actor target per iteration (default: 1)')

    parser.add_argument("--policy_type", type=str, default="Diffusion", metavar='S',
                        help="Diffusion, VAE or MLP")
    parser.add_argument("--beta_schedule", type=str, default="cosine", metavar='S',
                        help="linear, cosine or vp")
    parser.add_argument('--n_timesteps', type=int, default=100, metavar='N',
                        help='diffusion timesteps (default: 100)')
    parser.add_argument('--diffusion_lr', type=float, default=0.0003, metavar='G',
                        help='diffusion learning rate (default: 0.0003)')
    parser.add_argument('--critic_lr', type=float, default=0.0003, metavar='G',
                        help='critic learning rate (default: 0.0003)')
    parser.add_argument('--action_lr', type=float, default=0.03, metavar='G',
                        help='action learning rate (default: 0.03)')
    parser.add_argument('--noise_ratio', type=float, default=1.0, metavar='G',
                        help='noise ratio in sample process (default: 1.0)')

    parser.add_argument('--action_gradient_steps', type=int, default=20, metavar='N',
                        help='action gradient steps (default: 20)')
    parser.add_argument('--ratio', type=float, default=0.1, metavar='G',
                        help='the ratio of action grad norm to action_dim (default: 0.1)')
    parser.add_argument('--ac_grad_norm', type=float, default=2.0, metavar='G',
                        help='actor and critic grad norm (default: 2.0)')

    parser.add_argument('--cuda', default='cuda:0',
                        help='run on CUDA (default: cuda:0)')

    return parser.parse_args()


def evaluate(env, agent, writer, steps):
    """Roll out the current policy for a few episodes and log the mean return."""
    episodes = 10
    returns = np.zeros((episodes,), dtype=np.float32)

    for i in range(episodes):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            action = agent.sample_action(state, eval=True)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        returns[i] = episode_reward

    mean_return = np.mean(returns)

    writer.add_scalar('reward/test', mean_return, steps)

    print('-' * 60)
    print(f'Num steps: {steps:<5} '
          f'reward: {mean_return:<5.1f}')
    print('-' * 60)


def main(args=None):
    if args is None:
        args = readParser()

    device = torch.device(args.cuda)

    log_root = "record"
    log_dir = os.path.join(log_root, f'{args.env_name}', f'policy_type={args.policy_type}', f'ratio={args.ratio}', f'seed={args.seed}')
    writer = SummaryWriter(log_dir)

    env = gym.make(args.env_name)
    state_size = int(np.prod(env.observation_space.shape))
    action_size = int(np.prod(env.action_space.shape))
    print(f'action size: {action_size}')

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    memory_size = int(1e6)
    num_steps = args.num_steps
    start_steps = 10000
    eval_interval = 10000
    updates_per_step = 1
    batch_size = args.batch_size
    log_interval = 10

    memory = ReplayMemory(state_size, action_size, memory_size, device)
    diffusion_memory = DiffusionMemory(state_size, action_size, memory_size, device)

    agent = DiPo(args, state_size, env.action_space, memory, diffusion_memory, device)

    steps = 0
    episodes = 0
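
    # Interaction loop: the first `start_steps` environment steps use uniformly random
    # actions to seed the replay buffers; after that, actions are sampled from the agent
    # and agent.train is called every step, with an evaluation pass every `eval_interval`.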
    while steps < num_steps:
        episode_reward = 0.
        episode_steps = 0
        done = False
        state = env.reset()
        episodes += 1
        while not done:
            if start_steps > steps:
                action = env.action_space.sample()
            else:
                action = agent.sample_action(state, eval=False)
            next_state, reward, done, _ = env.step(action)
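
            # The replay 'mask' stores the discount rather than a raw done flag:
            # 0.0 at terminal states, args.gamma otherwise (presumably so terminal
            # transitions are not bootstrapped inside agent.train).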
            mask = 0.0 if done else args.gamma

            steps += 1
            episode_steps += 1
            episode_reward += reward

            agent.append_memory(state, action, reward, next_state, mask)

            if steps >= start_steps:
                agent.train(updates_per_step, batch_size=batch_size, log_writer=writer)

            if steps % eval_interval == 0:
                evaluate(env, agent, writer, steps)
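                # evaluate() reuses (and resets) the training env, so end the
                # current training episode here and start a fresh one.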
                done = True

            state = next_state

        print(f'episode: {episodes:<4} '
              f'episode steps: {episode_steps:<4} '
              f'reward: {episode_reward:<5.1f}')


if __name__ == "__main__":
    main()