NCERL-Diverse-PCG / src / rlkit / samplers / rollout_functions.py
import numpy as np
import torch
from src.rlkit.torch import pytorch_util as ptu


def multitask_rollout(
        env,
        agent,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
        observation_key=None,
        desired_goal_key=None,
        get_action_kwargs=None,
        return_dict_obs=False,
):
    """
    Roll out a goal-conditioned agent in an env with dict observations: the goal
    read from `desired_goal_key` at reset is concatenated to the observation at
    every step.
    """
    if render_kwargs is None:
        render_kwargs = {}
    if get_action_kwargs is None:
        get_action_kwargs = {}
    dict_obs = []
    dict_next_obs = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    next_observations = []
    path_length = 0
    agent.reset()
    o = env.reset()
    if render:
        env.render(**render_kwargs)
    goal = o[desired_goal_key]
    while path_length < max_path_length:
        dict_obs.append(o)
        if observation_key:
            o = o[observation_key]
        new_obs = np.hstack((o, goal))
        a, agent_info = agent.get_action(new_obs, **get_action_kwargs)
        next_o, r, d, env_info = env.step(a)
        if render:
            env.render(**render_kwargs)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        next_observations.append(next_o)
        dict_next_obs.append(next_o)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    next_observations = np.array(next_observations)
    if return_dict_obs:
        observations = dict_obs
        next_observations = dict_next_obs
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        goals=np.repeat(goal[None], path_length, 0),
        full_observations=dict_obs,
    )
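
# Illustrative sketch (not part of the original file): multitask_rollout expects a
# goal-conditioned env whose reset()/step() observations are dicts. The key names
# ('observation', 'desired_goal') and the `env`/`policy` objects below are
# assumptions for illustration only.
#
#   path = multitask_rollout(
#       env, policy,
#       max_path_length=50,
#       observation_key='observation',
#       desired_goal_key='desired_goal',
#   )
#   # path['observations'] has shape (T, obs_dim);
#   # path['goals'] repeats the episode goal T times.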


def rollout(
        env,
        agent,
        noise_flag=0,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            # optionally perturb the reward with standard Gaussian noise
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
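
# Illustrative sketch (not part of the original file): collecting one path with
# `rollout`. The env name is a hypothetical choice, and `policy` is assumed to be
# any object exposing reset() and get_action(obs) -> (action, info); any
# gym-style env whose step() returns (obs, reward, done, info) will do.
#
#   import gym
#   env = gym.make('Pendulum-v0')
#   path = rollout(env, policy, max_path_length=200)
#   print(path['rewards'].sum())   # undiscounted return of the path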


def ensemble_rollout(
        env,
        agent,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    # pick one ensemble member at random and use it for the whole episode
    en_index = np.random.randint(num_ensemble)
    agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent[en_index].get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # Bernoulli bootstrap mask over ensemble members; resample one member if all zeros
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
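
# Illustrative sketch (not part of the original file): how the per-step bootstrap
# masks returned above are typically consumed. Each column of `masks` belongs to
# one ensemble member, so different members are trained on different (overlapping)
# subsets of transitions. `per_sample_bellman_error` is a hypothetical name for
# whatever per-transition loss the trainer computes.
#
#   path = ensemble_rollout(env, agents, num_ensemble=5, ber_mean=0.5)
#   masks = path['masks']                  # shape (T, num_ensemble)
#   for k in range(5):
#       weight_k = masks[:, k]             # 0/1 weight per transition for member k
#       loss_k = (weight_k * per_sample_bellman_error[:, k]).sum() / max(weight_k.sum(), 1)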


def get_ucb_std(obs, policy_action, inference_type, critic1, critic2,
                feedback_type, en_index, num_ensemble):
    """
    Return an upper-confidence-bound score for one candidate action:
    mean Q-value plus `inference_type` times the standard deviation of the
    Q-estimates (twin critics of one member, or the whole ensemble).
    """
    obs = ptu.from_numpy(obs).float()
    policy_action = ptu.from_numpy(policy_action).float()
    obs = obs.reshape(1, -1)
    policy_action = policy_action.reshape(1, -1)
    if feedback_type == 0 or feedback_type == 2:
        # uncertainty estimated from the twin critics of a single ensemble member
        with torch.no_grad():
            target_Q1 = critic1[en_index](obs, policy_action)
            target_Q2 = critic2[en_index](obs, policy_action)
        mean_Q = 0.5 * (target_Q1.detach() + target_Q2.detach())
        var_Q = 0.5 * ((target_Q1.detach() - mean_Q) ** 2 + (target_Q2.detach() - mean_Q) ** 2)
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    elif feedback_type == 1 or feedback_type == 3:
        # uncertainty estimated across the twin critics of the whole ensemble
        mean_Q, var_Q = None, None
        L_target_Q = []
        for en_index in range(num_ensemble):
            with torch.no_grad():
                target_Q1 = critic1[en_index](obs, policy_action)
                target_Q2 = critic2[en_index](obs, policy_action)
            L_target_Q.append(target_Q1)
            L_target_Q.append(target_Q2)
            if en_index == 0:
                mean_Q = 0.5 * (target_Q1 + target_Q2) / num_ensemble
            else:
                mean_Q += 0.5 * (target_Q1 + target_Q2) / num_ensemble
        temp_count = 0
        for target_Q in L_target_Q:
            if temp_count == 0:
                var_Q = (target_Q.detach() - mean_Q) ** 2
            else:
                var_Q += (target_Q.detach() - mean_Q) ** 2
            temp_count += 1
        var_Q = var_Q / temp_count
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    return ucb_score
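
# Worked illustration (not part of the original file): with feedback_type 0/2 and
# twin critic outputs Q1 = 1.0, Q2 = 3.0, we get mean_Q = 2.0 and
# var_Q = 0.5 * ((1 - 2)^2 + (3 - 2)^2) = 1.0, so with inference_type = 1.5 the
# score is ucb = 2.0 + 1.5 * sqrt(1.0) = 3.5. A larger inference_type favours
# actions whose value the critics disagree on (optimism in the face of
# uncertainty); inference_type = 0 reduces the score to the mean Q-value.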


def ensemble_ucb_rollout(
        env,
        agent,
        critic1,
        critic2,
        inference_type,
        feedback_type,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # query every ensemble member and keep the action with the highest UCB score
        a_max, ucb_max, agent_info_max = None, None, None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            ucb_score = get_ucb_std(o, _a, inference_type, critic1, critic2,
                                    feedback_type, en_index, num_ensemble)
            if en_index == 0:
                a_max = _a
                ucb_max = ucb_score
                agent_info_max = agent_info
            else:
                if ucb_score > ucb_max:
                    ucb_max = ucb_score
                    a_max = _a
                    agent_info_max = agent_info
        next_o, r, d, env_info = env.step(a_max)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a_max)
        agent_infos.append(agent_info_max)
        env_infos.append(env_info)
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )


def ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # act with the mean action of all ensemble members
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        next_o, r, d, env_info = env.step(a)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
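
# Worked illustration (not part of the original file): with three ensemble members
# proposing actions [0.2], [0.4] and [0.6] for the same observation, the executed
# action is their mean, ([0.2] + [0.4] + [0.6]) / 3 = [0.4]. Averaging the member
# policies gives a single deterministic evaluation behaviour for the ensemble.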


def async_ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    Evaluation rollout for an asynchronous env whose step() returns only
    (next_observation, done). Rewards and env_infos are therefore not
    available; the returned dict contains the keys that can be filled:
     - observations, actions, terminals: 2D arrays, first dimension is time
     - next_observations: 2D array aligned with observations
     - agent_infos: list of dictionaries indexed by time step
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    terminals = []
    agent_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # act with the mean action of all ensemble members
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        # the asynchronous env's step() returns only (next_observation, done)
        next_o, d = env.step(a)
        observations.append(o)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
    )