import numpy as np
import torch

from src.rlkit.torch import pytorch_util as ptu

def multitask_rollout(
        env,
        agent,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
        observation_key=None,
        desired_goal_key=None,
        get_action_kwargs=None,
        return_dict_obs=False,
):
    """
    Roll out a goal-conditioned policy. At every step the (flat) observation
    is concatenated with the episode goal before being passed to the agent.
    """
    if render_kwargs is None:
        render_kwargs = {}
    if get_action_kwargs is None:
        get_action_kwargs = {}
    dict_obs = []
    dict_next_obs = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    next_observations = []
    path_length = 0
    agent.reset()
    o = env.reset()
    if render:
        env.render(**render_kwargs)
    goal = o[desired_goal_key]
    while path_length < max_path_length:
        dict_obs.append(o)
        if observation_key:
            o = o[observation_key]
        new_obs = np.hstack((o, goal))
        a, agent_info = agent.get_action(new_obs, **get_action_kwargs)
        next_o, r, d, env_info = env.step(a)
        if render:
            env.render(**render_kwargs)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        next_observations.append(next_o)
        dict_next_obs.append(next_o)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    next_observations = np.array(next_observations)
    if return_dict_obs:
        observations = dict_obs
        next_observations = dict_next_obs
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        goals=np.repeat(goal[None], path_length, 0),
        full_observations=dict_obs,
    )
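
# Usage sketch for multitask_rollout (hypothetical names; assumes a gym-style
# goal-conditioned env with dict observations and an rlkit policy):
#
#     path = multitask_rollout(
#         env, policy,
#         max_path_length=50,
#         observation_key='observation',
#         desired_goal_key='desired_goal',
#     )
#     episode_return = path['rewards'].sum()
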
def rollout(
        env,
        agent,
        noise_flag=0,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    Roll out a single policy for one episode. When noise_flag == 1, standard
    Gaussian noise is added to every reward.

    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to time:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list being the index into time:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            # Perturb the reward with N(0, 1) noise.
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
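
# Usage sketch for rollout (hypothetical names; assumes `env` follows the old
# gym API where step() returns (obs, reward, done, info)):
#
#     path = rollout(env, policy, max_path_length=1000, noise_flag=0)
#     # path['observations'] has shape (T, obs_dim); path['rewards'] has shape (T, 1).
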
def ensemble_rollout(
        env,
        agent,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    Roll out one randomly chosen member of a policy ensemble and record a
    Bernoulli(ber_mean) bootstrap mask over the ensemble at every step.

    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to time:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
     - masks

    The next two elements will be lists of dictionaries, with the index into
    the list being the index into time:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    # Act with a single, randomly selected ensemble member for the whole episode.
    en_index = np.random.randint(num_ensemble)
    agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent[en_index].get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # Bernoulli bootstrap mask; if every entry is zero, force one random
        # member to 1 so at least one member sees this transition.
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
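
# Usage sketch for ensemble_rollout (hypothetical names; `policies` is assumed
# to be a list of num_ensemble rlkit policies sharing one env):
#
#     path = ensemble_rollout(env, policies, num_ensemble=len(policies),
#                             ber_mean=0.5, max_path_length=1000)
#     # path['masks'] has one row per step and one Bernoulli entry per member.
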
def get_ucb_std(obs, policy_action, inference_type, critic1, critic2,
                feedback_type, en_index, num_ensemble):
    """
    Compute a UCB-style score, mean_Q + inference_type * std_Q, for a single
    (observation, action) pair using the twin critics of the ensemble.
    For feedback_type 0 or 2 only the critics of member `en_index` are used;
    for feedback_type 1 or 3 the statistics are taken over the whole ensemble.
    """
    obs = ptu.from_numpy(obs).float()
    policy_action = ptu.from_numpy(policy_action).float()
    obs = obs.reshape(1, -1)
    policy_action = policy_action.reshape(1, -1)
    if feedback_type == 0 or feedback_type == 2:
        # Mean and variance of the two critics of a single ensemble member.
        with torch.no_grad():
            target_Q1 = critic1[en_index](obs, policy_action)
            target_Q2 = critic2[en_index](obs, policy_action)
        mean_Q = 0.5 * (target_Q1.detach() + target_Q2.detach())
        var_Q = 0.5 * ((target_Q1.detach() - mean_Q) ** 2 + (target_Q2.detach() - mean_Q) ** 2)
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    elif feedback_type == 1 or feedback_type == 3:
        # Mean and variance over all 2 * num_ensemble critic predictions.
        mean_Q, var_Q = None, None
        L_target_Q = []
        for en_index in range(num_ensemble):
            with torch.no_grad():
                target_Q1 = critic1[en_index](obs, policy_action)
                target_Q2 = critic2[en_index](obs, policy_action)
            L_target_Q.append(target_Q1)
            L_target_Q.append(target_Q2)
            if en_index == 0:
                mean_Q = 0.5 * (target_Q1 + target_Q2) / num_ensemble
            else:
                mean_Q += 0.5 * (target_Q1 + target_Q2) / num_ensemble
        temp_count = 0
        for target_Q in L_target_Q:
            if temp_count == 0:
                var_Q = (target_Q.detach() - mean_Q) ** 2
            else:
                var_Q += (target_Q.detach() - mean_Q) ** 2
            temp_count += 1
        var_Q = var_Q / temp_count
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    return ucb_score
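
# Usage sketch for get_ucb_std (hypothetical names; `critics1`/`critics2` are
# assumed to be lists of Q-networks mapping an (obs, action) batch to a value):
#
#     ucb = get_ucb_std(o, a, inference_type=1.0,
#                       critic1=critics1, critic2=critics2,
#                       feedback_type=1, en_index=0,
#                       num_ensemble=len(critics1))
#     # ucb is a single-element tensor: mean_Q + inference_type * std_Q.
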
def ensemble_ucb_rollout(
        env,
        agent,
        critic1,
        critic2,
        inference_type,
        feedback_type,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    Roll out a policy ensemble with UCB exploration: at every step each member
    proposes an action and the action with the highest UCB score (see
    get_ucb_std) is executed. A Bernoulli(ber_mean) bootstrap mask over the
    ensemble is recorded at every step.

    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to time:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
     - masks

    The next two elements will be lists of dictionaries, with the index into
    the list being the index into time:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Pick the candidate action with the highest UCB score across the ensemble.
        a_max, ucb_max, agent_info_max = None, None, None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            ucb_score = get_ucb_std(o, _a, inference_type, critic1, critic2,
                                    feedback_type, en_index, num_ensemble)
            if en_index == 0:
                a_max = _a
                ucb_max = ucb_score
                agent_info_max = agent_info
            elif ucb_score > ucb_max:
                ucb_max = ucb_score
                a_max = _a
                agent_info_max = agent_info
        next_o, r, d, env_info = env.step(a_max)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a_max)
        agent_infos.append(agent_info_max)
        env_infos.append(env_info)
        # Bernoulli bootstrap mask; force at least one member to see this transition.
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
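
# Usage sketch for ensemble_ucb_rollout (hypothetical names; pairs the policy
# list with the critic lists consumed by get_ucb_std):
#
#     path = ensemble_ucb_rollout(env, policies, critics1, critics2,
#                                 inference_type=1.0, feedback_type=1,
#                                 num_ensemble=len(policies),
#                                 max_path_length=1000)
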
def ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    Evaluation rollout for a policy ensemble: the executed action is the mean
    of the actions proposed by all ensemble members. Only the agent_info of
    the last ensemble member is recorded at each step.

    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to time:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list being the index into time:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Average the actions proposed by every ensemble member.
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        next_o, r, d, env_info = env.step(a)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
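
# Usage sketch for ensemble_eval_rollout (hypothetical names; evaluation uses
# the ensemble-mean action, so no bootstrap masks are produced):
#
#     path = ensemble_eval_rollout(env, policies, num_ensemble=len(policies),
#                                  max_path_length=1000)
#     # path['rewards'].sum() gives the undiscounted episode return.
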
def async_ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    Evaluation rollout for an asynchronous environment whose step() returns
    only (next_observation, done). The ensemble-mean action is executed at
    every step, but no transition data is collected and nothing is returned.
    """
    if render_kwargs is None:
        render_kwargs = {}
    # observations = []
    # actions = []
    # rewards = []
    # terminals = []
    # agent_infos = []
    # env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Average the actions proposed by every ensemble member.
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        # This env's step() is expected to return only (next_observation, done).
        next_o, d = env.step(a)
        # observations.append(o)
        # rewards.append(r)
        # terminals.append(d)
        # actions.append(a)
        # agent_infos.append(agent_info)
        # env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    # The post-processing below stays disabled: no transition data is
    # collected in this async variant, so these names are undefined here.
    # actions = np.array(actions)
    # if len(actions.shape) == 1:
    #     actions = np.expand_dims(actions, 1)
    # observations = np.array(observations)
    # if len(observations.shape) == 1:
    #     observations = np.expand_dims(observations, 1)
    #     next_o = np.array([next_o])
    # next_observations = np.vstack(
    #     (
    #         observations[1:, :],
    #         np.expand_dims(next_o, 0)
    #     )
    # )
    # return dict(
    #     observations=observations,
    #     actions=actions,
    #     rewards=np.array(rewards).reshape(-1, 1),
    #     next_observations=next_observations,
    #     terminals=np.array(terminals).reshape(-1, 1),
    #     agent_infos=agent_infos,
    #     env_infos=env_infos,
    # )
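
# Usage sketch for async_ensemble_eval_rollout (hypothetical names; assumes an
# asynchronous env whose step() returns only (next_observation, done)):
#
#     async_ensemble_eval_rollout(async_env, policies,
#                                 num_ensemble=len(policies),
#                                 max_path_length=1000)
#     # Steps the env with the ensemble-mean action; no trajectory is returned.
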