NCERL-Diverse-PCG / src / rlkit / samplers / rollout_functions.py
import numpy as np
import torch
from src.rlkit.torch import pytorch_util as ptu


def multitask_rollout(
        env,
        agent,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
        observation_key=None,
        desired_goal_key=None,
        get_action_kwargs=None,
        return_dict_obs=False,
):
    """
    Roll out a goal-conditioned agent in an env with dict observations: the goal
    read from `desired_goal_key` at reset is concatenated to the observation at
    every step.
    """
    if render_kwargs is None:
        render_kwargs = {}
    if get_action_kwargs is None:
        get_action_kwargs = {}
    dict_obs = []
    dict_next_obs = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    next_observations = []
    path_length = 0
    agent.reset()
    o = env.reset()
    if render:
        env.render(**render_kwargs)
    goal = o[desired_goal_key]
    while path_length < max_path_length:
        dict_obs.append(o)
        if observation_key:
            o = o[observation_key]
        new_obs = np.hstack((o, goal))
        a, agent_info = agent.get_action(new_obs, **get_action_kwargs)
        next_o, r, d, env_info = env.step(a)
        if render:
            env.render(**render_kwargs)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        next_observations.append(next_o)
        dict_next_obs.append(next_o)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    next_observations = np.array(next_observations)
    if return_dict_obs:
        observations = dict_obs
        next_observations = dict_next_obs
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        goals=np.repeat(goal[None], path_length, 0),
        full_observations=dict_obs,
    )
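
# Illustrative sketch (not part of the original file): multitask_rollout expects a
# goal-conditioned env whose reset()/step() observations are dicts. The key names
# ('observation', 'desired_goal') and the `env`/`policy` objects below are
# assumptions for illustration only.
#
#   path = multitask_rollout(
#       env, policy,
#       max_path_length=50,
#       observation_key='observation',
#       desired_goal_key='desired_goal',
#   )
#   # path['observations'] has shape (T, obs_dim);
#   # path['goals'] repeats the episode goal T times.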


def rollout(
        env,
        agent,
        noise_flag=0,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            # optionally perturb the reward with standard Gaussian noise
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
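
# Illustrative sketch (not part of the original file): collecting one path with
# `rollout`. The env name is a hypothetical choice, and `policy` is assumed to be
# any object exposing reset() and get_action(obs) -> (action, info); any
# gym-style env whose step() returns (obs, reward, done, info) will do.
#
#   import gym
#   env = gym.make('Pendulum-v0')
#   path = rollout(env, policy, max_path_length=200)
#   print(path['rewards'].sum())   # undiscounted return of the path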


def ensemble_rollout(
        env,
        agent,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    # pick one ensemble member at random and use it for the whole episode
    en_index = np.random.randint(num_ensemble)
    agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent[en_index].get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # Bernoulli bootstrap mask over ensemble members; resample one member if all zeros
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
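
# Illustrative sketch (not part of the original file): how the per-step bootstrap
# masks returned above are typically consumed. Each column of `masks` belongs to
# one ensemble member, so different members are trained on different (overlapping)
# subsets of transitions. `per_sample_bellman_error` is a hypothetical name for
# whatever per-transition loss the trainer computes.
#
#   path = ensemble_rollout(env, agents, num_ensemble=5, ber_mean=0.5)
#   masks = path['masks']                  # shape (T, num_ensemble)
#   for k in range(5):
#       weight_k = masks[:, k]             # 0/1 weight per transition for member k
#       loss_k = (weight_k * per_sample_bellman_error[:, k]).sum() / max(weight_k.sum(), 1)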


def get_ucb_std(obs, policy_action, inference_type, critic1, critic2,
                feedback_type, en_index, num_ensemble):
    """
    Return an upper-confidence-bound score for one candidate action:
    mean Q-value plus `inference_type` times the standard deviation of the
    Q-estimates (twin critics of one member, or the whole ensemble).
    """
    obs = ptu.from_numpy(obs).float()
    policy_action = ptu.from_numpy(policy_action).float()
    obs = obs.reshape(1, -1)
    policy_action = policy_action.reshape(1, -1)
    if feedback_type == 0 or feedback_type == 2:
        # uncertainty estimated from the twin critics of a single ensemble member
        with torch.no_grad():
            target_Q1 = critic1[en_index](obs, policy_action)
            target_Q2 = critic2[en_index](obs, policy_action)
        mean_Q = 0.5 * (target_Q1.detach() + target_Q2.detach())
        var_Q = 0.5 * ((target_Q1.detach() - mean_Q) ** 2 + (target_Q2.detach() - mean_Q) ** 2)
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    elif feedback_type == 1 or feedback_type == 3:
        # uncertainty estimated across the twin critics of the whole ensemble
        mean_Q, var_Q = None, None
        L_target_Q = []
        for en_index in range(num_ensemble):
            with torch.no_grad():
                target_Q1 = critic1[en_index](obs, policy_action)
                target_Q2 = critic2[en_index](obs, policy_action)
            L_target_Q.append(target_Q1)
            L_target_Q.append(target_Q2)
            if en_index == 0:
                mean_Q = 0.5 * (target_Q1 + target_Q2) / num_ensemble
            else:
                mean_Q += 0.5 * (target_Q1 + target_Q2) / num_ensemble
        temp_count = 0
        for target_Q in L_target_Q:
            if temp_count == 0:
                var_Q = (target_Q.detach() - mean_Q) ** 2
            else:
                var_Q += (target_Q.detach() - mean_Q) ** 2
            temp_count += 1
        var_Q = var_Q / temp_count
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    return ucb_score
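
# Worked illustration (not part of the original file): with feedback_type 0/2 and
# twin critic outputs Q1 = 1.0, Q2 = 3.0, we get mean_Q = 2.0 and
# var_Q = 0.5 * ((1 - 2)^2 + (3 - 2)^2) = 1.0, so with inference_type = 1.5 the
# score is ucb = 2.0 + 1.5 * sqrt(1.0) = 3.5. A larger inference_type favours
# actions whose value the critics disagree on (optimism in the face of
# uncertainty); inference_type = 0 reduces the score to the mean Q-value.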


def ensemble_ucb_rollout(
        env,
        agent,
        critic1,
        critic2,
        inference_type,
        feedback_type,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # mask for bootstrapping
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # query every ensemble member and keep the action with the highest UCB score
        a_max, ucb_max, agent_info_max = None, None, None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            ucb_score = get_ucb_std(o, _a, inference_type, critic1, critic2,
                                    feedback_type, en_index, num_ensemble)
            if en_index == 0:
                a_max = _a
                ucb_max = ucb_score
                agent_info_max = agent_info
            else:
                if ucb_score > ucb_max:
                    ucb_max = ucb_score
                    a_max = _a
                    agent_info_max = agent_info
        next_o, r, d, env_info = env.step(a_max)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a_max)
        agent_infos.append(agent_info_max)
        env_infos.append(env_info)
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )


def ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals
    The next two entries are lists of dictionaries, indexed by time step:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # act with the mean action of all ensemble members
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        next_o, r, d, env_info = env.step(a)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
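
# Worked illustration (not part of the original file): with three ensemble members
# proposing actions [0.2], [0.4] and [0.6] for the same observation, the executed
# action is their mean, ([0.2] + [0.4] + [0.6]) / 3 = [0.4]. Averaging the member
# policies gives a single deterministic evaluation behaviour for the ensemble.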


def async_ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    Evaluation rollout for an asynchronous env whose step() returns only
    (next_observation, done). Rewards and env_infos are therefore not
    available; the returned dict contains the keys that can be filled:
     - observations, actions, terminals: 2D arrays, first dimension is time
     - next_observations: 2D array aligned with observations
     - agent_infos: list of dictionaries indexed by time step
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    terminals = []
    agent_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # act with the mean action of all ensemble members
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        # the asynchronous env's step() returns only (next_observation, done)
        next_o, d = env.step(a)
        observations.append(o)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
    )