Spaces:

OpenDILabCommunity
/

gomoku

Sleeping

App Files Files Community

gomoku / DI-engine /ding /rl_utils /retrace.py

zjowowen

init space

3dfe8fb almost 2 years ago

raw

history blame contribute delete

2.15 kB

	import torch
	import torch.nn.functional as F
	from collections import namedtuple
	from ding.rl_utils.isw import compute_importance_weights


	def compute_q_retraces(
	        q_values: torch.Tensor,
	        v_pred: torch.Tensor,
	        rewards: torch.Tensor,
	        actions: torch.Tensor,
	        weights: torch.Tensor,
	        ratio: torch.Tensor,
	        gamma: float = 0.9
	) -> torch.Tensor:
	    """
	    Overview:
	        Compute Retrace-style Q-value targets with a backward recursion over the time dimension.
	        With ``a_t = actions[t]`` the recursion implemented here is:

	            q_retraces[T] = v_pred[T]
	            q_retraces[t] = rewards[t] + gamma * weights[t] * tmp[t + 1]
	            tmp[T]        = v_pred[T]
	            tmp[t]        = min(ratio[t, a_t], 1) * (q_retraces[t] - q_values[t, a_t]) + v_pred[t]

	    Arguments:
	        - q_values (:obj:`torch.Tensor`): Q-values of every discrete action at each of the T + 1 steps.
	        - v_pred (:obj:`torch.Tensor`): State-value estimates at each of the T + 1 steps; the last one \
	            bootstraps the recursion.
	        - rewards (:obj:`torch.Tensor`): Per-step rewards.
	        - actions (:obj:`torch.Tensor`): Indices of the taken actions; used as the ``gather`` index, so it \
	            must be an integer index tensor.
	        - weights (:obj:`torch.Tensor`): Per-step factor multiplied with ``gamma`` on the bootstrapped term \
	            (presumably a non-terminal / sample mask -- confirm against the caller).
	        - ratio (:obj:`torch.Tensor`): Importance ratio of every action; only the entry of the taken action \
	            is used, truncated to at most 1.0.
	        - gamma (:obj:`float`): Discount factor.
	    Returns:
	        - q_retraces (:obj:`torch.Tensor`): The computed targets, detached from the autograd graph.
	    Shapes:
	        - q_values (:obj:`torch.Tensor`): :math:`(T + 1, B, N)`, where T is unroll_len, B is batch size, N is \
	            discrete action dim.
	        - v_pred (:obj:`torch.Tensor`): :math:`(T + 1, B, 1)`
	        - rewards (:obj:`torch.Tensor`): :math:`(T, B)`
	        - actions (:obj:`torch.Tensor`): :math:`(T, B)`
	        - weights (:obj:`torch.Tensor`): :math:`(T, B)`
	        - ratio (:obj:`torch.Tensor`): :math:`(T, B, N)`
	        - q_retraces (:obj:`torch.Tensor`): :math:`(T + 1, B, 1)`
	    Examples:
	        >>> T=2
	        >>> B=3
	        >>> N=4
	        >>> q_values=torch.randn(T+1, B, N)
	        >>> v_pred=torch.randn(T+1, B, 1)
	        >>> rewards=torch.randn(T, B)
	        >>> actions=torch.randint(0, N, (T, B))
	        >>> weights=torch.ones(T, B)
	        >>> ratio=torch.rand(T, B, N)
	        >>> q_retraces = compute_q_retraces(q_values, v_pred, rewards, actions, weights, ratio)

	    .. note::
	        q_retrace operation doesn't need to compute gradient, just executes forward computation. The whole
	        recursion therefore runs under ``torch.no_grad()`` and the result never requires grad, even when the
	        inputs do.
	    """
	    T = q_values.size(0) - 1
	    # The targets are forward-only: disabling autograd keeps the in-place slice writes below
	    # from recording a graph when ``q_values`` / ``v_pred`` require grad.
	    with torch.no_grad():
	        rewards = rewards.unsqueeze(-1)  # shape T,B,1
	        actions = actions.unsqueeze(-1)  # shape T,B,1
	        weights = weights.unsqueeze(-1)  # shape T,B,1

	        q_retraces = torch.zeros_like(v_pred)  # shape (T+1),B,1
	        q_retraces[-1] = v_pred[-1]
	        tmp_retraces = v_pred[-1]  # shape B,1

	        # Only the first T steps have an associated action, so gather exactly those.
	        q_gather = q_values[:-1].gather(-1, actions)  # shape T,B,1
	        # Truncation of the importance ratio is loop-invariant; do it once up front.
	        ratio_gather = ratio.gather(-1, actions).clamp(max=1.0)  # shape T,B,1

	        for idx in reversed(range(T)):
	            q_retraces[idx] = rewards[idx] + gamma * weights[idx] * tmp_retraces
	            tmp_retraces = ratio_gather[idx] * (q_retraces[idx] - q_gather[idx]) + v_pred[idx]
	    return q_retraces  # shape (T+1),B,1