|
from typing import Dict
|
|
|
|
import numpy as np
|
|
import torch
|
|
import torch.nn as nn
|
|
|
|
from navsim.agents.transfuser.transfuser_backbone import TransfuserBackbone
|
|
from navsim.agents.transfuser.transfuser_backbone_conv import TransfuserBackboneConv
|
|
from navsim.agents.transfuser.transfuser_backbone_moe import TransfuserBackboneMoe
|
|
from navsim.agents.transfuser.transfuser_backbone_moe_ult32 import TransfuserBackboneMoeUlt32
|
|
from navsim.agents.transfuser.transfuser_backbone_vit import TransfuserBackboneViT
|
|
from navsim.agents.transfuser.transfuser_model import AgentHead
|
|
from navsim.agents.utils.attn import MemoryEffTransformer
|
|
from navsim.agents.utils.nerf import nerf_positional_encoding
|
|
from navsim.agents.vadv2.vadv2_config import Vadv2Config
|
|
|
|
|
|
class Vadv2ModelPDMProgress(nn.Module):
    """Backbone plus agent, BEV-semantic and vocabulary-scoring trajectory heads.

    The backbone produces BEV features from camera and lidar inputs. Those
    features are (a) decoded into agent queries by a transformer decoder,
    (b) turned into a BEV semantic map by a small conv head, and (c) passed,
    together with the encoded ego status, to ``Vadv2HeadPDMProgress``.
    """

    # Values accepted for ``config.backbone_type``.
    _SUPPORTED_BACKBONES = ('vit', 'intern', 'vov', 'resnet', 'eva', 'moe', 'moe_ult32', 'swin')

    def __init__(self, config: "Vadv2Config"):
        """Build the backbone and all heads from ``config``.

        Raises:
            ValueError: if ``config.backbone_type`` is not a supported value.
        """
        super().__init__()

        # Only agent (bounding-box) queries are decoded by ``_tf_decoder``.
        self._query_splits = [
            config.num_bounding_boxes,
        ]

        self._config = config

        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would let an unknown type silently fall through
        # to the default backbone below.
        backbone_type = config.backbone_type
        if backbone_type not in self._SUPPORTED_BACKBONES:
            raise ValueError(
                f"Unsupported backbone_type {backbone_type!r}; "
                f"expected one of {self._SUPPORTED_BACKBONES}"
            )

        if backbone_type in ('vit', 'eva'):
            self._backbone = TransfuserBackboneViT(config)
        elif backbone_type in ('intern', 'vov', 'swin'):
            self._backbone = TransfuserBackboneConv(config)
        elif backbone_type == 'moe':
            self._backbone = TransfuserBackboneMoe(config)
        elif backbone_type == 'moe_ult32':
            self._backbone = TransfuserBackboneMoeUlt32(config)
        else:
            # 'resnet' is the only supported value left at this point.
            self._backbone = TransfuserBackbone(config)

        # One learned embedding per BEV cell; added to the flattened BEV features.
        bev_size = config.lidar_vert_anchors * config.lidar_horz_anchors
        # Channel count of the lidar encoder's feature stage at index 4.
        bev_c = self._backbone.lidar_encoder.feature_info.info[4]['num_chs']

        self._keyval_embedding = nn.Embedding(bev_size, config.tf_d_model)
        self._query_embedding = nn.Embedding(sum(self._query_splits), config.tf_d_model)

        # 1x1 conv projecting backbone BEV channels to the transformer width.
        self._bev_downscale = nn.Conv2d(bev_c, config.tf_d_model, kernel_size=1)

        # Ego status is 8 values per frame (4 + 2 + 2); presumably driving
        # command, velocity and acceleration -- TODO confirm against the
        # feature builder.
        self._status_encoding = nn.Linear((4 + 2 + 2) * config.num_ego_status, config.tf_d_model)

        self._bev_semantic_head = nn.Sequential(
            nn.Conv2d(
                config.bev_features_channels,
                config.bev_features_channels,
                kernel_size=(3, 3),
                stride=1,
                padding=(1, 1),
                bias=True,
            ),
            nn.ReLU(inplace=True),
            nn.Conv2d(
                config.bev_features_channels,
                config.num_bev_classes,
                kernel_size=(1, 1),
                stride=1,
                padding=0,
                bias=True,
            ),
            nn.Upsample(
                size=(config.lidar_resolution_height // 2, config.lidar_resolution_width),
                mode="bilinear",
                align_corners=False,
            ),
        )

        tf_decoder_layer = nn.TransformerDecoderLayer(
            d_model=config.tf_d_model,
            nhead=config.tf_num_head,
            dim_feedforward=config.tf_d_ffn,
            dropout=config.tf_dropout,
            batch_first=True,
        )

        self._tf_decoder = nn.TransformerDecoder(tf_decoder_layer, config.tf_num_layers)
        self._agent_head = AgentHead(
            num_agents=config.num_bounding_boxes,
            d_ffn=config.tf_d_ffn,
            d_model=config.tf_d_model,
        )

        self._trajectory_head = Vadv2HeadPDMProgress(
            num_poses=config.trajectory_sampling.num_poses,
            d_ffn=config.tf_d_ffn,
            d_model=config.tf_d_model,
            nhead=config.vadv2_head_nhead,
            nlayers=config.vadv2_head_nlayers,
            vocab_path=config.vocab_path,
            config=config,
        )

    def forward(self, features: Dict[str, torch.Tensor],
                interpolated_traj=None) -> Dict[str, torch.Tensor]:
        """Run the backbone and all heads.

        Args:
            features: must contain ``"camera_feature"``, ``"lidar_feature"``
                and ``"status_feature"``.
            interpolated_traj: forwarded unchanged to the trajectory head.

        Returns:
            A dict holding ``"bev_semantic_map"`` merged with the dicts
            returned by the trajectory head and the agent head (in that
            order, so later keys would override earlier ones on collision).
        """
        camera_feature: torch.Tensor = features["camera_feature"]
        lidar_feature: torch.Tensor = features["lidar_feature"]
        status_feature: torch.Tensor = features["status_feature"]

        batch_size = status_feature.shape[0]

        bev_feature_upscale, bev_feature, _ = self._backbone(camera_feature, lidar_feature)

        # (B, C, H, W) -> (B, H*W, d_model): one token per BEV cell.
        bev_feature = self._bev_downscale(bev_feature).flatten(-2, -1)
        bev_feature = bev_feature.permute(0, 2, 1)

        if self._config.num_ego_status == 1 and status_feature.shape[1] == 32:
            # A 32-wide status with a single-frame model: keep only the first
            # 8 columns. Presumably the input stacks 4 frames of 8 values --
            # TODO confirm which frame the first 8 columns correspond to.
            status_encoding = self._status_encoding(status_feature[:, :8])
        else:
            status_encoding = self._status_encoding(status_feature)

        # Out-of-place add: same values as the former ``+=`` but does not
        # mutate the projected BEV tensor through an alias.
        keyval = bev_feature + self._keyval_embedding.weight[None, ...]

        query = self._query_embedding.weight[None, ...].repeat(batch_size, 1, 1)
        agents_query = self._tf_decoder(query, keyval)

        bev_semantic_map = self._bev_semantic_head(bev_feature_upscale)

        output: Dict[str, torch.Tensor] = {"bev_semantic_map": bev_semantic_map}

        trajectory = self._trajectory_head(keyval, status_encoding, interpolated_traj)
        output.update(trajectory)

        agents = self._agent_head(agents_query)
        output.update(agents)

        return output
|
|
|
|
|
|
class Vadv2HeadPDMProgress(nn.Module):
    """Scores every trajectory of a fixed vocabulary and selects the best one.

    Each vocabulary trajectory is embedded, cross-attended against the BEV
    tokens, combined with the ego-status encoding and fed to one small MLP
    per metric (``noc``, ``da``, ``ttc``, ``comfort``, ``progress``, ``imi``).
    The per-metric outputs are combined into a single log-score whose argmax
    picks the output trajectory.
    """

    def __init__(self, num_poses: int, d_ffn: int, d_model: int, vocab_path: str,
                 nhead: int, nlayers: int, config: "Vadv2Config" = None):
        """Build the scorer.

        Args:
            num_poses: poses per trajectory; sizes the non-NeRF embedding input.
            d_ffn: hidden width of the MLPs and the decoder feed-forward.
            d_model: token width.
            vocab_path: ``.npy`` file with the trajectory vocabulary, read with
                ``np.load``; ``forward`` expects it to be 3-dimensional.
            nhead: attention heads.
            nlayers: number of decoder layers.
            config: supplies ``inference_imi_weight``, ``inference_da_weight``,
                ``normalize_vocab_pos`` and ``use_nerf``. Required even though
                the signature defaults it to ``None``.

        Raises:
            ValueError: if ``config`` is ``None``.
        """
        super().__init__()
        if config is None:
            # The default exists only for signature compatibility; without a
            # config the attribute reads below cannot succeed.
            raise ValueError("Vadv2HeadPDMProgress requires a non-None `config`")

        self._num_poses = num_poses
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model, nhead, d_ffn,
                dropout=0.0, batch_first=True
            ), nlayers
        )
        # Frozen vocabulary: registered as a parameter so it follows the
        # module across devices and is stored in the state dict.
        self.vocab = nn.Parameter(
            torch.from_numpy(np.load(vocab_path)),
            requires_grad=False
        )

        def _score_mlp() -> nn.Sequential:
            # Two-layer MLP emitting one logit per trajectory token.
            return nn.Sequential(
                nn.Linear(d_model, d_ffn),
                nn.ReLU(),
                nn.Linear(d_ffn, 1),
            )

        # Insertion order is kept identical to preserve parameter/init order.
        self.heads = nn.ModuleDict({
            'noc': _score_mlp(),
            'da': _score_mlp(),
            'ttc': _score_mlp(),
            'comfort': _score_mlp(),
            'progress': _score_mlp(),
            # The imitation head is one hidden layer deeper than the others.
            'imi': nn.Sequential(
                nn.Linear(d_model, d_ffn),
                nn.ReLU(),
                nn.Linear(d_ffn, d_ffn),
                nn.ReLU(),
                nn.Linear(d_ffn, 1),
            ),
        })

        self.inference_imi_weight = config.inference_imi_weight
        self.inference_da_weight = config.inference_da_weight
        self.normalize_vocab_pos = config.normalize_vocab_pos
        if self.normalize_vocab_pos:
            # Optional self-attention pass over the embedded vocabulary.
            self.encoder = MemoryEffTransformer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=d_model * 4,
                dropout=0.0
            )
        self.use_nerf = config.use_nerf

        if self.use_nerf:
            # NOTE(review): 1040 must equal the flattened per-trajectory size
            # of the NeRF-encoded vocabulary built in ``forward``; presumably
            # 40 poses x 26 features -- TODO confirm / derive from num_poses.
            self.pos_embed = nn.Sequential(
                nn.Linear(1040, d_ffn),
                nn.ReLU(),
                nn.Linear(d_ffn, d_model),
            )
        else:
            self.pos_embed = nn.Sequential(
                nn.Linear(num_poses * 3, d_ffn),
                nn.ReLU(),
                nn.Linear(d_ffn, d_model),
            )

    def forward(self, bev_feature, status_encoding, interpolated_traj) -> Dict[str, torch.Tensor]:
        """Score the vocabulary and return per-metric outputs plus the pick.

        Args:
            bev_feature: memory tokens for the decoder; the first dimension is
                the batch size.
            status_encoding: per-sample encoding broadcast over all vocabulary
                tokens.
            interpolated_traj: accepted for interface compatibility; not used.

        Returns:
            Dict with sigmoid outputs for ``noc``/``da``/``ttc``/``comfort``/
            ``progress``, raw logits for ``imi``, the selected ``trajectory``,
            the full ``trajectory_vocab`` and the ``selected_indices``.
        """
        vocab = self.vocab.data
        # Unpacking also enforces that the vocabulary is 3-dimensional.
        num_traj, _, _ = vocab.shape
        batch = bev_feature.shape[0]

        if self.use_nerf:
            # Positional encoding of the first two channels, plus cos/sin of
            # the last channel.
            vocab = torch.cat(
                [
                    nerf_positional_encoding(vocab[..., :2]),
                    torch.cos(vocab[..., -1])[..., None],
                    torch.sin(vocab[..., -1])[..., None],
                ], dim=-1
            )

        # One token per vocabulary trajectory, shared across the batch.
        embedded_vocab = self.pos_embed(vocab.view(num_traj, -1))[None]
        if self.normalize_vocab_pos:
            embedded_vocab = self.encoder(embedded_vocab)
        embedded_vocab = embedded_vocab.repeat(batch, 1, 1)

        tr_out = self.transformer(embedded_vocab, bev_feature)
        dist_status = tr_out + status_encoding.unsqueeze(1)

        result = {}
        for name, head in self.heads.items():
            logits = head(dist_status).squeeze(-1)
            # 'imi' stays as logits (normalised over the vocabulary below);
            # every other head is squashed to (0, 1).
            result[name] = logits if name == 'imi' else logits.sigmoid()

        # log_softmax instead of softmax().log(): mathematically identical,
        # but does not underflow to -inf for low-probability entries, so their
        # relative ranking is preserved.
        scores = (
            self.inference_imi_weight * result['imi'].log_softmax(-1) +
            result['noc'].log() +
            self.inference_da_weight * result['da'].log() +
            (5 * result['ttc'] + 2 * result['comfort'] + 5 * result['progress']).log()
        )
        selected_indices = scores.argmax(1)
        result["trajectory"] = self.vocab.data[selected_indices]
        result["trajectory_vocab"] = self.vocab.data
        result["selected_indices"] = selected_indices
        return result
|
|
|