history blame
56.3 kB
# ------------------------------------------------------------------------
# Copyright (c) 2022 megvii-model. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
# Copyright (c) 2021 Wang, Yue
# ------------------------------------------------------------------------
# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
# Copyright (c) OpenMMLab. All rights reserved.
# ------------------------------------------------------------------------
# Modified by Shihao Wang
# ------------------------------------------------------------------------
import numpy as np
import mmcv
from mmdet.datasets.builder import PIPELINES
import torch
from PIL import Image
from math import factorial
import cv2
import random
import copy
from transformers import AutoTokenizer
import json
import re
import os
from nuscenes.utils.geometry_utils import view_points
from typing import List, Tuple, Union
from shapely.geometry import MultiPoint, Polygon, LineString, Point
from shapely.geometry import box as canvas_box
from ..utils.data_utils import preprocess
from ..utils.constants import DEFAULT_IMAGE_TOKEN
import math
import pickle
def post_process_coords(corner_coords, imsize=(1600, 900)):
    """Clip the convex hull of projected corners to the image canvas.

    Args:
        corner_coords (list): 2D coordinates of the projected box corners.
        imsize (tuple): Canvas extent; ``imsize[0]`` bounds x and
            ``imsize[1]`` bounds y.

    Returns:
        tuple | None: ``(min_x, min_y, max_x, max_y)`` of the part of the
        hull lying on the canvas, or ``None`` when the hull does not touch
        the canvas or the intersection is not a ``Polygon``.
    """
    hull = MultiPoint(corner_coords).convex_hull
    canvas = canvas_box(0, 0, imsize[0], imsize[1])
    if not hull.intersects(canvas):
        return None

    clipped = hull.intersection(canvas)
    if not isinstance(clipped, Polygon):
        return None

    # Compute min_x, min_y, max_x, max_y over the clipped outline.
    outline = np.array(list(clipped.exterior.coords))
    xs, ys = outline[:, 0], outline[:, 1]
    return min(xs), min(ys), max(xs), max(ys)
def analyze_position(x, y, angle_deg):
    """Describe an object's position and heading in words.

    Args:
        x (float): Longitudinal offset; positive is reported as "front",
            negative as "back".
        y (float): Lateral offset; above 2.5 is reported as "left", below
            -2.5 as "right".
        angle_deg (float): Heading in degrees. Values outside
            ``[-180, 180]`` are wrapped into that range first, so that
            e.g. 270 is treated like -90 instead of matching no bucket.

    Returns:
        str: Whitespace-stripped description such as
        ``"front left, same direction as you,"``. Parts that do not apply
        (zero ``x``, small ``|y|``, heading on a bucket boundary) are
        omitted.
    """
    # Only touch out-of-range headings so in-range inputs keep their exact
    # floating-point value (and therefore their exact bucket).
    if not -180 <= angle_deg <= 180:
        angle_deg = (angle_deg + 180) % 360 - 180

    direction = ''
    if x > 0:
        direction += 'front'
    elif x < 0:
        direction += 'back'

    if y > 2.5:
        direction += ' left'
    elif y < -2.5:
        direction += ' right'

    # Each heading bucket is an open 90-degree window around 0, 180, 90, -90.
    if abs(angle_deg) < 45:
        direction += ", same direction as you, "
    elif abs(abs(angle_deg) - 180) < 45:
        direction += ", opposite direction from you, "
    elif abs(angle_deg - 90) < 45:
        direction += ", heading from right to left, "
    elif abs(angle_deg + 90) < 45:
        direction += ", heading from left to right, "

    return direction.strip()
class ResizeMultiview3D:
"""Resize images & bbox & mask.
This transform resizes the input image to some scale. Bboxes and masks are
then resized with the same scale factor. If the input dict contains the key
"scale", then the scale in the input dict is used, otherwise the specified
scale in the init method is used. If the input dict contains the key
"scale_factor" (if MultiScaleFlipAug does not give img_scale but
scale_factor), the actual scale will be computed by image shape and
`img_scale` can either be a tuple (single-scale) or a list of tuple
(multi-scale). There are 3 multiscale modes:
- ``ratio_range is not None``: randomly sample a ratio from the ratio \
range and multiply it with the image scale.
- ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
sample a scale from the multiscale range.
- ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
sample a scale from multiple scales.
img_scale (tuple or list[tuple]): Images scales for resizing.
multiscale_mode (str): Either "range" or "value".
ratio_range (tuple[float]): (min_ratio, max_ratio)
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
bbox_clip_border (bool, optional): Whether to clip the objects outside
the border of the image. In some dataset like MOT17, the gt bboxes
are allowed to cross the border of images. Therefore, we don't
need to clip the gt bboxes in these cases. Defaults to True.
backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
These two backends generate slightly different results. Defaults
to 'cv2'.
override (bool, optional): Whether to override `scale` and
`scale_factor` so that resize can be called twice. If True,
after the first resizing, the existing `scale` and `scale_factor`
will be ignored so the second resizing can be allowed.
This option is a work-around for multiple times of resize in DETR.
Defaults to False.
def __init__(self,
if img_scale is None:
self.img_scale = None
if isinstance(img_scale, list):
self.img_scale = img_scale
self.img_scale = [img_scale]
assert mmcv.is_list_of(self.img_scale, tuple)
if ratio_range is not None:
# mode 1: given a scale and a range of image ratio
assert len(self.img_scale) == 1
# mode 2: given multiple scales or a range of scales
assert multiscale_mode in ['value', 'range']
self.backend = backend
self.multiscale_mode = multiscale_mode
self.ratio_range = ratio_range
self.keep_ratio = keep_ratio
# TODO: refactor the override option in Resize
self.override = override
self.bbox_clip_border = bbox_clip_border
def random_select(img_scales):
    """Randomly select an img_scale from given candidates.

    Args:
        img_scales (list[tuple]): Images scales for selection.

    Returns:
        (tuple, int): Returns a tuple ``(img_scale, scale_idx)``,
        where ``img_scale`` is the selected image scale and
        ``scale_idx`` is the selected index in the given candidates.
    """
    # NOTE(review): ``_random_scale`` calls this as
    # ``self.random_select(...)`` although it takes no ``self``; presumably
    # it is decorated with ``@staticmethod`` -- confirm.
    assert mmcv.is_list_of(img_scales, tuple)
    picked = np.random.randint(len(img_scales))
    return img_scales[picked], picked
def random_sample(img_scales):
    """Randomly sample an img_scale when ``multiscale_mode=='range'``.

    Args:
        img_scales (list[tuple]): Images scale range for sampling.
            There must be two tuples in img_scales, which specify the lower
            and upper bound of image scales.

    Returns:
        (tuple, None): Returns a tuple ``(img_scale, None)``, where
        ``img_scale`` is sampled scale and None is just a placeholder
        to be consistent with :func:`random_select`.
    """
    # NOTE(review): ``_random_scale`` calls this as
    # ``self.random_sample(...)`` although it takes no ``self``; presumably
    # it is decorated with ``@staticmethod`` -- confirm.
    assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
    long_edges = [max(s) for s in img_scales]
    short_edges = [min(s) for s in img_scales]
    # Sample inside [lower, upper] (inclusive). Passing only the upper bound
    # to ``randint`` would sample from 0 and ignore the documented lower
    # bound, allowing degenerate (even zero) edge lengths.
    long_edge = np.random.randint(min(long_edges), max(long_edges) + 1)
    short_edge = np.random.randint(min(short_edges), max(short_edges) + 1)
    img_scale = (long_edge, short_edge)
    return img_scale, None
def random_sample_ratio(img_scale, ratio_range):
    """Randomly sample an img_scale when ``ratio_range`` is specified.

    A ratio is drawn uniformly from ``ratio_range`` and multiplied with
    both entries of ``img_scale``; each product is truncated to ``int``.

    Args:
        img_scale (tuple): Images scale base to multiply with ratio.
        ratio_range (tuple[float]): The minimum and maximum ratio to scale
            the ``img_scale``.

    Returns:
        (tuple, None): Returns a tuple ``(scale, None)``, where
        ``scale`` is the sampled ratio multiplied with ``img_scale`` and
        None is just a placeholder to be consistent with
        :func:`random_select`.
    """
    assert isinstance(img_scale, tuple) and len(img_scale) == 2
    low, high = ratio_range
    assert low <= high
    factor = low + (high - low) * np.random.random_sample()
    scaled = tuple(int(edge * factor) for edge in img_scale)
    return scaled, None
def _random_scale(self, results):
    """Randomly sample an img_scale according to ``ratio_range`` and
    ``multiscale_mode``.

    If ``ratio_range`` is specified, a ratio will be sampled and be
    multiplied with ``img_scale``.
    If multiple scales are specified by ``img_scale``, a scale will be
    sampled according to ``multiscale_mode``.
    Otherwise, single scale will be used.

    Args:
        results (dict): Result dict from :obj:`dataset`.

    Returns:
        None: Two keys, ``'scale'`` and ``'scale_idx'``, are written into
        ``results`` for use by subsequent pipeline steps.

    Raises:
        NotImplementedError: If several scales are configured and
            ``multiscale_mode`` is neither ``'range'`` nor ``'value'``.
    """
    candidates = self.img_scale
    if self.ratio_range is not None:
        picked = self.random_sample_ratio(candidates[0], self.ratio_range)
    elif len(candidates) == 1:
        picked = (candidates[0], 0)
    elif self.multiscale_mode == 'range':
        picked = self.random_sample(candidates)
    elif self.multiscale_mode == 'value':
        picked = self.random_select(candidates)
    else:
        raise NotImplementedError

    scale, scale_idx = picked
    results['scale'] = scale
    results['scale_idx'] = scale_idx
def _resize_img(self, results):
"""Resize images with ``results['scale']``."""
# results['scale'] = (1280, 720)
img_shapes = []
pad_shapes = []
scale_factors = []
keep_ratios = []
new_gt_bboxes = []
new_centers2d = []
for i in range(len(results['img'])):
if self.keep_ratio:
img, scale_factor = mmcv.imrescale(
# the w_scale and h_scale has minor difference
# a real fix should be done in the mmcv.imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results['img'][i].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
img, w_scale, h_scale = mmcv.imresize(
results['img'][i] = img
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
#rescale the camera intrinsic
results['intrinsics'][i][0, 0] *= w_scale
results['intrinsics'][i][0, 2] *= w_scale
results['intrinsics'][i][1, 1] *= h_scale
results['intrinsics'][i][1, 2] *= h_scale
if 'gt_bboxes' in results.keys() and len(results['gt_bboxes']) > 0:
gt_bboxes = results['gt_bboxes'][i]
if len(gt_bboxes) > 0:
gt_bboxes[:, 0] *= w_scale
gt_bboxes[:, 1] *= h_scale
gt_bboxes[:, 2] *= w_scale
gt_bboxes[:, 3] *= h_scale
if 'centers2d' in results.keys() and len(results['centers2d']) > 0:
centers2d = results['centers2d'][i]
if len(gt_bboxes) > 0:
centers2d[:, 0] *= w_scale
centers2d[:, 1] *= h_scale
results['gt_bboxes'] = new_gt_bboxes
results['centers2d'] = new_centers2d
results['img_shape'] = img_shapes
results['pad_shape'] = pad_shapes
results['scale_factor'] = scale_factors
results['keep_ratio'] = keep_ratios
results['lidar2img'] = [results['intrinsics'][i] @ results['extrinsics'][i] for i in range(len(results['extrinsics']))]
def __call__(self, results):
"""Call function to resize images, bounding boxes, masks, semantic
segmentation map.
results (dict): Result dict from loading pipeline.
dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
'keep_ratio' keys are added into result dict.
if 'scale' not in results:
if not self.override:
assert 'scale_factor' not in results, (
'scale and scale_factor cannot be both set.')
if 'scale_factor' in results:
return results
def __repr__(self):
    """Return ``ClassName(...)`` listing the resize configuration.

    The string is closed with ``)``; previously it ended with a dangling
    ``", "`` and an unbalanced parenthesis.
    """
    return (
        f'{self.__class__.__name__}('
        f'img_scale={self.img_scale}, '
        f'multiscale_mode={self.multiscale_mode}, '
        f'ratio_range={self.ratio_range}, '
        f'keep_ratio={self.keep_ratio}, '
        f'bbox_clip_border={self.bbox_clip_border})'
    )
class PadMultiViewImage():
    """Pad the multi-view image.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.
    Added keys are "pad_shape", "pad_fix_size", "pad_size_divisor";
    "img" and "img_shape" are overwritten.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # Exactly one of ``size`` and ``size_divisor`` must be given.
        assert size is not None or size_divisor is not None
        assert size_divisor is None or size is None

    def _pad_img(self, results):
        """Pad every image in ``results['img']`` and record the shapes.

        ``results['img_shape']`` holds the shapes before padding and
        ``results['pad_shape']`` the shapes after padding.
        """
        if self.size is not None:
            padded_img = [
                mmcv.impad(img, shape=self.size, pad_val=self.pad_val)
                for img in results['img']
            ]
        elif self.size_divisor is not None:
            padded_img = [
                mmcv.impad_to_multiple(
                    img, self.size_divisor, pad_val=self.pad_val)
                for img in results['img']
            ]
        results['img_shape'] = [img.shape for img in results['img']]
        results['img'] = padded_img
        results['pad_shape'] = [img.shape for img in padded_img]
        results['pad_fix_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def __call__(self, results):
        """Call function to pad images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        # Without this call the transform returns its input untouched.
        self._pad_img(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'size_divisor={self.size_divisor}, '
        repr_str += f'pad_val={self.pad_val})'
        return repr_str
def format_number(n, decimal_places=1):
    """Format ``n`` with an explicit sign and a fixed number of decimals.

    Args:
        n (float): Value to format.
        decimal_places (int): Number of digits after the decimal point.

    Returns:
        float | str: The float ``0.0`` when ``n`` rounded to
        ``decimal_places`` has magnitude at most ``1e-2``; otherwise a
        string such as ``"+1.2"`` or ``"-3.0"``. Note the two branches
        return different types.
    """
    rounded = round(n, decimal_places)
    if -1e-2 <= rounded <= 1e-2:
        return 0.0
    return f"{n:+.{decimal_places}f}"
class LoadAnnoatationVQA():
def __init__(
ignore_type=["v1", "v2", "v3"],
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer,
self.n_gen = n_gen
self.ignore_type = ignore_type
self.tokenizer.pad_token = self.tokenizer.unk_token
self.base_vqa_path = base_vqa_path
self.base_desc_path = base_desc_path
self.base_conv_path = base_conv_path
self.base_key_path = base_key_path
self.lane_objs_info = pickle.load(open(lane_objs_info, 'rb'))
CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
self.id2cat = {i: name for i, name in enumerate(CLASSES)}
self.side = {
'singapore': 'left',
'boston': 'right',
self.template = [
"What can you tell about the current driving conditions from the images?",
"What can be observed in the panoramic images provided?",
"Can you provide a summary of the current driving scenario based on the input images?",
"What can you observe from the provided images regarding the driving conditions?",
"Please describe the current driving conditions based on the images provided.",
"Can you describe the current weather conditions and the general environment depicted in the images?",
"Please describe the current driving conditions based on the input images.",
"Could you summarize the current driving conditions based on the input images?",
"Please provide an overview of the current driving conditions based on the images.",
"Can you summarize what the panoramic images show?",
"Can you describe the overall conditions and environment based on the images?",
"Could you describe the overall environment and objects captured in the images provided?"
def preprocess_vqa(self, results, traj):
sources = []
if os.path.exists(self.base_key_path+results['sample_idx']+".json"):
with open(self.base_key_path+results['sample_idx']+".json", 'r') as f:
action = json.load(f)
{"from": 'human',
"value": "Please shortly describe your driving action."},
{"from": 'gpt',
"value": action}
if os.path.exists(self.base_desc_path+results['sample_idx']+".json"):
with open(self.base_desc_path+results['sample_idx']+".json", 'r') as f:
desc = json.load(f)
question = random.sample(self.template, 1)[0]
{"from": 'human',
"value": question},
{"from": 'gpt',
"value": desc["description"]}
if os.path.exists(self.base_vqa_path+results['sample_idx']+".json"):
with open(self.base_vqa_path+results['sample_idx']+".json", 'r') as f:
data_qa = json.load(f)
for i, pair in enumerate(data_qa):
{"from": 'human',
"value": pair["question"]},
{"from": 'gpt',
"value": pair["answer"]}
if os.path.exists(self.base_conv_path+results['sample_idx']+".json"):
with open(self.base_conv_path+results['sample_idx']+".json", 'r') as f:
data_qa = json.load(f)
for pair in data_qa:
{"from": 'human',
"value": pair["question"]},
{"from": 'gpt',
"value": pair["answer"]}
return sources
def online_vqa(self, results):
sources = []
gt_bboxes_2d = []
gt_bboxes_3d = copy.deepcopy(results['gt_bboxes_3d'])
gt_bboxes_3d_points = gt_bboxes_3d.corners
gt_bboxes_points = gt_bboxes_3d_points.view(-1, 3)
gt_bboxes_points = np.concatenate((gt_bboxes_points[:, :3], np.ones(gt_bboxes_points.shape[0])[:, None]), axis=1)
if "v1" not in self.ignore_type:
for i, (cam_type, cam_info) in enumerate(results['cam_infos'].items()):
gt_bboxes_points_cam = np.matmul(gt_bboxes_points, results['extrinsics'][i].T)
bboxes = gt_bboxes_points_cam.reshape(-1, 8, 4)
# img = results['img'][i]
for j, box in enumerate(bboxes):
box = box.transpose(1, 0)
in_front = np.argwhere(box[2, :] > 0).flatten()
corners_3d = box[:, in_front]
corner_coords = view_points(corners_3d[:3, :], results['intrinsics'][i], True).T[:, :2].tolist()
final_coords = post_process_coords(corner_coords)
if final_coords is None:
min_x, min_y, max_x, max_y = final_coords
(height, width, _) = results['pad_shape'][0]
min_x = np.clip(min_x, 0, width)
min_y = np.clip(min_y, 0, height)
max_x = np.clip(max_x, 0, width)
max_y = np.clip(max_y, 0, height)
w, h = max_x - min_x, max_y - min_y
inter_w = max(0, min(min_x + w, width) - max(min_x, 0))
inter_h = max(0, min(min_y + h, height) - max(min_y, 0))
area = w * h
if inter_w * inter_h == 0:
if area <= 0 or w < 16 or h < 16:
# cv2.rectangle(img, (int(min_x), int(min_y)), (int(max_x), int(max_y)), (0, 255, 0), 3)
gt_bboxes_2d.append([round(min_x/width, 3), round(min_y/height, 3), round(max_x/width, 3), round(max_y/height, 3), j, cam_type])
# cv2.imwrite(f"img_{cam_type}.jpg", img)
if len(gt_bboxes_2d) >= 1:
selected_objs = random.sample(gt_bboxes_2d, min(self.n_gen, len(gt_bboxes_2d)))
for obj in selected_objs:
answer = self.format_det_answer(obj[4], gt_bboxes_3d, results)
{"from": 'human',
"value": f"Please Identity the object in the <{obj[5]}, {obj[0]}, {obj[1]}, {obj[2]}, {obj[3]}> and describe its 3D information."},
{"from": 'gpt',
"value": f"The object is a {answer}",}
if len(gt_bboxes_3d) >= 1 and "v2" not in self.ignore_type:
centers = torch.FloatTensor(max(self.n_gen, len(gt_bboxes_3d)), 2).uniform_(-50, 50)
bbox_center = gt_bboxes_3d.center[:, :2] + 5 * (torch.rand_like(gt_bboxes_3d.center[:, :2]) * 2 - 1)
centers = torch.cat([bbox_center, centers], dim=0)
indices = torch.randperm(centers.size(0))[:self.n_gen]
centers = centers[indices]
for center in centers:
objs_near = []
for i in range(len(gt_bboxes_3d)):
gt_box = gt_bboxes_3d[i]
dis = torch.norm(gt_box.center[0, :2] - center)
if dis < 10:
objs_near.append(self.format_det_answer(i, gt_bboxes_3d, results))
if len(objs_near) == 0:
answer = f"There are no objects nearby."
answer = "There are the following objects nearby:\n"
answer += '\n'.join(objs_near)
{"from": 'human',
"value": f"What objects are there near the position ({format_number(center[0].item())}, {format_number(center[1].item())})?"},
{"from": 'gpt',
"value": f"{answer}",}
lane_objs = self.lane_objs_info[results['sample_idx']]
if "lane_objects" in lane_objs.keys():
if "v3" not in self.ignore_type:
index_list = [i for i in range(len(lane_objs['all_lane_pts']))]
index_list = random.sample(index_list, min(self.n_gen, len(index_list)))
for idx in index_list:
if idx not in lane_objs['lane_objects'].keys():
{"from": 'human',
"value": f"What objects are there on the lane {self.describe_lane([lane_objs['all_lane_pts'][idx]])}?"},
{"from": 'gpt',
"value": f"There are no objects on this lane.",}
objs = []
for obj in lane_objs['lane_objects'][idx]:
name, bbox, vel = obj
objs.append(self.format_lane_answer(bbox, vel, name))
answer = '\n'.join(objs)
{"from": 'human',
"value": f"What objects are there on the lane {self.describe_lane([lane_objs['all_lane_pts'][idx]])}?"},
{"from": 'gpt',
"value": f"The objects on this lane include:\n{answer}",}
return sources
def describe_lane(self, bezier_lane):
    """Render the points of the first lane in ``bezier_lane`` as text.

    Args:
        bezier_lane (sequence): Only ``bezier_lane[0]`` is read; it is
            iterated as points whose first two entries are formatted.

    Returns:
        str: ``"[(x0, y0), (x1, y1), ...]"`` with every coordinate passed
        through ``format_number``.
    """
    rendered = [
        f"({format_number(point[0])}, {format_number(point[1])})"
        for point in bezier_lane[0]
    ]
    return "[" + ", ".join(rendered) + "]"
def format_lane_answer(self, bbox, vel, name):
    """Build the textual answer describing one object on a lane.

    Args:
        bbox (sequence): Indexed as ``[x, y, z, l, w, h, yaw]``; the yaw
            entry is converted with ``math.degrees``.
        vel (sequence): ``vel[0]`` and ``vel[1]`` are the velocity
            components.
        name (str): Object name placed at the start of the answer.

    Returns:
        str: Description with location, size and heading. The velocity is
        appended only when its magnitude exceeds 0.2; otherwise the
        sentence is closed with a period.
    """
    x, y = bbox[0], bbox[1]
    l, w, h = bbox[3], bbox[4], bbox[5]
    yaw = math.degrees(bbox[6])
    vx, vy = vel[0], vel[1]

    position = analyze_position(x, y, yaw)
    pieces = [
        f"{name} in the {position} ",
        f"location: ({format_number(x)}, {format_number(y)}), ",
        f"length: {l:.1f}, width: {w:.1f}, height: {h:.1f}, ",
        f"angles in degrees: {format_number(yaw)}",
    ]
    if np.sqrt(vx**2 + vy**2) > 0.2:
        pieces.append(f", velocity: ({format_number(vx)}, {format_number(vy)}). ")
    else:
        pieces.append(".")
    return "".join(pieces)
def format_det_answer(self, index, gt_bboxes_3d, results):
    """Build the textual answer describing one ground-truth 3D box.

    Args:
        index (int): Row of ``gt_bboxes_3d.tensor`` to describe.
        gt_bboxes_3d: Box container whose ``tensor`` rows are read at
            columns 0-8 as ``x, y, z, l, w, h, yaw, vx, vy``.
        results (dict): ``results['gt_labels_3d'][index]`` is mapped to a
            category name through ``self.id2cat``.

    Returns:
        str: Description with category, location, size and heading. The
        velocity is appended only when its magnitude exceeds 0.2;
        otherwise the sentence is closed with a period.
    """
    row = gt_bboxes_3d.tensor[index]
    x, y, _z, l, w, h, yaw, vx, vy = [row[col].item() for col in range(9)]
    yaw = math.degrees(yaw)

    position = analyze_position(x, y, yaw)
    pieces = [
        f"{self.id2cat[results['gt_labels_3d'][index]]} in the {position} ",
        f"location: ({format_number(x)}, {format_number(y)}), ",
        f"length: {l:.1f}, width: {w:.1f}, height: {h:.1f}, ",
        f"angles in degrees: {format_number(yaw)}",
    ]
    if np.sqrt(vx**2 + vy**2) > 0.2:
        pieces.append(f", velocity: ({format_number(vx)}, {format_number(vy)}). ")
    else:
        pieces.append(".")
    return "".join(pieces)
def __call__(self, results):
    """Assemble the VQA conversation for one sample and tokenize it.

    Conversation pairs come from ``preprocess_vqa`` followed by
    ``online_vqa``. When ``results`` has ``'gt_planning'`` and exactly six
    valid waypoints, a planning-trajectory pair is placed first. The first
    human turn is prefixed with the image token and a location prompt.

    Args:
        results (dict): Sample dict; reads ``'location'`` and, when
            present, ``'gt_planning'`` / ``'gt_planning_mask'``.

    Returns:
        dict: ``results`` with ``'input_ids'`` and ``'vlm_labels'`` set.
    """
    traj = None
    if 'gt_planning' in results.keys():
        valid = results['gt_planning_mask'][0].any(axis=1)
        planning_traj = results['gt_planning'][0, :, :2][valid]
        if len(planning_traj) == 6:
            formatted_points = ', '.join(
                f"({format_number(point[0], 2)}, {format_number(point[1], 2)})"
                for point in planning_traj
            )
            traj = f"Here is the planning trajectory [PT, {formatted_points}]."

    sources = self.preprocess_vqa(results, traj)
    prompt = f"You are driving in {results['location']}. "
    sources += self.online_vqa(results)

    if 'gt_planning' in results.keys() and len(planning_traj) == 6:
        planning_pair = [
            {"from": 'human',
             "value": "Please provide the planning trajectory for the ego car without reasons."},
            {"from": 'gpt',
             "value": traj},
        ]
        sources = [planning_pair] + sources

    # Flatten the list of (human, gpt) pairs into one conversation.
    vqa_anno = []
    for pair in sources:
        vqa_anno.extend(pair)
    opening = vqa_anno[0]
    opening['value'] = DEFAULT_IMAGE_TOKEN + '\n' + prompt + opening['value']

    vqa_converted = preprocess([vqa_anno], self.tokenizer, True)
    results['input_ids'] = vqa_converted['input_ids'][0]
    results['vlm_labels'] = vqa_converted['labels'][0]
    return results
def __repr__(self):
    """Return the class name only."""
    return self.__class__.__name__
class LoadAnnoatationVQATest():
def __init__(
load_type=["conv", "planning", "counter"],
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer,
self.tokenizer.pad_token = self.tokenizer.unk_token
self.base_conv_path = base_conv_path
self.base_vqa_path = base_vqa_path
self.base_counter_path = base_counter_path
self.load_type = load_type
self.side = {
'singapore': 'left',
'boston': 'right',
self.template = [
"What can you tell about the current driving conditions from the images?",
"What can be observed in the panoramic images provided?",
"Can you provide a summary of the current driving scenario based on the input images?",
"What can you observe from the provided images regarding the driving conditions?",
"Please describe the current driving conditions based on the images provided.",
"Can you describe the current weather conditions and the general environment depicted in the images?",
"Please describe the current driving conditions based on the input images.",
"Could you summarize the current driving conditions based on the input images?",
"Please provide an overview of the current driving conditions based on the images.",
"Can you summarize what the panoramic images show?",
"Can you describe the overall conditions and environment based on the images?",
"Could you describe the overall environment and objects captured in the images provided?"
def preprocess_vqa(self, results):
sources = []
if "planning" in self.load_type: # planning trajs
{"from": 'human',
"value": "Please provide the planning trajectory for the ego car without reasons."},
{"from": 'gpt',
"value": ""}
if "short" in self.load_type: # short driving action
{"from": 'human',
"value": "Please shortly describe your driving action."},
{"from": 'gpt',
"value": ""}
if "conv" in self.load_type: # conversation
question = random.sample(self.template, 1)[0] # detailed description
{"from": 'human',
"value": question},
{"from": 'gpt',
"value": ""}
if os.path.exists(self.base_conv_path+results['sample_idx']+".json"):
with open(self.base_conv_path+results['sample_idx']+".json", 'r') as f:
data_qa = json.load(f)
for pair in data_qa:
{"from": 'human',
"value": pair["question"]},
{"from": 'gpt',
"value": ""}
if os.path.exists(self.base_vqa_path+results['sample_idx']+".json"): # attention + action + counter * 2
with open(self.base_vqa_path+results['sample_idx']+".json", 'r') as f:
data_qa = json.load(f)
for pair in data_qa:
{"from": 'human',
"value": pair["question"]},
{"from": 'gpt',
"value": ""}
if "counter" in self.load_type:
all_counters = pickle.load(open(os.path.join(self.base_counter_path + results['sample_idx']+'.pkl'), 'rb'))
for data in all_counters:
{"from": 'human',
"value": f"If you follow the trajectory {data['traj']}, what would happen?"},
{"from": 'gpt',
"value": ""}
return sources
def __call__(self, results):
    """Build test-time prompts, one conversation per question.

    The raw questions are kept as ``results['vlm_labels']``. Each human
    turn is then prefixed with the image token and a location prompt, the
    paired answer is blanked, and all conversations are tokenized.

    Args:
        results (dict): Sample dict; reads ``'location'``.

    Returns:
        dict: ``results`` with ``'input_ids'`` and ``'vlm_labels'`` set.
    """
    sources = self.preprocess_vqa(results)
    prompt = f"You are driving in {results['location']}. "

    # Snapshot the questions before they are decorated below.
    vlm_labels = [anno[0]['value'] for anno in sources]

    prefix = DEFAULT_IMAGE_TOKEN + '\n' + prompt
    for anno in sources:
        anno[0]['value'] = prefix + anno[0]['value']
        anno[1]['value'] = ''

    vqa_converted = preprocess(sources, self.tokenizer, True, False)
    results['input_ids'] = vqa_converted['input_ids']
    results['vlm_labels'] = vlm_labels
    return results
def __repr__(self):
    """Return the class name only."""
    return self.__class__.__name__
class NormalizeMultiviewImage(object):
    """Normalize the image.

    Added key is "img_norm_cfg".

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        # Both statistics are stored as float32 arrays.
        self.to_rgb = to_rgb
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def __call__(self, results):
        """Call function to normalize images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        normalized = []
        for img in results['img']:
            normalized.append(
                mmcv.imnormalize(img, self.mean, self.std, self.to_rgb))
        results['img'] = normalized
        results['img_norm_cfg'] = {
            'mean': self.mean,
            'std': self.std,
            'to_rgb': self.to_rgb,
        }
        return results

    def __repr__(self):
        return (
            f'{self.__class__.__name__}'
            f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
        )
class ResizeCropFlipRotImage():
def __init__(self, data_aug_conf=None, with_2d=True, filter_invisible=True, training=True):
    """Store the augmentation configuration.

    Args:
        data_aug_conf (dict, optional): Augmentation settings read by the
            other methods of this class.
        with_2d (bool): Stored as ``self.with_2d``.
        filter_invisible (bool): Stored as ``self.filter_invisible``.
        training (bool): Stored as ``self.training``.
    """
    # Smallest box width/height kept by ``_bboxes_transform``.
    self.min_size = 2.0
    self.data_aug_conf = data_aug_conf
    self.with_2d = with_2d
    self.filter_invisible = filter_invisible
    self.training = training
def __call__(self, results):
imgs = results['img']
N = len(imgs)
new_imgs = []
new_gt_bboxes = []
new_centers2d = []
new_gt_labels = []
new_depths = []
assert self.data_aug_conf['rot_lim'] == (0.0, 0.0), "Rotation is not currently supported"
resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
for i in range(N):
img = Image.fromarray(np.uint8(imgs[i]))
img, ida_mat = self._img_transform(
if self.training and self.with_2d: # sync_2d bbox labels
gt_bboxes = results['gt_bboxes'][i]
centers2d = results['centers2d'][i]
gt_labels = results['gt_labels'][i]
depths = results['depths'][i]
if len(gt_bboxes) != 0:
gt_bboxes, centers2d, gt_labels, depths = self._bboxes_transform(
if len(gt_bboxes) != 0 and self.filter_invisible:
gt_bboxes, centers2d, gt_labels, depths = self._filter_invisible(gt_bboxes, centers2d, gt_labels, depths)
results['intrinsics'][i][:3, :3] = ida_mat @ results['intrinsics'][i][:3, :3]
results['gt_bboxes'] = new_gt_bboxes
results['centers2d'] = new_centers2d
results['gt_labels'] = new_gt_labels
results['depths'] = new_depths
results['img'] = new_imgs
results['lidar2img'] = [results['intrinsics'][i] @ results['extrinsics'][i] for i in range(len(results['extrinsics']))]
return results
def _bboxes_transform(self, bboxes, centers2d, gt_labels, depths,resize, crop, flip):
    """Apply the image resize / crop / flip to 2D boxes and centers.

    Args:
        bboxes (np.ndarray): Boxes indexed as ``[x1, y1, x2, y2]`` columns.
        centers2d (np.ndarray): Centers indexed as ``[x, y]`` columns.
        gt_labels (np.ndarray): One label per box.
        depths (np.ndarray): One depth per box.
        resize (float): Scale factor multiplied onto all coordinates.
        crop (tuple): ``crop[0]`` / ``crop[1]`` are subtracted from the
            x / y coordinates.
        flip (bool): Whether to mirror x coordinates about the final width.

    Returns:
        tuple: ``(bboxes, centers2d, gt_labels, depths)`` restricted to
        boxes whose clipped width and height are both at least
        ``self.min_size``.
    """
    assert len(bboxes) == len(centers2d) == len(gt_labels) == len(depths)
    fH, fW = self.data_aug_conf["final_dim"]
    x_cols, y_cols = [0, 2], [1, 3]

    # Scale, shift by the crop origin, then clamp to the final image.
    bboxes = bboxes * resize
    bboxes[:, x_cols] = np.clip(bboxes[:, x_cols] - crop[0], 0, fW)
    bboxes[:, y_cols] = np.clip(bboxes[:, y_cols] - crop[1], 0, fH)

    widths = bboxes[:, 2] - bboxes[:, 0]
    heights = bboxes[:, 3] - bboxes[:, 1]
    keep = (widths >= self.min_size) & (heights >= self.min_size)

    if flip:
        # Mirroring swaps the roles of the left and right edges.
        bboxes[:, x_cols] = fW - bboxes[:, [2, 0]]
    bboxes = bboxes[keep]

    centers2d = centers2d * resize
    centers2d[:, 0] = np.clip(centers2d[:, 0] - crop[0], 0, fW)
    centers2d[:, 1] = np.clip(centers2d[:, 1] - crop[1], 0, fH)
    if flip:
        centers2d[:, 0] = fW - centers2d[:, 0]

    return bboxes, centers2d[keep], gt_labels[keep], depths[keep]
def _filter_invisible(self, bboxes, centers2d, gt_labels, depths):
    """Drop 2D boxes that are completely covered by nearer boxes.

    Boxes are painted onto an index map from farthest to nearest, so
    nearer boxes overwrite farther ones; a box survives only if at least
    one pixel of the map still carries its index.

    Args:
        bboxes (np.ndarray): Boxes indexed as ``[x1, y1, x2, y2]`` columns.
        centers2d (np.ndarray): One center per box.
        gt_labels (np.ndarray): One label per box.
        depths (np.ndarray): One depth per box.

    Returns:
        tuple: ``(bboxes, centers2d, gt_labels, depths)`` of the visible
        boxes, ordered from farthest to nearest.
    """
    # filter invisible 2d bboxes
    assert len(bboxes) == len(centers2d) == len(gt_labels) == len(depths)
    fH, fW = self.data_aug_conf["final_dim"]

    # -1 marks background. A zero-filled map would make every uncovered
    # pixel look like box 0, so the farthest box could never be filtered.
    indices_maps = np.full((fH, fW), -1, dtype=np.int64)

    # Shrink each box to the integer pixels fully inside it.
    tmp_bboxes = np.zeros_like(bboxes)
    tmp_bboxes[:, :2] = np.ceil(bboxes[:, :2])
    tmp_bboxes[:, 2:] = np.floor(bboxes[:, 2:])
    tmp_bboxes = tmp_bboxes.astype(np.int64)

    # Farthest first, so that nearer boxes are painted last.
    sort_idx = np.argsort(-depths, axis=0, kind='stable')
    tmp_bboxes = tmp_bboxes[sort_idx]
    bboxes = bboxes[sort_idx]
    depths = depths[sort_idx]
    centers2d = centers2d[sort_idx]
    gt_labels = gt_labels[sort_idx]

    for i in range(bboxes.shape[0]):
        u1, v1, u2, v2 = tmp_bboxes[i]
        indices_maps[v1:v2, u1:u2] = i

    indices_res = np.unique(indices_maps)
    indices_res = indices_res[indices_res >= 0]
    bboxes = bboxes[indices_res]
    depths = depths[indices_res]
    centers2d = centers2d[indices_res]
    gt_labels = gt_labels[indices_res]
    return bboxes, centers2d, gt_labels, depths
def _get_rot(self, h):
    """Return the 2x2 matrix ``[[cos h, sin h], [-sin h, cos h]]``.

    Args:
        h (float): Angle passed to ``np.cos`` / ``np.sin`` (radians).

    Returns:
        torch.Tensor: The 2x2 matrix.
    """
    cos_h, sin_h = np.cos(h), np.sin(h)
    return torch.Tensor([
        [cos_h, sin_h],
        [-sin_h, cos_h],
    ])
def _img_transform(self, img, resize, resize_dims, crop, flip, rotate):
    """Resize, crop, optionally flip and rotate an image.

    Alongside the image, the equivalent 3x3 pixel-space transform is
    accumulated in the same order as the image operations.

    Args:
        img: Image object providing ``resize``, ``crop``, ``transpose``
            and ``rotate``.
        resize (float): Scale factor recorded in the matrix.
        resize_dims (tuple): Target size passed to ``img.resize``.
        crop (tuple): Crop box ``(x1, y1, x2, y2)`` passed to ``img.crop``.
        flip (bool): Whether to mirror the image left-right.
        rotate (float): Rotation in degrees passed to ``img.rotate``.

    Returns:
        tuple: ``(img, ida_mat)`` with ``ida_mat`` a 3x3 ``torch.Tensor``.
    """
    # adjust image
    img = img.resize(resize_dims).crop(crop)
    if flip:
        img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
    img = img.rotate(rotate)

    # post-homography transformation
    linear = torch.eye(2)
    linear *= resize
    offset = torch.zeros(2)
    offset -= torch.Tensor(crop[:2])

    crop_w = crop[2] - crop[0]
    crop_h = crop[3] - crop[1]
    if flip:
        mirror = torch.Tensor([[-1, 0], [0, 1]])
        shift = torch.Tensor([crop_w, 0])
        linear = mirror.matmul(linear)
        offset = mirror.matmul(offset) + shift

    # Rotation is taken about the center of the cropped image.
    rot = self._get_rot(rotate / 180 * np.pi)
    center = torch.Tensor([crop_w, crop_h]) / 2
    center = rot.matmul(-center) + center
    linear = rot.matmul(linear)
    offset = rot.matmul(offset) + center

    ida_mat = torch.eye(3)
    ida_mat[:2, :2] = linear
    ida_mat[:2, 2] = offset
    return img, ida_mat
def _sample_augmentation(self):
    """Pick resize / crop / flip / rotate parameters for one sample.

    In training mode the parameters are drawn at random from the limits in
    ``self.data_aug_conf``; otherwise a fixed resize with a horizontally
    centered crop is used and neither flip nor rotation is applied.

    Returns:
        tuple: ``(resize, resize_dims, crop, flip, rotate)`` where
        ``resize_dims`` is ``(width, height)`` and ``crop`` is
        ``(x1, y1, x2, y2)``.
    """
    conf = self.data_aug_conf
    H, W = conf["H"], conf["W"]
    fH, fW = conf["final_dim"]

    if self.training:
        resize = np.random.uniform(*conf["resize_lim"])
    else:
        resize = max(fH / H, fW / W)
    resize_dims = (int(W * resize), int(H * resize))
    newW, newH = resize_dims
    spare_w = max(0, newW - fW)

    # The random draws below keep the order: bottom pct, crop x, flip, rotate.
    if self.training:
        bot_pct = np.random.uniform(*conf["bot_pct_lim"])
        crop_w = int(np.random.uniform(0, spare_w))
        flip = bool(conf["rand_flip"] and np.random.choice([0, 1]))
        rotate = np.random.uniform(*conf["rot_lim"])
    else:
        bot_pct = np.mean(conf["bot_pct_lim"])
        crop_w = int(spare_w / 2)
        flip = False
        rotate = 0

    crop_h = int((1 - bot_pct) * newH) - fH
    crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
    return resize, resize_dims, crop, flip, rotate
class GlobalRotScaleTransImage():
def __init__(
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
self.rot_range = rot_range
self.scale_ratio_range = scale_ratio_range
self.translation_std = translation_std
self.reverse_angle = reverse_angle
self.training = training
def __call__(self, results):
# Draw one random rotation angle, scale ratio and translation vector, then
# apply each of them to ``results`` through the helper methods below.
# Returns the same ``results`` dict.
# random rotate
translation_std = np.array(self.translation_std, dtype=np.float32)
rot_angle = np.random.uniform(*self.rot_range)
scale_ratio = np.random.uniform(*self.scale_ratio_range)
trans = np.random.normal(scale=translation_std, size=3).T
self._rotate_bev_along_z(results, rot_angle)
if self.reverse_angle:
# NOTE(review): the negated angle is not read again in the lines visible
# here -- confirm whether a statement consuming it is missing.
rot_angle = rot_angle * -1
# random scale
self._scale_xyz(results, scale_ratio)
# random translate
self._trans_xyz(results, trans)
return results
def _trans_xyz(self, results, trans):
trans_mat = torch.eye(4, 4)
trans_mat[:3, -1] = torch.from_numpy(trans).reshape(1, 3)
trans_mat_inv = torch.inverse(trans_mat)
num_view = len(results["lidar2img"])
results['ego_pose'] = (torch.tensor(results["ego_pose"]).float() @ trans_mat_inv).numpy()
results['ego_pose_inv'] = (trans_mat.float() @ torch.tensor(results["ego_pose_inv"])).numpy()
for view in range(num_view):
results["lidar2img"][view] = (torch.tensor(results["lidar2img"][view]).float() @ trans_mat_inv).numpy()
def _rotate_bev_along_z(self, results, angle):
rot_cos = torch.cos(torch.tensor(angle))
rot_sin = torch.sin(torch.tensor(angle))
rot_mat = torch.tensor([[rot_cos, rot_sin, 0, 0], [-rot_sin, rot_cos, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
rot_mat_inv = torch.inverse(rot_mat)
results['ego_pose'] = (torch.tensor(results["ego_pose"]).float() @ rot_mat_inv).numpy()
results['ego_pose_inv'] = (rot_mat.float() @ torch.tensor(results["ego_pose_inv"])).numpy()
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = (torch.tensor(results["lidar2img"][view]).float() @ rot_mat_inv).numpy()
def _scale_xyz(self, results, scale_ratio):
scale_mat = torch.tensor(
[scale_ratio, 0, 0, 0],
[0, scale_ratio, 0, 0],
[0, 0, scale_ratio, 0],
[0, 0, 0, 1],
scale_mat_inv = torch.inverse(scale_mat)
results['ego_pose'] = (torch.tensor(results["ego_pose"]).float() @ scale_mat_inv).numpy()
results['ego_pose_inv'] = (scale_mat @ torch.tensor(results["ego_pose_inv"]).float()).numpy()
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = (torch.tensor(results["lidar2img"][view]).float() @ scale_mat_inv).numpy()
class CustomPadMultiViewImage:
    """Pad every view of a multi-view sample to one common shape.

    All images in ``results['img']`` are first padded to the largest
    height and width found among them.  If ``size_divisor`` is given, each
    image is then padded further so both sides are multiples of it.

    Args:
        size_divisor (int, optional): Required divisor of the padded
            height and width.  ``None`` skips this step.
        pad_val (float, optional): Fill value handed to the mmcv padding
            helpers.
    """

    def __init__(self, size_divisor=None, pad_val=0):
        self.size_divisor = size_divisor
        self.pad_val = pad_val

    def __call__(self, results):
        """Pad ``results['img']`` and record the padding metadata.

        Args:
            results (dict): Must provide ``'img'``, a non-empty sequence of
                image arrays.

        Returns:
            dict: ``results`` with ``'img'`` replaced by the padded images
            and ``'pad_shape'``, ``'pad_fixed_size'`` and
            ``'pad_size_divisor'`` set.
        """
        imgs = results['img']
        max_h = max(img.shape[0] for img in imgs)
        max_w = max(img.shape[1] for img in imgs)

        padded_img = [
            mmcv.impad(img, shape=(max_h, max_w), pad_val=self.pad_val)
            for img in imgs
        ]
        if self.size_divisor is not None:
            padded_img = [
                mmcv.impad_to_multiple(
                    img, self.size_divisor, pad_val=self.pad_val)
                for img in padded_img
            ]

        results['img'] = padded_img
        results['pad_shape'] = [img.shape for img in padded_img]
        results['pad_fixed_size'] = None
        results['pad_size_divisor'] = self.size_divisor
        return results

    def __repr__(self):
        # The opening parenthesis was previously never emitted, which left
        # the representation unbalanced.
        repr_str = self.__class__.__name__
        repr_str += f'(size_divisor={self.size_divisor}, '
        repr_str += f'pad_val={self.pad_val})'
        return repr_str
class CustomParameterizeLane:
    """Convert lane centerlines into a fixed-size parametric form.

    Args:
        method (str): Name of the method on this class that performs the
            conversion (for example ``'bezier_Endpointfixed'``).  It is
            called as ``method(centerlines, n_control)``.
        n_control (int): Number of Bezier control points per lane.
    """

    def __init__(self, method, n_control):
        self.method = method
        self.n_control = n_control

    def __call__(self, results):
        """Parameterize ``results['ann_info']['lane_pts']``.

        Args:
            results (dict): Must provide ``results['ann_info']['lane_pts']``.

        Returns:
            dict: ``results`` with the parameterized lanes stored under
            ``'lane_pts'``.
        """
        centerlines = results['ann_info']['lane_pts']
        para_centerlines = getattr(self, self.method)(centerlines, self.n_control)
        results['lane_pts'] = para_centerlines
        return results

    def comb(self, n, k):
        """Return the binomial coefficient ``n`` choose ``k``."""
        return factorial(n) // (factorial(k) * factorial(n - k))

    def _bernstein_matrix(self, n_points, n_control):
        """Build the ``(n_points, n_control)`` Bernstein basis matrix.

        Row ``i`` holds the basis polynomials of degree ``n_control - 1``
        evaluated at ``t_i = i / (n_points - 1)``, i.e. the points are
        assumed to be spread uniformly over the curve parameter.
        """
        degree = n_control - 1
        t = (np.arange(n_points) / (n_points - 1)).reshape(-1, 1)
        j = np.arange(n_control)
        binom = np.array(
            [self.comb(degree, k) for k in range(n_control)], dtype=np.float64)
        return binom * np.power(1 - t, degree - j) * np.power(t, j)

    def fit_bezier(self, points, n_control):
        """Least-squares fit of a Bezier curve to ``points``.

        Returns:
            tuple: The full return value of ``np.linalg.lstsq``; the fitted
            control points are its first element.
        """
        A = self._bernstein_matrix(len(points), n_control)
        conts = np.linalg.lstsq(A, points, rcond=None)
        return conts

    def fit_bezier_Endpointfixed(self, points, n_control):
        """Fit a Bezier curve whose end control points equal the end points.

        The first and last control points are pinned to ``points[0]`` and
        ``points[-1]``; only the interior control points are solved for,
        using the interior samples.

        Args:
            points (np.ndarray): ``(n_points, dim)`` samples of one lane.
            n_control (int): Number of control points.

        Returns:
            np.ndarray: ``(n_control, dim)`` control points.
        """
        points = np.asarray(points)
        A = self._bernstein_matrix(len(points), n_control)

        # Move the contribution of the two pinned control points to the
        # right-hand side, leaving a system in the interior ones only.
        A_BE = A[1:-1, 1:-1]
        _points = points[1:-1]
        _points = (
            _points
            - A[1:-1, 0].reshape(-1, 1) @ points[0].reshape(1, -1)
            - A[1:-1, -1].reshape(-1, 1) @ points[-1].reshape(1, -1)
        )
        conts = np.linalg.lstsq(A_BE, _points, rcond=None)

        control_points = np.zeros((n_control, points.shape[1]))
        control_points[0] = points[0]
        control_points[-1] = points[-1]
        control_points[1:-1] = conts[0]
        return control_points

    def bezier_Endpointfixed(self, input_data, n_control=4):
        """Parameterize every centerline with an endpoint-fixed Bezier fit.

        Args:
            input_data (iterable): Centerlines, each an ``(n_points, dim)``
                array.
            n_control (int): Number of control points per lane.

        Returns:
            np.ndarray: float32 array with one row per lane holding the
            flattened control points.
        """
        # Each lane's coefficients are collected here; previously they were
        # computed and then dropped, so the result was always empty.
        coeffs_list = [
            self.fit_bezier_Endpointfixed(centerline, n_control).flatten()
            for centerline in input_data
        ]
        return np.array(coeffs_list, dtype=np.float32)
class PhotoMetricDistortionMultiViewImage:
    """Apply photometric distortion to every view of a sample.

    Adapted from https://github.com/fundamentalvision/BEVFormer/blob/master/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py#L99.

    Apply photometric distortion to image sequentially, every transformation
    is applied with a probability of 0.5. The position of random contrast is in
    second or second to last.

    1. random brightness
    2. random contrast (mode 1)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 0)
    8. randomly swap channels

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.

    NOTE(review): the defaults of ``brightness_delta`` (32) and
    ``hue_delta`` (18) follow the mmdet ``PhotoMetricDistortion`` transform
    this class derives from -- confirm against the project configs.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def __call__(self, results):
        """Call function to perform photometric distortion on images.

        Args:
            results (dict): Result dict from loading pipeline.  Every array
                in ``results['img']`` must be ``np.float32``.

        Returns:
            dict: Result dict with images distorted.
        """
        imgs = results['img']
        new_imgs = []
        for img in imgs:
            assert img.dtype == np.float32, \
                'PhotoMetricDistortion needs the input image of dtype np.float32,'\
                ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

            # random brightness
            # NOTE: this draw uses the stdlib ``random`` module while every
            # other draw uses ``np.random``; both generators must be seeded
            # for reproducible results.
            if np.random.randint(2):
                delta = random.uniform(-self.brightness_delta,
                                       self.brightness_delta)
                img += delta

            # mode == 1 --> do random contrast first
            # mode == 0 --> do random contrast last
            mode = np.random.randint(2)
            if mode == 1:
                if np.random.randint(2):
                    alpha = np.random.uniform(self.contrast_lower,
                                              self.contrast_upper)
                    img *= alpha

            # convert color from BGR to HSV
            img = mmcv.bgr2hsv(img)

            # random saturation
            if np.random.randint(2):
                img[..., 1] *= np.random.uniform(self.saturation_lower,
                                                 self.saturation_upper)

            # random hue
            if np.random.randint(2):
                img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta)
                # wrap the hue channel back into [0, 360]
                img[..., 0][img[..., 0] > 360] -= 360
                img[..., 0][img[..., 0] < 0] += 360

            # convert color from HSV to BGR
            img = mmcv.hsv2bgr(img)

            # random contrast
            if mode == 0:
                if np.random.randint(2):
                    alpha = np.random.uniform(self.contrast_lower,
                                              self.contrast_upper)
                    img *= alpha

            # randomly swap channels
            if np.random.randint(2):
                img = img[..., np.random.permutation(3)]

            # Collect the distorted view; without this the output image
            # list would be empty.
            new_imgs.append(img)

        results['img'] = new_imgs
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
        repr_str += 'contrast_range='
        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
        repr_str += 'saturation_range='
        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
        repr_str += f'hue_delta={self.hue_delta})'
        return repr_str