{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "a31f3377", "metadata": {}, "outputs": [], "source": [ "import json\n", "import numpy as np\n", "import tensorflow_datasets as tfds\n", "from tqdm import tqdm\n", "from PIL import Image\n", "import matplotlib.pyplot as plt\n", "import mediapy\n", "from scipy.spatial.transform import Rotation as R" ] }, { "cell_type": "code", "execution_count": null, "id": "30d1aaaf", "metadata": {}, "outputs": [], "source": [ "# Load the DROID subset dataset\n", "ds = tfds.load(\"droid_100\", data_dir=\"gs://gresearch/robotics\", split=\"train\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "ef404b22", "metadata": {}, "outputs": [], "source": [ "path_to_droid_repo = \".\" # TODO: Replace with the path to your DROID repository\n", "\n", "# Load the extrinsics\n", "cam2base_extrinsics_path = f\"{path_to_droid_repo}/cam2base_extrinsics.json\"\n", "with open(cam2base_extrinsics_path, \"r\") as f:\n", " cam2base_extrinsics = json.load(f)\n", "\n", "# Load the intrinsics\n", "intrinsics_path = f\"{path_to_droid_repo}/intrinsics.json\"\n", "with open(intrinsics_path, \"r\") as f:\n", " intrinsics = json.load(f)\n", "\n", "# Load mapping from episode ID to path, then invert\n", "episode_id_to_path_path = f\"{path_to_droid_repo}/episode_id_to_path.json\"\n", "with open(episode_id_to_path_path, \"r\") as f:\n", " episode_id_to_path = json.load(f)\n", "episode_path_to_id = {v: k for k, v in episode_id_to_path.items()}\n", "\n", "# Load camera serials\n", "camera_serials_path = f\"{path_to_droid_repo}/camera_serials.json\"\n", "with open(camera_serials_path, \"r\") as f:\n", " camera_serials = json.load(f)" ] }, { "cell_type": "code", "execution_count": 4, "id": "a1b27a95", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 25%|██▌ | 25/100 [00:02<00:06, 10.73it/s]\n" ] } ], "source": [ "# Iterate through the dataset to find the first episode that has a cam2base extrinsics 
entry\n", "eps = []\n", "for ep in tqdm(ds):\n", " file_path = ep[\"episode_metadata\"][\"file_path\"].numpy().decode(\"utf-8\")\n", " recording_folderpath = ep[\"episode_metadata\"][\"recording_folderpath\"].numpy().decode(\"utf-8\")\n", "\n", " episode_path = file_path.split(\"r2d2-data-full/\")[1].split(\"/trajectory\")[0]\n", " if episode_path not in episode_path_to_id:\n", " continue\n", " episode_id = episode_path_to_id[episode_path]\n", " \n", " if episode_id in cam2base_extrinsics:\n", " eps.append(ep)\n", " \n", " if len(eps) >= 10:\n", " break" ] }, { "cell_type": "code", "execution_count": 5, "id": "480b1485", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Camera with calibration data: ext1_cam_serial --> exterior_image_1_left\n" ] } ], "source": [ "# Iterate through the extrinsics to find the key that is a digit\n", "# This is the camera serial number, and the corresponding value is the extrinsics\n", "for k, v in cam2base_extrinsics[episode_id].items():\n", " if k.isdigit():\n", " camera_serial = k\n", " extracted_extrinsics = v\n", " break\n", "\n", "# The camera serial also lets us look up the intrinsics\n", "extracted_intrinsics = intrinsics[episode_id][camera_serial]\n", "\n", "# Using the camera serial, find the corresponding camera name (which is used to determine\n", "# which image stream in the episode to use)\n", "camera_serials_to_name = {v: k for k, v in camera_serials[episode_id].items()}\n", "calib_camera_name = camera_serials_to_name[camera_serial]\n", "\n", "if calib_camera_name == \"ext1_cam_serial\":\n", " calib_image_name = \"exterior_image_1_left\"\n", "elif calib_camera_name == \"ext2_cam_serial\":\n", " calib_image_name = \"exterior_image_2_left\"\n", "else:\n", " raise ValueError(f\"Unknown camera name: {calib_camera_name}\")\n", "\n", "print(f\"Camera with calibration data: {calib_camera_name} --> {calib_image_name}\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "51d0468e", "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "[[-0.79138441 -0.21481151 0.57233445 0.27362273]\n", " [-0.61099013 0.30863306 -0.72899705 0.45935 ]\n", " [-0.02004438 -0.9266076 -0.37549511 0.50548118]\n", " [ 0. 0. 0. 1. ]]\n" ] } ], "source": [ "# Convert the extrinsics to a homogeneous transformation matrix\n", "pos = extracted_extrinsics[0:3] # translation\n", "rot_mat = R.from_euler(\"xyz\", extracted_extrinsics[3:6]).as_matrix() # rotation\n", "\n", "# Make homogeneous transformation matrix\n", "cam_to_base_extrinsics_matrix = np.eye(4)\n", "cam_to_base_extrinsics_matrix[:3, :3] = rot_mat\n", "cam_to_base_extrinsics_matrix[:3, 3] = pos\n", "\n", "print(cam_to_base_extrinsics_matrix)" ] }, { "cell_type": "code", "execution_count": 7, "id": "bb7545de", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[522.96893311 0. 634.02514648]\n", " [ 0. 522.96893311 365.70669556]\n", " [ 0. 0. 1. ]]\n" ] } ], "source": [ "# Convert the intrinsics to a matrix\n", "fx, cx, fy, cy = extracted_intrinsics[\"cameraMatrix\"]\n", "intrinsics_matrix = np.array([\n", " [fx, 0, cx],\n", " [0, fy, cy],\n", " [0, 0, 1]\n", "])\n", "print(intrinsics_matrix)" ] }, { "cell_type": "code", "execution_count": 8, "id": "bcc54c97", "metadata": {}, "outputs": [], "source": [ "# Save all observations for the calibrated camera and corresponding gripper positions\n", "images = []\n", "cartesian_poses = []\n", "for step in ep[\"steps\"]:\n", " image = step[\"observation\"][calib_image_name].numpy()\n", " images.append(image)\n", " cartesian_pose = step[\"observation\"][\"cartesian_position\"].numpy()\n", " cartesian_poses.append(cartesian_pose)\n", "\n", "# length images x 6\n", "cartesian_poses = np.array(cartesian_poses)\n", "# Remove the rotation and make homogeneous: --> length images x 3 --> length images x 4\n", "cartesian_homogeneous_positions = cartesian_poses[:, :3]\n", "cartesian_homogeneous_positions = np.hstack(\n", " 
(cartesian_homogeneous_positions, np.ones((cartesian_homogeneous_positions.shape[0], 1)))\n", ")\n", "\n", "# Transpose to support matrix multiplication: --> 4 x length images\n", "gripper_position_base = cartesian_homogeneous_positions.T" ] }, { "cell_type": "code", "execution_count": 9, "id": "95496c61", "metadata": {}, "outputs": [], "source": [ "# Transform gripper position to camera frame, then remove homogeneous component\n", "base_to_cam_extrinsics_matrix = np.linalg.inv(cam_to_base_extrinsics_matrix)\n", "robot_gripper_position_cam = base_to_cam_extrinsics_matrix @ gripper_position_base\n", "robot_gripper_position_cam = robot_gripper_position_cam[:3] # Now 3 x length images" ] }, { "cell_type": "code", "execution_count": 10, "id": "e2b66135", "metadata": {}, "outputs": [], "source": [ "# Finally, use intrinsics to project the gripper position in camera frame into pixel space\n", "pixel_positions = intrinsics_matrix @ robot_gripper_position_cam[:3]\n", "pixel_positions = pixel_positions[:2] / pixel_positions[2]" ] }, { "cell_type": "code", "execution_count": 11, "id": "ff8d3c50", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 558/558 [00:04<00:00, 137.22it/s]\n" ] } ], "source": [ "# Visualize!\n", "vis_images = []\n", "temp_img_path = f\"{path_to_droid_repo}/TEMP.png\"\n", "\n", "for i, image in enumerate(tqdm(images)):\n", " if i % 10 != 0:\n", " continue\n", " \n", " fig, axs = plt.subplots(1)\n", " x, y = pixel_positions[0, i] / 1280 * 320, pixel_positions[1, i] / 720 * 180 # Scale to match image dimensions\n", "\n", " # clip coords\n", " x = np.clip(x, 0, 320)\n", " y = np.clip(y, 0, 180)\n", "\n", " axs.imshow(image)\n", " axs.scatter(x, y, c='red', s=20)\n", " axs.set_xlim(0, 320)\n", " axs.set_ylim(180, 0) # Invert y-axis to match image\n", "\n", " # turn off axes\n", " axs.axis('off')\n", "\n", " # save the figure, then reopen it as PIL image\n", " plt.savefig(temp_img_path, bbox_inches='tight', 
pad_inches=0)\n", " plt.close(fig)\n", "\n", " vis_image = Image.open(temp_img_path).convert(\"RGB\")\n", " vis_images.append(np.array(vis_image))" ] }, { "cell_type": "code", "execution_count": 12, "id": "72049f59", "metadata": {}, "outputs": [ { "data": { "text/html": [ "