RoboticsDiffusionTransformer / tabletop_experiments.py
euijinrnd's picture
Add files using upload-large-folder tool
9de9fbf verified
raw
history blame
7.43 kB
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
import draccus
import numpy as np
import tqdm
from PIL import Image
import torch
import tabletop
from dm_env import StepType as st
import imageio
import time
import yaml
from scripts.agilex_model import create_model
import random
from PIL import Image, ImageDraw, ImageFont
def set_seed_everywhere(seed: int):
"""Sets the random seed for Python, NumPy, and PyTorch functions."""
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["PYTHONHASHSEED"] = str(seed)
def save_rollout_video(rollout_images, idx, success, task_description, log_file=None, folder=None, subtitile=None):
"""Saves an MP4 replay of an episode."""
if folder is None:
rollout_dir = f"./rollouts/{DATE}"
else:
rollout_dir = f"./rollouts/{DATE}/{folder}/videos"
os.makedirs(rollout_dir, exist_ok=True)
processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
mp4_path = f"{rollout_dir}/{DATE_TIME}--episode={idx}--success={success}--task={processed_task_description}.mp4"
video_writer = imageio.get_writer(mp4_path, fps=30)
for img in rollout_images:
if subtitile:
pil_img = Image.fromarray(img)
draw = ImageDraw.Draw(pil_img)
font = ImageFont.load_default()
# Increase font size by using a truetype font
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" # Adjust path as needed
font = ImageFont.truetype(font_path, size=24) # Set font size to 24
# 🔧 bbox로 텍스트 크기 계산
bbox = draw.textbbox((0, 0), subtitile, font=font)
text_width = bbox[2] - bbox[0]
text_height = bbox[3] - bbox[1]
text_x = (pil_img.width - text_width) // 2
text_y = pil_img.height - text_height - 10
draw.text((text_x, text_y), subtitile, font=font, fill="white")
img = np.array(pil_img)
video_writer.append_data(img)
video_writer.close()
print(f"Saved rollout MP4 at path {mp4_path}")
if log_file is not None:
log_file.write(f"Saved rollout MP4 at path {mp4_path}\n")
return mp4_path
DATE = time.strftime("%Y_%m_%d")
DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["MUJOCO_GL"] = "egl"
@dataclass
class Config:
checkpoint: Union[str, Path] = ""
twin_or_sing: str = 'TwinVLA'
task_name: str = "aloha_dish_drainer" # Task name
action_space: str = "ee_6d_pos"
num_steps_wait: int = 10 # Number of steps to wait for objects to stabilize in sim
num_trials_per_task: int = 5 # Number of rollouts per task
action_len: int = 20
benchmark: bool = True
run_id_note: Optional[str] = None # Extra note to add in run ID for logging
seed: int = 48
@draccus.wrap()
def eval_tabletop(cfg: Config) -> None:
set_seed_everywhere(cfg.seed)
unnorm_key = cfg.task_name
with open('configs/base.yaml', "r") as fp:
config = yaml.safe_load(fp)
## model load
model = create_model(
args=config,
dtype=torch.bfloat16,
pretrained=cfg.checkpoint,
pretrained_text_encoder_name_or_path="google/t5-v1_1-xxl",
pretrained_vision_encoder_name_or_path="google/siglip-so400m-patch14-384",
control_frequency=20
)
##################
env = tabletop.env(cfg.task_name, cfg.action_space)
highest_rewards = []
episode_returns = []
for rollout_id in tqdm.tqdm(range(cfg.num_trials_per_task)):
np.random.seed(cfg.seed + rollout_id)
ts = env.reset()
if cfg.benchmark:
ts = env.task.benchmark_init(env.physics, rollout_id)
action_counter = 0
replay_images = []
rewards = []
last_front_img = ts.observation['images']['back']
last_right_wrist_img = ts.observation['images']['wrist_right']
last_left_wrist_img = ts.observation['images']['wrist_left']
with torch.inference_mode():
while True:
obs = ts.observation
replay_images.append(obs['images']['back'])
if action_counter == 0:
front_img = obs['images']['back']
right_wrist_img = obs['images']['wrist_right']
left_wrist_img = obs['images']['wrist_left']
image_arrs = [
last_front_img,
last_right_wrist_img,
last_left_wrist_img,
front_img,
right_wrist_img,
left_wrist_img
]
images = [Image.fromarray(arr) if arr is not None else None for arr in image_arrs]
proprio = torch.tensor(obs['ee_6d_pos']).unsqueeze(0)
actions = model.step(
proprio=proprio,
images=images,
instruction=obs['language_instruction']
).squeeze(0).cpu().numpy()
action = actions[action_counter]
ts = env.step(action)
rewards.append(ts.reward)
action_counter += 1
if action_counter == cfg.action_len:
action_counter = 0
if ts.reward == env.task.max_reward or ts.step_type==st.LAST:
break
last_front_img = ts.observation['images']['back']
last_right_wrist_img = ts.observation['images']['wrist_right']
last_left_wrist_img = ts.observation['images']['wrist_left']
rewards = np.array(rewards)
episode_return = np.sum(rewards[rewards!=None])
episode_returns.append(episode_return)
episode_highest_reward = np.max(rewards)
highest_rewards.append(episode_highest_reward)
env_max_reward = env.task.max_reward
save_rollout_video(
replay_images, rollout_id, success=episode_highest_reward==env_max_reward, task_description=cfg.task_name, folder=f"{cfg.checkpoint.split('/')[-1]}-{cfg.task_name}-{cfg.seed}", subtitile=obs['language_instruction']
)
replay_images.clear()
success_rate = np.mean(np.array(highest_rewards) == env_max_reward)
avg_return = np.mean(episode_returns)
summary_str = f'\nSuccess rate: {success_rate}\nAverage return: {avg_return}\n\n'
for r in range(env_max_reward+1):
more_or_equal_r = (np.array(highest_rewards) >= r).sum()
more_or_equal_r_rate = more_or_equal_r / cfg.num_trials_per_task
summary_str += f'Reward >= {r}: {more_or_equal_r}/{cfg.num_trials_per_task} = {more_or_equal_r_rate*100}%\n'
log_dir = Path('rollouts') / DATE / f"{cfg.checkpoint.split('/')[-1]}-{cfg.task_name}-{cfg.seed}"
log_dir.mkdir(parents=True, exist_ok=True)
summary_file = log_dir / "summary.txt"
with summary_file.open("w") as f:
f.write(summary_str)
if __name__ == "__main__":
eval_tabletop()