euijinrnd committed on
Commit d899b9f · verified · 1 Parent(s): eef26ad

Add files using upload-large-folder tool

Files changed (44)
  1. configs/base.yaml +71 -0
  2. configs/calvin_rel_traj_location_bounds_task_ABC_D.json +50 -0
  3. configs/dataset_control_freq.json +73 -0
  4. configs/dataset_img_keys.json +674 -0
  5. configs/dataset_stat.json +0 -0
  6. configs/finetune_datasets.json +5 -0
  7. configs/finetune_sample_weights.json +5 -0
  8. configs/pretrain_datasets.json +3 -0
  9. configs/pretrain_sample_weights.json +3 -0
  10. configs/state_vec.py +114 -0
  11. configs/zero2.json +14 -0
  12. data/aloha/hdf5totfrecords.py +98 -0
  13. data/aloha/unzip_data.sh +3 -0
  14. data/bridgev2/bridgedata_numpy_to_tfrecord.py +174 -0
  15. data/bridgev2/bridgedata_raw_to_numpy.py +316 -0
  16. data/bridgev2/download.sh +13 -0
  17. data/calvin/download.sh +19 -0
  18. data/calvin/hdf5totfrecords.py +92 -0
  19. data/rh20t/hdf5totfrecords.py +200 -0
  20. data/roboset/download.py +42 -0
  21. data/roboset/download.sh +21 -0
  22. data/roboset/h5totfrecords.py +82 -0
  23. data/roboset/links.txt +197 -0
  24. docs/pretrain.md +270 -0
  25. docs/test_6drot.py +99 -0
  26. eval_sim/eval_dp.py +166 -0
  27. eval_sim/eval_octo.py +182 -0
  28. eval_sim/eval_openvla.py +175 -0
  29. eval_sim/eval_rdt_maniskill.py +137 -0
  30. lang_embed/aloha_dish_drainer.pt +3 -0
  31. lang_embed/aloha_handover_box.pt +3 -0
  32. lang_embed/aloha_lift_box.pt +3 -0
  33. lang_embed/aloha_shoes_table.pt +3 -0
  34. lang_embed/anubis_brush_to_pan.pt +3 -0
  35. lang_embed/anubis_carrot_to_bag.pt +3 -0
  36. lang_embed/anubis_towel_kirby.pt +3 -0
  37. scripts/agilex_inference.py +658 -0
  38. scripts/agilex_model.py +313 -0
  39. scripts/encode_lang_batch.py +76 -0
  40. scripts/maniskill_model.py +277 -0
  41. train/dataset.py +467 -0
  42. train/image_corrupt.py +44 -0
  43. train/sample.py +99 -0
  44. train/train.py +509 -0
configs/base.yaml ADDED
@@ -0,0 +1,71 @@
+common:
+  # The number of historical images
+  img_history_size: 2
+  # The number of future actions to predict
+  action_chunk_size: 64
+  # The number of cameras to be used in the model
+  num_cameras: 3
+  # Dimension of state/action; we use the same space for both state and action
+  # This MUST be equal to configs/state_vec.py
+  state_dim: 128
+
+
+dataset:
+  # We extract the data from the raw dataset
+  # and store it in a disk buffer via a producer.
+  # During training, a consumer reads data randomly from the buffer.
+  # The producer replaces data that has already been
+  # read by the consumer with new data.
+
+  # The path to the buffer (at least 400GB)
+  buf_path: /home/jellyho/RDTBuffer
+  # The number of chunks in the buffer
+  buf_num_chunks: 128
+  # The number of samples (steps rather than episodes) in each chunk
+  buf_chunk_size: 128
+
+  # We filter out episodes shorter than `epsd_len_thresh_low`
+  epsd_len_thresh_low: 32
+  # For episodes longer than `epsd_len_thresh_high`,
+  # we randomly sample `epsd_len_thresh_high` steps each time we load the episode
+  # to better balance the training datasets
+  epsd_len_thresh_high: 2048
+  # How to fit the image size
+  image_aspect_ratio: pad
+  # Maximum number of language tokens
+  tokenizer_max_length: 1024
+
+model:
+  # Config for condition adaptors
+  lang_adaptor: mlp2x_gelu
+  img_adaptor: mlp2x_gelu
+  state_adaptor: mlp3x_gelu
+  lang_token_dim: 4096
+  img_token_dim: 1152
+  # Dim of action or proprioception vector
+  # A `state` refers to an action or a proprioception vector
+  state_token_dim: 128
+  # Config for the RDT structure
+  rdt:
+    # 1B: num_heads 32, hidden_size 2048
+    hidden_size: 2048
+    depth: 28
+    num_heads: 32
+    cond_pos_embed_type: multimodal
+  # For the noise scheduler
+  noise_scheduler:
+    type: ddpm
+    num_train_timesteps: 1000
+    num_inference_timesteps: 5
+    beta_schedule: squaredcos_cap_v2  # Critical choice
+    prediction_type: sample
+    clip_sample: False
+  # For EMA (parameter averaging)
+  # We do not use EMA currently
+  ema:
+    update_after_step: 0
+    inv_gamma: 1.0
+    power: 0.75
+    min_value: 0.0
+    max_value: 0.9999
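
The config above is plain YAML; a minimal sketch of loading it and reading a few of the fields (assuming PyYAML is installed and the repo root is the working directory — not part of the committed code):

```python
# Minimal sketch: load configs/base.yaml and read a few of the fields above.
import yaml

with open("configs/base.yaml", "r") as f:
    config = yaml.safe_load(f)

print(config["common"]["img_history_size"])   # 2 historical images
print(config["common"]["action_chunk_size"])  # 64 future actions
print(config["common"]["state_dim"])          # 128, must match configs/state_vec.py
print(config["model"]["rdt"]["hidden_size"])  # 2048
```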
configs/calvin_rel_traj_location_bounds_task_ABC_D.json ADDED
@@ -0,0 +1,50 @@
+{
+    "A": [
+        [
+            -0.2691913843154907,
+            -0.21995729207992554,
+            -0.182277649641037
+        ],
+        [
+            0.35127854347229004,
+            0.2769763469696045,
+            0.17159393429756165
+        ]
+    ],
+    "B": [
+        [
+            -0.2576896846294403,
+            -0.22244493663311005,
+            -0.20557966828346252
+        ],
+        [
+            0.32854634523391724,
+            0.2922680974006653,
+            0.17373555898666382
+        ]
+    ],
+    "C": [
+        [
+            -0.29205888509750366,
+            -0.24688798189163208,
+            -0.17577645182609558
+        ],
+        [
+            0.25053921341896057,
+            0.3277084231376648,
+            0.16431939601898193
+        ]
+    ],
+    "D": [
+        [
+            -0.25131964683532715,
+            -0.15233077108860016,
+            -0.13294968008995056
+        ],
+        [
+            0.19209328293800354,
+            0.19344553351402283,
+            0.1370421051979065
+        ]
+    ]
+}
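
The file name suggests these are per-split [min, max] bounds on relative trajectory locations for CALVIN tasks A–D. A hedged sketch of how such bounds could be used for min-max normalization (an assumption; the committed training code that consumes this file is not shown in this diff):

```python
# Sketch: normalize a relative XYZ displacement to [-1, 1] with the bounds above.
# Assumes the first triple is the per-axis minimum and the second the maximum.
import json
import numpy as np

with open("configs/calvin_rel_traj_location_bounds_task_ABC_D.json") as f:
    bounds = json.load(f)

low, high = (np.array(b) for b in bounds["A"])       # bounds for split "A"
rel_xyz = np.array([0.05, -0.10, 0.02])              # hypothetical displacement
normalized = 2.0 * (rel_xyz - low) / (high - low) - 1.0
denormalized = (normalized + 1.0) / 2.0 * (high - low) + low  # inverse mapping
```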
configs/dataset_control_freq.json ADDED
@@ -0,0 +1,73 @@
+{
+    "fractal20220817_data": 3,
+    "taco_play": 15,
+    "jaco_play": 10,
+    "berkeley_cable_routing": 10,
+    "nyu_door_opening_surprising_effectiveness": 3,
+    "viola": 20,
+    "berkeley_autolab_ur5": 5,
+    "toto": 30,
+    "kuka": 10,
+    "language_table": 10,
+    "columbia_cairlab_pusht_real": 10,
+    "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
+    "nyu_rot_dataset_converted_externally_to_rlds": 3,
+    "stanford_hydra_dataset_converted_externally_to_rlds": 10,
+    "austin_buds_dataset_converted_externally_to_rlds": 20,
+    "nyu_franka_play_dataset_converted_externally_to_rlds": 3,
+    "maniskill_dataset_converted_externally_to_rlds": 20,
+    "furniture_bench_dataset_converted_externally_to_rlds": 10,
+    "ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
+    "ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
+    "austin_sailor_dataset_converted_externally_to_rlds": 20,
+    "austin_sirius_dataset_converted_externally_to_rlds": 20,
+    "bc_z": 10,
+    "utokyo_pr2_opening_fridge_converted_externally_to_rlds": 10,
+    "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": 10,
+    "utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
+    "utokyo_xarm_bimanual_converted_externally_to_rlds": 10,
+    "berkeley_mvp_converted_externally_to_rlds": 5,
+    "berkeley_rpt_converted_externally_to_rlds": 30,
+    "kaist_nonprehensile_converted_externally_to_rlds": 10,
+    "stanford_mask_vit_converted_externally_to_rlds": 0,
+    "tokyo_u_lsmo_converted_externally_to_rlds": 10,
+    "dlr_sara_pour_converted_externally_to_rlds": 10,
+    "dlr_sara_grid_clamp_converted_externally_to_rlds": 10,
+    "dlr_edan_shared_control_converted_externally_to_rlds": 5,
+    "asu_table_top_converted_externally_to_rlds": 12.5,
+    "stanford_robocook_converted_externally_to_rlds": 5,
+    "eth_agent_affordances": 66.6,
+    "imperialcollege_sawyer_wrist_cam": 10,
+    "iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
+    "uiuc_d3field": 1,
+    "utaustin_mutex": 20,
+    "berkeley_fanuc_manipulation": 10,
+    "cmu_play_fusion": 5,
+    "cmu_stretch": 10,
+    "berkeley_gnm_recon": 3,
+    "berkeley_gnm_cory_hall": 5,
+    "berkeley_gnm_sac_son": 10,
+    "robo_net": 1,
+    "roboturk_real_towercreation": 10,
+    "roboturk_real_laundrylayout": 10,
+    "roboturk_real_objectsearch": 10,
+    "aloha_mobile": 50,
+    "aloha_static": 50,
+    "roboset": 5,
+    "droid": 15,
+    "fmb": 10,
+    "dobbe": 30,
+    "qut_dexterous_manpulation": 30,
+    "agilex": 25,
+    "rh20t": 10,
+    "calvin": 30,
+    "bridgev2": 5,
+    "aloha_dish_drainer": 20,
+    "aloha_handover_box": 20,
+    "aloha_shoes_table": 20,
+    "aloha_lift_box": 20,
+    "aloha_box_into_pot": 20,
+    "anubis_towel_kirby": 20,
+    "anubis_carrot_to_bag": 20,
+    "anubis_brush_to_pan": 20
+}
configs/dataset_img_keys.json ADDED
@@ -0,0 +1,674 @@
1
+ {
2
+ "anubis_towel_kirby": {
3
+ "image_keys": [
4
+ "agentview_image",
5
+ "right_wrist_image",
6
+ "left_wrist_image",
7
+ "agentview_image"
8
+ ],
9
+ "image_mask":[
10
+ 1,1,1,0
11
+ ]
12
+ },
13
+ "anubis_carrot_to_bag": {
14
+ "image_keys": [
15
+ "agentview_image",
16
+ "right_wrist_image",
17
+ "left_wrist_image",
18
+ "agentview_image"
19
+ ],
20
+ "image_mask":[
21
+ 1,1,1,0
22
+ ]
23
+ },
24
+ "anubis_brush_to_pan": {
25
+ "image_keys": [
26
+ "agentview_image",
27
+ "right_wrist_image",
28
+ "left_wrist_image",
29
+ "agentview_image"
30
+ ],
31
+ "image_mask":[
32
+ 1,1,1,0
33
+ ]
34
+ },
35
+ "aloha_box_into_pot": {
36
+ "image_keys": [
37
+ "agentview_image",
38
+ "right_wrist_image",
39
+ "left_wrist_image",
40
+ "agentview_image"
41
+ ],
42
+ "image_mask":[
43
+ 1,1,1,0
44
+ ]
45
+ },
46
+ "aloha_box_into_pot_easy": {
47
+ "image_keys": [
48
+ "agentview_image",
49
+ "right_wrist_image",
50
+ "left_wrist_image",
51
+ "agentview_image"
52
+ ],
53
+ "image_mask":[
54
+ 1,1,1,0
55
+ ]
56
+ },
57
+ "aloha_dish_drainer": {
58
+ "image_keys": [
59
+ "agentview_image",
60
+ "right_wrist_image",
61
+ "left_wrist_image",
62
+ "agentview_image"
63
+ ],
64
+ "image_mask":[
65
+ 1,1,1,0
66
+ ]
67
+ },
68
+ "aloha_handover_box": {
69
+ "image_keys": [
70
+ "agentview_image",
71
+ "right_wrist_image",
72
+ "left_wrist_image",
73
+ "agentview_image"
74
+ ],
75
+ "image_mask":[
76
+ 1,1,1,0
77
+ ]
78
+ },
79
+ "aloha_shoes_table": {
80
+ "image_keys": [
81
+ "agentview_image",
82
+ "right_wrist_image",
83
+ "left_wrist_image",
84
+ "agentview_image"
85
+ ],
86
+ "image_mask":[
87
+ 1,1,1,0
88
+ ]
89
+ },
90
+ "aloha_lift_box": {
91
+ "image_keys": [
92
+ "agentview_image",
93
+ "right_wrist_image",
94
+ "left_wrist_image",
95
+ "agentview_image"
96
+ ],
97
+ "image_mask":[
98
+ 1,1,1,0
99
+ ]
100
+ },
101
+ "fractal20220817_data": {
102
+ "image_keys": [
103
+ "image",
104
+ "image",
105
+ "image",
106
+ "image"
107
+ ],
108
+ "image_mask":[
109
+ 1,0,0,0
110
+ ]
111
+ },
112
+ "taco_play": {
113
+ "image_keys": [
114
+ "rgb_static",
115
+ "rgb_gripper",
116
+ "rgb_static",
117
+ "rgb_static"
118
+ ],
119
+ "image_mask":[
120
+ 1,1,0,0
121
+ ]
122
+ },
123
+ "jaco_play": {
124
+ "image_keys": [
125
+ "image",
126
+ "image_wrist",
127
+ "image_wrist",
128
+ "image_wrist"
129
+ ],
130
+ "image_mask":[
131
+ 1,1,0,0
132
+ ]
133
+ },
134
+ "berkeley_cable_routing": {
135
+ "image_keys": [
136
+ "image",
137
+ "wrist45_image",
138
+ "wrist225_image",
139
+ "top_image"
140
+ ],
141
+ "image_mask":[1,1,0,1]
142
+ },
143
+ "nyu_door_opening_surprising_effectiveness": {
144
+ "image_keys": [
145
+ "image",
146
+ "image",
147
+ "image",
148
+ "image"
149
+ ],
150
+ "image_mask":[1,0,0,0]
151
+ },
152
+ "viola": {
153
+ "image_keys": [
154
+ "agentview_rgb",
155
+ "eye_in_hand_rgb",
156
+ "eye_in_hand_rgb",
157
+ "eye_in_hand_rgb"
158
+ ],
159
+ "image_mask":[1,1,0,0]
160
+ },
161
+ "berkeley_autolab_ur5": {
162
+ "image_keys": [
163
+ "image",
164
+ "hand_image",
165
+ "hand_image",
166
+ "hand_image"
167
+ ],
168
+ "image_mask":[1,1,0,0]
169
+ },
170
+ "toto": {
171
+ "image_keys": [
172
+ "image",
173
+ "image",
174
+ "image",
175
+ "image"
176
+ ],
177
+ "image_mask":[1,0,0,0]
178
+ },
179
+ "kuka": {
180
+ "image_keys": [
181
+ "image",
182
+ "image",
183
+ "image",
184
+ "image"
185
+ ],
186
+ "image_mask":[1,0,0,0]
187
+ },
188
+ "language_table": {
189
+ "image_keys": [
190
+ "rgb",
191
+ "rgb",
192
+ "rgb",
193
+ "rgb"
194
+ ],
195
+ "image_mask":[1,0,0,0]
196
+ },
197
+ "columbia_cairlab_pusht_real": {
198
+ "image_keys": [
199
+ "image",
200
+ "wrist_image",
201
+ "wrist_image",
202
+ "wrist_image"
203
+ ],
204
+ "image_mask":[1,1,0,0]
205
+ },
206
+ "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
207
+ "image_keys": [
208
+ "image",
209
+ "image",
210
+ "image",
211
+ "image"
212
+ ],
213
+ "image_mask":[1,0,0,0]
214
+ },
215
+ "nyu_rot_dataset_converted_externally_to_rlds": {
216
+ "image_keys": [
217
+ "image",
218
+ "image",
219
+ "image",
220
+ "image"
221
+ ],
222
+ "image_mask":[1,0,0,0]
223
+ },
224
+ "stanford_hydra_dataset_converted_externally_to_rlds": {
225
+ "image_keys": [
226
+ "image",
227
+ "wrist_image",
228
+ "wrist_image",
229
+ "wrist_image"
230
+ ],
231
+ "image_mask":[1,1,0,0]
232
+ },
233
+ "austin_buds_dataset_converted_externally_to_rlds": {
234
+ "image_keys": [
235
+ "image",
236
+ "wrist_image",
237
+ "wrist_image",
238
+ "wrist_image"
239
+ ],
240
+ "image_mask":[1,1,0,0]
241
+ },
242
+ "nyu_franka_play_dataset_converted_externally_to_rlds": {
243
+ "image_keys": [
244
+ "image",
245
+ "image_additional_view",
246
+ "image_additional_view",
247
+ "image_additional_view"
248
+ ],
249
+ "image_mask":[1,0,0,1]
250
+ },
251
+ "maniskill_dataset_converted_externally_to_rlds": {
252
+ "image_keys": [
253
+ "image",
254
+ "wrist_image",
255
+ "wrist_image",
256
+ "wrist_image"
257
+ ],
258
+ "image_mask":[1,1,0,0]
259
+ },
260
+ "furniture_bench_dataset_converted_externally_to_rlds": {
261
+ "image_keys": [
262
+ "image",
263
+ "wrist_image",
264
+ "wrist_image",
265
+ "wrist_image"
266
+ ],
267
+ "image_mask":[1,1,0,0]
268
+ },
269
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": {
270
+ "image_keys": [
271
+ "image",
272
+ "image",
273
+ "image",
274
+ "image"
275
+ ],
276
+ "image_mask":[1,0,0,0]
277
+ },
278
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
279
+ "image_keys": [
280
+ "image",
281
+ "image",
282
+ "image",
283
+ "image"
284
+ ],
285
+ "image_mask":[1,0,0,0]
286
+ },
287
+ "austin_sailor_dataset_converted_externally_to_rlds": {
288
+ "image_keys": [
289
+ "image",
290
+ "wrist_image",
291
+ "wrist_image",
292
+ "wrist_image"
293
+ ],
294
+ "image_mask":[1,1,0,0]
295
+ },
296
+ "austin_sirius_dataset_converted_externally_to_rlds": {
297
+ "image_keys": [
298
+ "image",
299
+ "wrist_image",
300
+ "wrist_image",
301
+ "wrist_image"
302
+ ],
303
+ "image_mask":[1,1,0,0]
304
+ },
305
+ "bc_z": {
306
+ "image_keys": [
307
+ "image",
308
+ "image",
309
+ "image",
310
+ "image"
311
+ ],
312
+ "image_mask":[1,0,0,0]
313
+ },
314
+ "utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
315
+ "image_keys": [
316
+ "image",
317
+ "image",
318
+ "image",
319
+ "image"
320
+ ],
321
+ "image_mask":[1,0,0,0]
322
+ },
323
+ "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
324
+ "image_keys": [
325
+ "image",
326
+ "image",
327
+ "image",
328
+ "image"
329
+ ],
330
+ "image_mask":[1,0,0,0]
331
+ },
332
+ "utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
333
+ "image_keys": [
334
+ "image",
335
+ "hand_image",
336
+ "hand_image",
337
+ "image2"
338
+ ],
339
+ "image_mask":[1,1,0,1]
340
+ },
341
+ "utokyo_xarm_bimanual_converted_externally_to_rlds": {
342
+ "image_keys": [
343
+ "image",
344
+ "image",
345
+ "image",
346
+ "image"
347
+ ],
348
+ "image_mask":[1,0,0,0]
349
+ },
350
+ "berkeley_mvp_converted_externally_to_rlds": {
351
+ "image_keys": [
352
+ "hand_image",
353
+ "hand_image",
354
+ "hand_image",
355
+ "hand_image"
356
+ ],
357
+ "image_mask":[0,1,0,0]
358
+ },
359
+ "berkeley_rpt_converted_externally_to_rlds": {
360
+ "image_keys": [
361
+ "hand_image",
362
+ "hand_image",
363
+ "hand_image",
364
+ "hand_image"
365
+ ],
366
+ "image_mask":[0,1,0,0]
367
+ },
368
+ "kaist_nonprehensile_converted_externally_to_rlds": {
369
+ "image_keys": [
370
+ "image",
371
+ "image",
372
+ "image",
373
+ "image"
374
+ ],
375
+ "image_mask":[1,0,0,0]
376
+ },
377
+ "stanford_mask_vit_converted_externally_to_rlds": {
378
+ "image_keys": [
379
+ "image",
380
+ "image",
381
+ "image",
382
+ "image"
383
+ ],
384
+ "image_mask":[1,0,0,0]
385
+ },
386
+ "tokyo_u_lsmo_converted_externally_to_rlds": {
387
+ "image_keys": [
388
+ "image",
389
+ "image",
390
+ "image",
391
+ "image"
392
+ ],
393
+ "image_mask":[1,0,0,0]
394
+ },
395
+ "dlr_sara_pour_converted_externally_to_rlds": {
396
+ "image_keys": [
397
+ "image",
398
+ "image",
399
+ "image",
400
+ "image"
401
+ ],
402
+ "image_mask":[1,0,0,0]
403
+ },
404
+ "dlr_sara_grid_clamp_converted_externally_to_rlds": {
405
+ "image_keys": [
406
+ "image",
407
+ "image",
408
+ "image",
409
+ "image"
410
+ ],
411
+ "image_mask":[1,0,0,0]
412
+ },
413
+ "dlr_edan_shared_control_converted_externally_to_rlds": {
414
+ "image_keys": [
415
+ "image",
416
+ "image",
417
+ "image",
418
+ "image"
419
+ ],
420
+ "image_mask":[1,0,0,0]
421
+ },
422
+ "asu_table_top_converted_externally_to_rlds": {
423
+ "image_keys": [
424
+ "image",
425
+ "image",
426
+ "image",
427
+ "image"
428
+ ],
429
+ "image_mask":[1,0,0,0]
430
+ },
431
+ "stanford_robocook_converted_externally_to_rlds": {
432
+ "image_keys": [
433
+ "image_2",
434
+ "image_1",
435
+ "image_3",
436
+ "image_4"
437
+ ],
438
+ "image_mask":[1,0,0,1]
439
+ },
440
+ "eth_agent_affordances": {
441
+ "image_keys": [
442
+ "image",
443
+ "image",
444
+ "image",
445
+ "image"
446
+ ],
447
+ "image_mask":[1,0,0,0]
448
+ },
449
+ "imperialcollege_sawyer_wrist_cam": {
450
+ "image_keys": [
451
+ "image",
452
+ "wrist_image",
453
+ "wrist_image",
454
+ "wrist_image"
455
+ ],
456
+ "image_mask":[0,1,0,0]
457
+ },
458
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
459
+ "image_keys": [
460
+ "image",
461
+ "wrist_image",
462
+ "wrist_image",
463
+ "wrist_image"
464
+ ],
465
+ "image_mask":[1,1,0,0]
466
+ },
467
+ "uiuc_d3field": {
468
+ "image_keys": [
469
+ "image_1",
470
+ "image_2",
471
+ "image_3",
472
+ "image_4"
473
+ ],
474
+ "image_mask":[1,0,0,1]
475
+ },
476
+ "utaustin_mutex": {
477
+ "image_keys": [
478
+ "image",
479
+ "wrist_image",
480
+ "wrist_image",
481
+ "wrist_image"
482
+ ],
483
+ "image_mask":[1,1,0,0]
484
+ },
485
+ "berkeley_fanuc_manipulation": {
486
+ "image_keys": [
487
+ "image",
488
+ "wrist_image",
489
+ "wrist_image",
490
+ "wrist_image"
491
+ ],
492
+ "image_mask":[1,1,0,0]
493
+ },
494
+ "cmu_play_fusion": {
495
+ "image_keys": [
496
+ "image",
497
+ "image",
498
+ "image",
499
+ "image"
500
+ ],
501
+ "image_mask":[1,0,0,0]
502
+ },
503
+ "cmu_stretch": {
504
+ "image_keys": [
505
+ "image",
506
+ "image",
507
+ "image",
508
+ "image"
509
+ ],
510
+ "image_mask":[1,0,0,0]
511
+ },
512
+ "berkeley_gnm_recon": {
513
+ "image_keys": [
514
+ "image",
515
+ "image",
516
+ "image",
517
+ "image"
518
+ ],
519
+ "image_mask":[1,0,0,0]
520
+ },
521
+ "berkeley_gnm_cory_hall": {
522
+ "image_keys": [
523
+ "image",
524
+ "image",
525
+ "image",
526
+ "image"
527
+ ],
528
+ "image_mask":[1,0,0,0]
529
+ },
530
+ "berkeley_gnm_sac_son": {
531
+ "image_keys": [
532
+ "image",
533
+ "image",
534
+ "image",
535
+ "image"
536
+ ],
537
+ "image_mask":[1,0,0,0]
538
+ },
539
+ "robo_net": {
540
+ "image_keys": [
541
+ "image",
542
+ "image1",
543
+ "image2",
544
+ "image2"
545
+ ],
546
+ "image_mask":[1,0,0,1]
547
+ },
548
+ "roboturk_real_towercreation": {
549
+ "image_keys": [
550
+ "top_rgb_frame",
551
+ "front_rgb_frame",
552
+ "front_rgb_frame",
553
+ "front_rgb_frame"
554
+ ],
555
+ "image_mask":[1,0,0,1]
556
+ },
557
+ "roboturk_real_laundrylayout": {
558
+ "image_keys": [
559
+ "top_rgb_frame",
560
+ "front_rgb_frame",
561
+ "front_rgb_frame",
562
+ "front_rgb_frame"
563
+ ],
564
+ "image_mask":[1,0,0,1]
565
+ },
566
+ "roboturk_real_objectsearch": {
567
+ "image_keys": [
568
+ "top_rgb_frame",
569
+ "front_rgb_frame",
570
+ "front_rgb_frame",
571
+ "front_rgb_frame"
572
+ ],
573
+ "image_mask":[1,0,0,1]
574
+ },
575
+ "aloha_mobile": {
576
+ "image_keys": [
577
+ "cam_high",
578
+ "cam_right_wrist",
579
+ "cam_left_wrist",
580
+ "cam_right_wrist"
581
+ ],
582
+ "image_mask":[1,1,1,0]
583
+ },
584
+ "aloha_static": {
585
+ "image_keys": [
586
+ "cam_high",
587
+ "cam_right_wrist",
588
+ "cam_left_wrist",
589
+ "cam_low"
590
+ ],
591
+ "image_mask":[1,1,1,1]
592
+ },
593
+ "roboset": {
594
+ "image_keys": [
595
+ "rgb_top",
596
+ "rgb_right",
597
+ "rgb_left",
598
+ "rgb_right"
599
+ ],
600
+ "image_mask":[1,1,1,0]
601
+ },
602
+ "droid": {
603
+ "image_keys": [
604
+ "exterior_image_1_left",
605
+ "wrist_image_left",
606
+ "wrist_image_left",
607
+ "exterior_image_2_left"
608
+ ],
609
+ "image_mask":[1,1,0,1]
610
+ },
611
+ "fmb": {
612
+ "image_keys": [
613
+ "image_side_1",
614
+ "image_wrist_1",
615
+ "image_wrist_1",
616
+ "image_side_2"
617
+ ],
618
+ "image_mask":[1,1,0,1]
619
+ },
620
+ "dobbe": {
621
+ "image_keys": [
622
+ "wrist_image",
623
+ "wrist_image",
624
+ "wrist_image",
625
+ "wrist_image"
626
+ ],
627
+ "image_mask":[0,1,0,0]
628
+ },
629
+ "qut_dexterous_manpulation": {
630
+ "image_keys": [
631
+ "image",
632
+ "wrist_image",
633
+ "wrist_image",
634
+ "wrist_image"
635
+ ],
636
+ "image_mask":[1,1,0,0]
637
+ },
638
+ "agilex": {
639
+ "image_keys": [
640
+ "cam_high",
641
+ "cam_right_wrist",
642
+ "cam_left_wrist",
643
+ "cam_right_wrist"
644
+ ],
645
+ "image_mask":[1,1,1,0]
646
+ },
647
+ "rh20t": {
648
+ "image_keys": [
649
+ "image",
650
+ "image",
651
+ "image",
652
+ "image"
653
+ ],
654
+ "image_mask":[1,0,0,0]
655
+ },
656
+ "calvin": {
657
+ "image_keys": [
658
+ "rgb_static",
659
+ "rgb_gripper",
660
+ "rgb_gripper",
661
+ "rgb_gripper"
662
+ ],
663
+ "image_mask":[1,1,0,0]
664
+ },
665
+ "bridgev2": {
666
+ "image_keys": [
667
+ "images0",
668
+ "images0",
669
+ "images0",
670
+ "images0"
671
+ ],
672
+ "image_mask":[1,0,0,0]
673
+ }
674
+ }
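
Each entry above pairs four `image_keys` (camera streams, padded by repetition) with an `image_mask` marking which of the four slots are real cameras. A sketch of how an entry might be consumed (an assumption; the actual consumer, presumably train/dataset.py, is not shown in this diff):

```python
# Sketch: select the camera streams for one dataset and drop masked-out slots.
import json

with open("configs/dataset_img_keys.json") as f:
    img_keys = json.load(f)

entry = img_keys["aloha_static"]
keys, mask = entry["image_keys"], entry["image_mask"]

# `episode_step` is a hypothetical dict mapping camera key -> decoded image array.
episode_step = {k: None for k in keys}
images = [episode_step[k] if m == 1 else None for k, m in zip(keys, mask)]
```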
configs/dataset_stat.json ADDED
The diff for this file is too large to render. See raw diff
 
configs/finetune_datasets.json ADDED
@@ -0,0 +1,5 @@
+[
+    "anubis_brush_to_pan",
+    "anubis_carrot_to_bag",
+    "anubis_towel_kirby"
+]
configs/finetune_sample_weights.json ADDED
@@ -0,0 +1,5 @@
+{
+    "anubis_towel_kirby": 100,
+    "anubis_carrot_to_bag": 100,
+    "anubis_brush_to_pan": 100
+}
configs/pretrain_datasets.json ADDED
@@ -0,0 +1,3 @@
+[
+    "aloha_box_into_pot_easy"
+]
configs/pretrain_sample_weights.json ADDED
@@ -0,0 +1,3 @@
+{
+    "aloha_box_into_pot_easy": 100
+}
configs/state_vec.py ADDED
@@ -0,0 +1,114 @@
+STATE_VEC_IDX_MAPPING = {
+    # [0, 10): right arm joint positions
+    **{
+        'arm_joint_{}_pos'.format(i): i for i in range(10)
+    },
+    **{
+        'right_arm_joint_{}_pos'.format(i): i for i in range(10)
+    },
+    # [10, 15): right gripper joint positions
+    **{
+        'gripper_joint_{}_pos'.format(i): i + 10 for i in range(5)
+    },
+    **{
+        'right_gripper_joint_{}_pos'.format(i): i + 10 for i in range(5)
+    },
+    'gripper_open': 10,  # alias of right_gripper_joint_0_pos
+    'right_gripper_open': 10,
+    # [15, 25): right arm joint velocities
+    **{
+        'arm_joint_{}_vel'.format(i): i + 15 for i in range(10)
+    },
+    **{
+        'right_arm_joint_{}_vel'.format(i): i + 15 for i in range(10)
+    },
+    # [25, 30): right gripper joint velocities
+    **{
+        'gripper_joint_{}_vel'.format(i): i + 25 for i in range(5)
+    },
+    **{
+        'right_gripper_joint_{}_vel'.format(i): i + 25 for i in range(5)
+    },
+    'gripper_open_vel': 25,  # alias of right_gripper_joint_0_vel
+    'right_gripper_open_vel': 25,
+    # [30, 33): right end effector positions
+    'eef_pos_x': 30,
+    'right_eef_pos_x': 30,
+    'eef_pos_y': 31,
+    'right_eef_pos_y': 31,
+    'eef_pos_z': 32,
+    'right_eef_pos_z': 32,
+    # [33, 39): right end effector 6D pose
+    'eef_angle_0': 33,
+    'right_eef_angle_0': 33,
+    'eef_angle_1': 34,
+    'right_eef_angle_1': 34,
+    'eef_angle_2': 35,
+    'right_eef_angle_2': 35,
+    'eef_angle_3': 36,
+    'right_eef_angle_3': 36,
+    'eef_angle_4': 37,
+    'right_eef_angle_4': 37,
+    'eef_angle_5': 38,
+    'right_eef_angle_5': 38,
+    # [39, 42): right end effector velocities
+    'eef_vel_x': 39,
+    'right_eef_vel_x': 39,
+    'eef_vel_y': 40,
+    'right_eef_vel_y': 40,
+    'eef_vel_z': 41,
+    'right_eef_vel_z': 41,
+    # [42, 45): right end effector angular velocities
+    'eef_angular_vel_roll': 42,
+    'right_eef_angular_vel_roll': 42,
+    'eef_angular_vel_pitch': 43,
+    'right_eef_angular_vel_pitch': 43,
+    'eef_angular_vel_yaw': 44,
+    'right_eef_angular_vel_yaw': 44,
+    # [45, 50): reserved
+    # [50, 60): left arm joint positions
+    **{
+        'left_arm_joint_{}_pos'.format(i): i + 50 for i in range(10)
+    },
+    # [60, 65): left gripper joint positions
+    **{
+        'left_gripper_joint_{}_pos'.format(i): i + 60 for i in range(5)
+    },
+    'left_gripper_open': 60,  # alias of left_gripper_joint_0_pos
+    # [65, 75): left arm joint velocities
+    **{
+        'left_arm_joint_{}_vel'.format(i): i + 65 for i in range(10)
+    },
+    # [75, 80): left gripper joint velocities
+    **{
+        'left_gripper_joint_{}_vel'.format(i): i + 75 for i in range(5)
+    },
+    'left_gripper_open_vel': 75,  # alias of left_gripper_joint_0_vel
+    # [80, 83): left end effector positions
+    'left_eef_pos_x': 80,
+    'left_eef_pos_y': 81,
+    'left_eef_pos_z': 82,
+    # [83, 89): left end effector 6D pose
+    'left_eef_angle_0': 83,
+    'left_eef_angle_1': 84,
+    'left_eef_angle_2': 85,
+    'left_eef_angle_3': 86,
+    'left_eef_angle_4': 87,
+    'left_eef_angle_5': 88,
+    # [89, 92): left end effector velocities
+    'left_eef_vel_x': 89,
+    'left_eef_vel_y': 90,
+    'left_eef_vel_z': 91,
+    # [92, 95): left end effector angular velocities
+    'left_eef_angular_vel_roll': 92,
+    'left_eef_angular_vel_pitch': 93,
+    'left_eef_angular_vel_yaw': 94,
+    # [95, 100): reserved
+    # [100, 102): base linear velocities
+    'base_vel_x': 100,
+    'base_vel_y': 101,
+    # [102, 103): base angular velocities
+    'base_angular_vel': 102,
+    # [103, 128): reserved
+}
+STATE_VEC_LEN = 128
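
A short sketch of how this mapping can be used to pack a robot's proprioception into the 128-dimensional unified state vector (the joint values below are made-up examples, and the import assumes the repo root is on PYTHONPATH):

```python
# Sketch: fill the unified 128-dim state vector for a 7-DoF right arm + gripper.
import numpy as np
from configs.state_vec import STATE_VEC_IDX_MAPPING, STATE_VEC_LEN

state = np.zeros(STATE_VEC_LEN, dtype=np.float32)
mask = np.zeros(STATE_VEC_LEN, dtype=np.float32)   # marks which slots are populated

arm_qpos = [0.1, -0.2, 0.3, 0.0, 0.5, -0.1, 0.2]   # hypothetical joint positions
for j, q in enumerate(arm_qpos):
    idx = STATE_VEC_IDX_MAPPING[f"right_arm_joint_{j}_pos"]
    state[idx], mask[idx] = q, 1.0

idx = STATE_VEC_IDX_MAPPING["right_gripper_open"]  # alias of right_gripper_joint_0_pos
state[idx], mask[idx] = 1.0, 1.0                   # gripper fully open
```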
configs/zero2.json ADDED
@@ -0,0 +1,14 @@
+{
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 2,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9
+    }
+}
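
This DeepSpeed ZeRO-2 config leaves precision and batch sizes as "auto", which is typically resolved by a higher-level trainer. As one common wiring (an assumption, not necessarily how train/train.py does it), the Hugging Face Trainer integration fills the "auto" fields from its own arguments:

```python
# Sketch: passing configs/zero2.json through the HF Trainer DeepSpeed integration.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="checkpoints/rdt",      # hypothetical output dir
    per_device_train_batch_size=32,    # resolves "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=1,     # resolves "gradient_accumulation_steps": "auto"
    bf16=True,                         # resolves "bf16.enabled": "auto"
    deepspeed="configs/zero2.json",    # hand the ZeRO-2 config to DeepSpeed
)
```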
data/aloha/hdf5totfrecords.py ADDED
@@ -0,0 +1,98 @@
+import tensorflow as tf
+import h5py
+import os
+import fnmatch
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+def decode_img(img):
+    return cv2.cvtColor(cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
+
+def decode_all_imgs(imgs):
+    return [decode_img(img) for img in imgs]
+
+def _bytes_feature(value):
+    """Returns a bytes_list from a string / byte."""
+    if isinstance(value, type(tf.constant(0))):
+        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+def _bool_feature(value):
+    """Returns a bool_list from a boolean."""
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
+
+def serialize_example(action, base_action, qpos, qvel, cam_high, cam_left_wrist, cam_right_wrist, cam_low, instruction, terminate_episode):
+    if base_action is not None:
+        feature = {
+            'action': _bytes_feature(tf.io.serialize_tensor(action)),
+            'base_action': _bytes_feature(tf.io.serialize_tensor(base_action)),
+            'qpos': _bytes_feature(tf.io.serialize_tensor(qpos)),
+            'qvel': _bytes_feature(tf.io.serialize_tensor(qvel)),
+            'cam_high': _bytes_feature(tf.io.serialize_tensor(cam_high)),
+            'cam_left_wrist': _bytes_feature(tf.io.serialize_tensor(cam_left_wrist)),
+            'cam_right_wrist': _bytes_feature(tf.io.serialize_tensor(cam_right_wrist)),
+            'instruction': _bytes_feature(instruction),
+            'terminate_episode': _bool_feature(terminate_episode)
+        }
+    else:
+        feature = {
+            'action': _bytes_feature(tf.io.serialize_tensor(action)),
+            'qpos': _bytes_feature(tf.io.serialize_tensor(qpos)),
+            'qvel': _bytes_feature(tf.io.serialize_tensor(qvel)),
+            'cam_high': _bytes_feature(tf.io.serialize_tensor(cam_high)),
+            'cam_left_wrist': _bytes_feature(tf.io.serialize_tensor(cam_left_wrist)),
+            'cam_right_wrist': _bytes_feature(tf.io.serialize_tensor(cam_right_wrist)),
+            'cam_low': _bytes_feature(tf.io.serialize_tensor(cam_low)),
+            'instruction': _bytes_feature(instruction),
+            'terminate_episode': _bool_feature(terminate_episode)
+        }
+    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
+    return example_proto.SerializeToString()
+
+def write_tfrecords(root_dir, out_dir):
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+    num_files = 0
+    for root, dirs, files in os.walk(root_dir):
+        num_files += len(fnmatch.filter(files, '*.hdf5'))
+    with tqdm(total=num_files) as pbar:
+        for root, dirs, files in os.walk(root_dir):
+            for filename in fnmatch.filter(files, '*.hdf5'):
+                filepath = os.path.join(root, filename)
+                with h5py.File(filepath, 'r') as f:
+                    if 'instruction' not in f:
+                        continue
+                    pbar.update(1)
+                    output_dir = os.path.join(out_dir, os.path.relpath(root, root_dir))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    print(f"Writing TFRecords to {output_dir}")
+                    tfrecord_path = os.path.join(output_dir, filename.replace('.hdf5', '.tfrecord'))
+                    with tf.io.TFRecordWriter(tfrecord_path) as writer:
+                        num_episodes = f['action'].shape[0]
+                        for i in range(num_episodes):
+                            action = f['action'][i]
+                            if 'base_action' in f:
+                                base_action = f['base_action'][i]
+                            else:
+                                base_action = None
+                            qpos = f['observations']['qpos'][i]
+                            qvel = f['observations']['qvel'][i]
+                            cam_high = decode_img(f['observations']['images']['cam_high'][i])
+                            cam_left_wrist = decode_img(f['observations']['images']['cam_left_wrist'][i])
+                            cam_right_wrist = decode_img(f['observations']['images']['cam_right_wrist'][i])
+                            if 'cam_low' in f['observations']['images']:
+                                cam_low = decode_img(f['observations']['images']['cam_low'][i])
+                            else:
+                                cam_low = None
+                            instruction = f['instruction'][()]
+                            terminate_episode = i == num_episodes - 1
+                            serialized_example = serialize_example(action, base_action, qpos, qvel, cam_high, cam_left_wrist, cam_right_wrist, cam_low, instruction, terminate_episode)
+                            writer.write(serialized_example)
+                    print(f"TFRecords written to {tfrecord_path}")
+    print(f"TFRecords written to {out_dir}")
+
+root_dir = '../datasets/aloha/'
+output_dir = '../datasets/aloha/tfrecords/'
+
+write_tfrecords(root_dir, output_dir)
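
For completeness, a sketch of reading one of these TFRecords back, mirroring the keys written by serialize_example for the `base_action` layout (the tensor dtypes and the file path are assumptions, not part of the committed code):

```python
# Sketch: parse records written by write_tfrecords above (base_action layout).
import tensorflow as tf

feature_description = {
    'action': tf.io.FixedLenFeature([], tf.string),
    'base_action': tf.io.FixedLenFeature([], tf.string),
    'qpos': tf.io.FixedLenFeature([], tf.string),
    'qvel': tf.io.FixedLenFeature([], tf.string),
    'cam_high': tf.io.FixedLenFeature([], tf.string),
    'cam_left_wrist': tf.io.FixedLenFeature([], tf.string),
    'cam_right_wrist': tf.io.FixedLenFeature([], tf.string),
    'instruction': tf.io.FixedLenFeature([], tf.string),
    'terminate_episode': tf.io.FixedLenFeature([], tf.int64),
}

def parse_step(proto):
    parsed = tf.io.parse_single_example(proto, feature_description)
    qpos = tf.io.parse_tensor(parsed['qpos'], out_type=tf.float64)        # dtype assumed
    cam_high = tf.io.parse_tensor(parsed['cam_high'], out_type=tf.uint8)  # decoded RGB frames
    return qpos, cam_high, parsed['instruction']

# Hypothetical file path under the output directory used above.
ds = tf.data.TFRecordDataset('../datasets/aloha/tfrecords/episode_0.tfrecord')
for qpos, cam_high, instruction in ds.map(parse_step).take(1):
    print(qpos.shape, cam_high.shape, instruction.numpy())
```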
data/aloha/unzip_data.sh ADDED
@@ -0,0 +1,3 @@
+cd ../datasets/aloha/
+
+unzip aloha_mobile.zip
data/bridgev2/bridgedata_numpy_to_tfrecord.py ADDED
@@ -0,0 +1,174 @@
+"""
+Converts data from the BridgeData numpy format to TFRecord format.
+
+Consider the following directory structure for the input data:
+
+    bridgedata_numpy/
+        rss/
+            toykitchen2/
+                set_table/
+                    00/
+                        train/
+                            out.npy
+                        val/
+                            out.npy
+        icra/
+            ...
+
+The --depth parameter controls how much of the data to process at the
+--input_path; for example, if --depth=5, then --input_path should be
+"bridgedata_numpy", and all data will be processed. If --depth=3, then
+--input_path should be "bridgedata_numpy/rss/toykitchen2", and only data
+under "toykitchen2" will be processed.
+
+The same directory structure will be replicated under --output_path. For
+example, in the second case, the output will be written to
+"{output_path}/set_table/00/...".
+
+Can read/write directly from/to Google Cloud Storage.
+
+Written by Kevin Black ([email protected]).
+"""
+import os
+from multiprocessing import Pool
+
+import numpy as np
+import tensorflow as tf
+import tqdm
+from absl import app, flags, logging
+import pickle
+from multiprocessing import cpu_count
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_path", None, "Input path", required=True)
+flags.DEFINE_string("output_path", None, "Output path", required=True)
+flags.DEFINE_integer(
+    "depth",
+    5,
+    "Number of directories deep to traverse. Looks for {input_path}/dir_1/dir_2/.../dir_{depth-1}/train/out.npy",
+)
+flags.DEFINE_bool("overwrite", False, "Overwrite existing files")
+num_workers = 8
+flags.DEFINE_integer("num_workers", num_workers, "Number of threads to use")
+
+print(f"using {num_workers} workers")
+
+def tensor_feature(value):
+    return tf.train.Feature(
+        bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(value).numpy()])
+    )
+
+def _bytes_feature(value):
+    """Returns a bytes_list from a string / byte."""
+    if isinstance(value, type(tf.constant(0))):
+        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
+
+def _strings_feature(string_list):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[s.encode('utf-8') for s in string_list]))
+
+def _bool_feature(value):
+    """Returns a bool_list from a boolean."""
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
+
+
+def process(path):
+    # with tf.io.gfile.GFile(path, "rb") as f:
+    #     arr = np.load(f, allow_pickle=True)
+    try:
+        with tf.io.gfile.GFile(path, "rb") as f:
+            arr = np.load(path, allow_pickle=True)
+    except Exception as e:
+        print(f"Error loading {path}: {e}")
+        return
+
+    dirname = os.path.dirname(os.path.abspath(path))
+    outpath = os.path.join(FLAGS.output_path, *dirname.split(os.sep)[-FLAGS.depth:])
+
+    if tf.io.gfile.exists(outpath):
+        if FLAGS.overwrite:
+            logging.info(f"Deleting {outpath}")
+            tf.io.gfile.rmtree(outpath)
+        else:
+            logging.info(f"Skipping {outpath}")
+            return
+
+    if len(arr) == 0:
+        logging.info(f"Skipping {path}, empty")
+        return
+
+    tf.io.gfile.makedirs(outpath)
+
+    for i, traj in enumerate(arr):
+        write_path = f"{outpath}/out_{i}.tfrecord"
+        with tf.io.TFRecordWriter(write_path) as writer:
+            truncates = np.zeros(len(traj["actions"]), dtype=np.bool_)
+            truncates[-1] = True
+            frames_num = len(traj["observations"])
+            # remove empty strings
+            traj["language"] = [x for x in traj["language"] if x != ""]
+            if len(traj["language"]) == 0:
+                traj["language"] = [""]
+            instr = traj["language"][0]
+            if len(traj["language"]) > 2:
+                print(len(traj["language"]))
+            for i in range(frames_num):
+                tf_features = {
+                    "observations/images0": tensor_feature(
+                        np.array(
+                            [traj["observations"][i]["images0"]],
+                            dtype=np.uint8,
+                        )
+                    ),
+                    "observations/state": tensor_feature(
+                        np.array(
+                            [traj["observations"][i]["state"]],
+                            dtype=np.float32,
+                        )
+                    ),
+                    "observations/qpos": tensor_feature(
+                        np.array(
+                            [traj["observations"][i]["qpos"]],
+                            dtype=np.float32,
+                        )
+                    ),
+                    "observations/eef_transform": tensor_feature(
+                        np.array(
+                            [traj["observations"][i]["eef_transform"]],
+                            dtype=np.float32,
+                        )
+                    ),
+                    "language": _bytes_feature(instr),
+                    "actions": tensor_feature(
+                        np.array(traj["actions"][i], dtype=np.float32)
+                    ),
+                    "truncates": _bool_feature(i == frames_num - 1),
+                }
+                example = tf.train.Example(
+                    features=tf.train.Features(feature=tf_features)
+                )
+                writer.write(example.SerializeToString())
+
+
+def main(_):
+    assert FLAGS.depth >= 1
+
+    paths = tf.io.gfile.glob(
+        tf.io.gfile.join(FLAGS.input_path, *("*" * (FLAGS.depth - 1)))
+    )
+    paths = [f"{p}/train/out.npy" for p in paths] + [f"{p}/val/out.npy" for p in paths]
+    # num_episodes = 0
+    # for dirpath in paths:
+    #     with tf.io.gfile.GFile(dirpath, "rb") as f:
+    #         arr = np.load(dirpath, allow_pickle=True)
+    #     num_episodes += len(arr)
+    # print(num_episodes)
+    with Pool(FLAGS.num_workers) as p:
+        list(tqdm.tqdm(p.imap(process, paths), total=len(paths)))
+
+
+if __name__ == "__main__":
+    app.run(main)
data/bridgev2/bridgedata_raw_to_numpy.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ Converts data from the BridgeData raw format to numpy format.
3
+
4
+ Consider the following directory structure for the input data:
5
+
6
+ bridgedata_raw/
7
+ rss/
8
+ toykitchen2/
9
+ set_table/
10
+ 00/
11
+ 2022-01-01_00-00-00/
12
+ collection_metadata.json
13
+ config.json
14
+ diagnostics.png
15
+ raw/
16
+ traj_group0/
17
+ traj0/
18
+ obs_dict.pkl
19
+ policy_out.pkl
20
+ agent_data.pkl
21
+ images0/
22
+ im_0.jpg
23
+ im_1.jpg
24
+ ...
25
+ ...
26
+ ...
27
+ 01/
28
+ ...
29
+
30
+ The --depth parameter controls how much of the data to process at the
31
+ --input_path; for example, if --depth=5, then --input_path should be
32
+ "bridgedata_raw", and all data will be processed. If --depth=3, then
33
+ --input_path should be "bridgedata_raw/rss/toykitchen2", and only data
34
+ under "toykitchen2" will be processed.
35
+
36
+ The same directory structure will be replicated under --output_path. For
37
+ example, in the second case, the output will be written to
38
+ "{output_path}/set_table/00/...".
39
+
40
+ Squashes images to 128x128.
41
+
42
+ Can write directly to Google Cloud Storage, but not read from it.
43
+
44
+ Written by Kevin Black ([email protected]).
45
+ """
46
+ import copy
47
+ import glob
48
+ import os
49
+ import pickle
50
+ import random
51
+ from collections import defaultdict
52
+ from datetime import datetime
53
+ from functools import partial
54
+ from multiprocessing import Pool
55
+
56
+ import numpy as np
57
+ import tensorflow as tf
58
+ import tqdm
59
+ from absl import app, flags, logging
60
+ from PIL import Image
61
+
62
+ FLAGS = flags.FLAGS
63
+
64
+ flags.DEFINE_string("input_path", None, "Input path", required=True)
65
+ flags.DEFINE_string("output_path", None, "Output path", required=True)
66
+ flags.DEFINE_integer(
67
+ "depth",
68
+ 5,
69
+ "Number of directories deep to traverse to the dated directory. Looks for"
70
+ "{input_path}/dir_1/dir_2/.../dir_{depth-1}/2022-01-01_00-00-00/...",
71
+ )
72
+ flags.DEFINE_bool("overwrite", False, "Overwrite existing files")
73
+ flags.DEFINE_float(
74
+ "train_proportion", 0.9, "Proportion of data to use for training (rather than val)"
75
+ )
76
+ flags.DEFINE_integer("num_workers", 8, "Number of threads to use")
77
+ flags.DEFINE_integer("im_size", 128, "Image size")
78
+
79
+
80
+ def squash(path):
81
+ im = Image.open(path)
82
+ # im = im.resize((FLAGS.im_size, FLAGS.im_size), Image.Resampling.LANCZOS)
83
+ out = np.asarray(im).astype(np.uint8)
84
+ return out
85
+
86
+
87
+ def process_images(path): # processes images at a trajectory level
88
+ names = sorted(
89
+ [x for x in os.listdir(path) if "images" in x and not "depth" in x],
90
+ key=lambda x: int(x.split("images")[1]),
91
+ )
92
+ image_path = [
93
+ os.path.join(path, x)
94
+ for x in os.listdir(path)
95
+ if "images" in x and not "depth" in x
96
+ ]
97
+ image_path = sorted(image_path, key=lambda x: int(x.split("images")[1]))
98
+
99
+ images_out = defaultdict(list)
100
+ if len(image_path) == 0:
101
+ return None, None
102
+
103
+ tlen = len(glob.glob(image_path[0] + "/im_*.jpg"))
104
+
105
+ for i, name in enumerate(names):
106
+ for t in range(tlen):
107
+ images_out[name].append(squash(image_path[i] + "/im_{}.jpg".format(t)))
108
+
109
+ images_out = dict(images_out)
110
+
111
+ obs, next_obs = dict(), dict()
112
+
113
+ for n in names:
114
+ obs[n] = images_out[n][:-1]
115
+ next_obs[n] = images_out[n][1:]
116
+ return obs, next_obs
117
+
118
+
119
+ def process_state(path):
120
+ fp = os.path.join(path, "obs_dict.pkl")
121
+ with open(fp, "rb") as f:
122
+ x = pickle.load(f)
123
+ qpos = None if "qpos" not in x.keys() else x["qpos"]
124
+ qvel = None if "qvel" not in x.keys() else x["qvel"]
125
+ eef_transform = None if "eef_transform" not in x.keys() else x["eef_transform"]
126
+ return x["full_state"][:-1], x["full_state"][1:], qpos, qvel, eef_transform
127
+
128
+ def process_time(path):
129
+ fp = os.path.join(path, "obs_dict.pkl")
130
+ with open(fp, "rb") as f:
131
+ x = pickle.load(f)
132
+ return x["time_stamp"][:-1], x["time_stamp"][1:]
133
+
134
+
135
+ def process_actions(path): # gets actions
136
+ fp = os.path.join(path, "policy_out.pkl")
137
+ with open(fp, "rb") as f:
138
+ act_list = pickle.load(f)
139
+ if isinstance(act_list[0], dict):
140
+ act_list = [x["actions"] for x in act_list]
141
+ return act_list
142
+
143
+
144
+ # processes each data collection attempt
145
+ def process_dc(path, train_ratio=0.9):
146
+ # a mystery left by the greats of the past
147
+ if "lmdb" in path:
148
+ logging.warning(f"Skipping {path} because uhhhh lmdb?")
149
+ return [], [], [], []
150
+
151
+ all_dicts_train = list()
152
+ all_dicts_test = list()
153
+ all_rews_train = list()
154
+ all_rews_test = list()
155
+
156
+ # Data collected prior to 7-23 has a delay of 1, otherwise a delay of 0
157
+ date_time = datetime.strptime(path.split("/")[-1], "%Y-%m-%d_%H-%M-%S")
158
+ latency_shift = date_time < datetime(2021, 7, 23)
159
+
160
+ search_path = os.path.join(path, "raw", "traj_group*", "traj*")
161
+ all_traj = glob.glob(search_path)
162
+ if all_traj == []:
163
+ logging.info(f"no trajs found in {search_path}")
164
+ return [], [], [], []
165
+
166
+ random.shuffle(all_traj)
167
+
168
+ num_traj = len(all_traj)
169
+ for itraj, tp in tqdm.tqdm(enumerate(all_traj)):
170
+ try:
171
+ out = dict()
172
+
173
+ ld = os.listdir(tp)
174
+
175
+ assert "obs_dict.pkl" in ld, tp + ":" + str(ld)
176
+ assert "policy_out.pkl" in ld, tp + ":" + str(ld)
177
+ # assert "agent_data.pkl" in ld, tp + ":" + str(ld) # not used
178
+
179
+ obs, next_obs = process_images(tp)
180
+ if obs is None:
181
+ return
182
+ acts = process_actions(tp)
183
+ state, next_state, qpos, qvel, eef_transform = process_state(tp)
184
+ time_stamp, next_time_stamp = process_time(tp)
185
+ term = [0] * len(acts)
186
+ if "lang.txt" in ld:
187
+ with open(os.path.join(tp, "lang.txt")) as f:
188
+ lang = list(f)
189
+ lang = [l.strip() for l in lang if "confidence" not in l]
190
+ else:
191
+ # empty string is a placeholder for data with no language label
192
+ lang = [""]
193
+
194
+ out["observations"] = obs
195
+ out["observations"]["state"] = state
196
+ out["observations"]["time_stamp"] = time_stamp
197
+ if qpos is not None:
198
+ out["observations"]["qpos"] = qpos
199
+ else:
200
+ return None, None, None, None
201
+ if qvel is not None:
202
+ out["observations"]["qvel"] = qvel
203
+ if eef_transform is not None:
204
+ out["observations"]["eef_transform"] = eef_transform
205
+ out["next_observations"] = next_obs
206
+ out["next_observations"]["state"] = next_state
207
+ out["next_observations"]["time_stamp"] = next_time_stamp
208
+
209
+
210
+ out["observations"] = [
211
+ dict(zip(out["observations"], t))
212
+ for t in zip(*out["observations"].values())
213
+ ]
214
+ out["next_observations"] = [
215
+ dict(zip(out["next_observations"], t))
216
+ for t in zip(*out["next_observations"].values())
217
+ ]
218
+
219
+ out["actions"] = acts
220
+ out["terminals"] = term
221
+ out["language"] = lang
222
+
223
+ # shift the actions according to camera latency
224
+ if latency_shift:
225
+ out["observations"] = out["observations"][1:]
226
+ out["next_observations"] = out["next_observations"][1:]
227
+ out["actions"] = out["actions"][:-1]
228
+ out["terminals"] = term[:-1]
229
+
230
+ labeled_rew = copy.deepcopy(out["terminals"])[:]
231
+ labeled_rew[-2:] = [1, 1]
232
+
233
+ traj_len = len(out["observations"])
234
+ assert len(out["next_observations"]) == traj_len
235
+ assert len(out["actions"]) == traj_len
236
+ assert len(out["terminals"]) == traj_len
237
+ assert len(labeled_rew) == traj_len
238
+
239
+ if itraj < int(num_traj * train_ratio):
240
+ all_dicts_train.append(out)
241
+ all_rews_train.append(labeled_rew)
242
+ else:
243
+ all_dicts_test.append(out)
244
+ all_rews_test.append(labeled_rew)
245
+ except FileNotFoundError as e:
246
+ logging.error(e)
247
+ continue
248
+ except AssertionError as e:
249
+ logging.error(e)
250
+ continue
251
+
252
+ return all_dicts_train, all_dicts_test, all_rews_train, all_rews_test
253
+
254
+
255
+ def make_numpy(path, train_proportion):
256
+ dirname = os.path.abspath(path)
257
+ outpath = os.path.join(
258
+ FLAGS.output_path, *dirname.split(os.sep)[-(max(FLAGS.depth - 1, 1)) :]
259
+ )
260
+
261
+ if os.path.exists(outpath):
262
+ if FLAGS.overwrite:
263
+ logging.info(f"Deleting {outpath}")
264
+ tf.io.gfile.rmtree(outpath)
265
+ else:
266
+ logging.info(f"Skipping {outpath}")
267
+ return
268
+
269
+ outpath_train = tf.io.gfile.join(outpath, "train")
270
+ outpath_val = tf.io.gfile.join(outpath, "val")
271
+ tf.io.gfile.makedirs(outpath_train)
272
+ tf.io.gfile.makedirs(outpath_val)
273
+
274
+ lst_train = []
275
+ lst_val = []
276
+ rew_train_l = []
277
+ rew_val_l = []
278
+
279
+ for dated_folder in os.listdir(path):
280
+ curr_train, curr_val, rew_train, rew_val = process_dc(
281
+ os.path.join(path, dated_folder), train_ratio=train_proportion
282
+ )
283
+ if curr_train is None:
284
+ continue
285
+ lst_train.extend(curr_train)
286
+ lst_val.extend(curr_val)
287
+ rew_train_l.extend(rew_train)
288
+ rew_val_l.extend(rew_val)
289
+
290
+ if len(lst_train) == 0 or len(lst_val) == 0:
291
+ return
292
+
293
+ with tf.io.gfile.GFile(tf.io.gfile.join(outpath_train, "out.npy"), "wb") as f:
294
+ np.save(f, lst_train)
295
+ with tf.io.gfile.GFile(tf.io.gfile.join(outpath_val, "out.npy"), "wb") as f:
296
+ np.save(f, lst_val)
297
+
298
+ # doesn't seem like these are ever used anymore
299
+ # np.save(os.path.join(outpath_train, "out_rew.npy"), rew_train_l)
300
+ # np.save(os.path.join(outpath_val, "out_rew.npy"), rew_val_l)
301
+
302
+
303
+ def main(_):
304
+ assert FLAGS.depth >= 1
305
+
306
+ # each path is a directory that contains dated directories
307
+ paths = glob.glob(os.path.join(FLAGS.input_path, *("*" * (FLAGS.depth - 1))))
308
+
309
+ worker_fn = partial(make_numpy, train_proportion=FLAGS.train_proportion)
310
+
311
+ with Pool(FLAGS.num_workers) as p:
312
+ list(tqdm.tqdm(p.imap(worker_fn, paths), total=len(paths)))
313
+
314
+
315
+ if __name__ == "__main__":
316
+ app.run(main)
data/bridgev2/download.sh ADDED
@@ -0,0 +1,13 @@
+# Download the dataset to ../datasets/bridgev2
+mkdir -p ../datasets/bridgev2
+wget -O ../datasets/bridgev2/demos_8_17.zip https://rail.eecs.berkeley.edu/datasets/bridge_release/data/demos_8_17.zip
+mkdir -p ../datasets/bridgev2/raw
+# Unzip the dataset
+unzip '../datasets/bridgev2/*.zip' -d ../datasets/bridgev2/raw
+# Convert the dataset to numpy
+python bridgedata_raw_to_numpy.py --input ../datasets/bridgev2/raw --output ../datasets/bridgev2/npy
+# Convert the dataset to tfrecords
+python bridgedata_numpy_to_tfrecord.py --input ../datasets/bridgev2/npy --output ../datasets/bridgev2/tfrecords
+# Remove the raw data and numpy data
+rm -rf ../datasets/bridgev2/raw
+rm -rf ../datasets/bridgev2/npy
data/calvin/download.sh ADDED
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+echo "Downloading CALVIN dataset..."
+
+# Create the calvin folder in ../datasets/calvin/
+mkdir -p ../datasets/calvin/
+
+cd ../datasets/calvin/
+
+# You can use this for faster downloading
+# aria2c -x 16 -s 16 http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip
+
+wget http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip
+
+echo "Unzipping CALVIN dataset..."
+
+unzip task_ABC_D.zip
+
+echo "Done downloading and unzipping CALVIN dataset."
data/calvin/hdf5totfrecords.py ADDED
@@ -0,0 +1,92 @@
+import tensorflow as tf
+import os
+import numpy as np
+from tqdm import tqdm
+
+
+def _bytes_feature(value):
+    """Returns a bytes_list from a string / byte."""
+    if isinstance(value, type(tf.constant(0))):
+        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+
+def _bool_feature(value):
+    """Returns a bool_list from a boolean."""
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
+
+
+def serialize_example(action, robot_obs, rgb_static, rgb_gripper, instruction, terminate_episode):
+    # Features for fixed-length fields
+    feature = {
+        'action': _bytes_feature(tf.io.serialize_tensor(action)),
+        'robot_obs': _bytes_feature(tf.io.serialize_tensor(robot_obs)),
+        'rgb_static': _bytes_feature(tf.io.serialize_tensor(rgb_static)),
+        'rgb_gripper': _bytes_feature(tf.io.serialize_tensor(rgb_gripper)),
+        'terminate_episode': _bool_feature(terminate_episode),
+        'instruction': _bytes_feature(instruction),
+    }
+
+    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
+    return example_proto.SerializeToString()
+
+
+def write_tfrecords(root_dir, out_dir):
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    # Get the language annotations and the corresponding episode indices
+    f = np.load(os.path.join(root_dir, "lang_annotations/auto_lang_ann.npy"), allow_pickle=True)
+    lang = f.item()['language']['ann']
+    lang = np.array([x.encode('utf-8') for x in lang])
+    lang_start_end_idx = f.item()['info']['indx']
+    num_ep = len(lang_start_end_idx)
+
+    with tqdm(total=num_ep) as pbar:
+        for episode_idx, (start_idx, end_idx) in enumerate(lang_start_end_idx):
+            pbar.update(1)
+
+            step_files = [
+                f"episode_{str(i).zfill(7)}.npz"
+                for i in range(start_idx, end_idx + 1)
+            ]
+            action = []
+            robot_obs = []
+            rgb_static = []
+            rgb_gripper = []
+            instr = lang[episode_idx]
+            for step_file in step_files:
+                filepath = os.path.join(root_dir, step_file)
+                f = np.load(filepath)
+                # Get the relevant fields
+                action.append(f['actions'])
+                robot_obs.append(f['robot_obs'])
+                rgb_static.append(f['rgb_static'])
+                rgb_gripper.append(f['rgb_gripper'])
+
+            tfrecord_path = os.path.join(out_dir, f'{episode_idx:07d}.tfrecord')
+            print(f"Writing TFRecords to {tfrecord_path}")
+            with tf.io.TFRecordWriter(tfrecord_path) as writer:
+                for i in range(len(step_files)):
+                    serialized_example = serialize_example(
+                        action[i], robot_obs[i], rgb_static[i], rgb_gripper[i], instr, i == len(step_files) - 1
+                    )
+                    writer.write(serialized_example)
+
+output_dirs = [
+    '../datasets/calvin/tfrecords/training',
+    '../datasets/calvin/tfrecords/validation'
+]
+
+for output_dir in output_dirs:
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+root_dirs = [
+    '../datasets/calvin/task_ABC_D/training',
+    '../datasets/calvin/task_ABC_D/validation'
+]
+
+for root_dir, output_dir in zip(root_dirs, output_dirs):
+    print(f"Writing TFRecords to {output_dir}")
+    write_tfrecords(root_dir, output_dir)
data/rh20t/hdf5totfrecords.py ADDED
@@ -0,0 +1,200 @@
1
+ import numpy as np
2
+ import os
3
+ import cv2
4
+ from multiprocessing import Pool, cpu_count, current_process
5
+ import tensorflow as tf
6
+ from tqdm import tqdm
7
+ import json
8
+
9
+ def _parse_function(proto):
10
+ # Define how to parse the data here.
11
+ feature_description = {
12
+ 'joint': tf.io.FixedLenFeature([], tf.string),
13
+ 'image': tf.io.FixedLenFeature([], tf.string),
14
+ 'instruction': tf.io.FixedLenFeature([], tf.string),
15
+ 'terminate_episode': tf.io.FixedLenFeature([], tf.int64),
16
+ 'gripper': tf.io.FixedLenFeature([], tf.string, default_value=""),
17
+ 'tcp': tf.io.FixedLenFeature([], tf.string, default_value=""),
18
+ 'tcp_base': tf.io.FixedLenFeature([], tf.string, default_value="")
19
+ }
20
+ parsed_features = tf.io.parse_single_example(proto, feature_description)
21
+ # Parse tensors
22
+ parsed_features['joint'] = tf.io.parse_tensor(parsed_features['joint'], out_type=tf.float64)
23
+ parsed_features['image'] = tf.io.parse_tensor(parsed_features['image'], out_type=tf.uint8)
24
+ parsed_features['instruction'] = tf.io.parse_tensor(parsed_features['instruction'], out_type=tf.string)
25
+ parsed_features['gripper'] = tf.cond(
26
+ tf.math.equal(parsed_features['gripper'], ""),
27
+ lambda: tf.constant([], dtype=tf.float64),
28
+ lambda: tf.io.parse_tensor(parsed_features['gripper'], out_type=tf.float64)
29
+ )
30
+ parsed_features['tcp'] = tf.cond(
31
+ tf.math.equal(parsed_features['tcp'], ""),
32
+ lambda: tf.constant([], dtype=tf.float64),
33
+ lambda: tf.io.parse_tensor(parsed_features['tcp'], out_type=tf.float64)
34
+ )
35
+ parsed_features['tcp_base'] = tf.cond(
36
+ tf.math.equal(parsed_features['tcp_base'], ""),
37
+ lambda: tf.constant([], dtype=tf.float64),
38
+ lambda: tf.io.parse_tensor(parsed_features['tcp_base'], out_type=tf.float64)
39
+ )
40
+ return parsed_features
41
+
42
+ def convert_color(color_file, color_timestamps):
43
+     """
+     Read every frame from the color video, resize it to 640x360,
+     and return the frames as a list of BGR arrays.
+ 
+     Args:
+     - color_file: path to the color video file;
+     - color_timestamps: the color timestamps (kept for interface compatibility; unused here).
+     """
49
+ cap = cv2.VideoCapture(color_file)
50
+ cnt = 0
51
+ frames = []
52
+ while True:
53
+ ret, frame = cap.read()
54
+ if ret:
55
+ resized_frame = cv2.resize(frame, (640, 360))
56
+ frames.append(resized_frame)
57
+ cnt += 1
58
+ else:
59
+ break
60
+ cap.release()
61
+ return frames
62
+
63
+ def _bytes_feature(value):
64
+ if isinstance(value, type(tf.constant(0))):
65
+ value = value.numpy()
66
+ return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
67
+
68
+ def _bool_feature(value):
69
+ return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
70
+
71
+ def serialize_example(joint,gripper,tcp,tcp_base,image,instruction,terminate_episode):
72
+ feature = {
73
+ 'joint': _bytes_feature(tf.io.serialize_tensor(joint)),
74
+ 'image': _bytes_feature(tf.io.serialize_tensor(image)),
75
+ 'instruction': _bytes_feature(tf.io.serialize_tensor(instruction)),
76
+ 'terminate_episode': _bool_feature(terminate_episode),
77
+ }
78
+ if gripper is not None:
79
+ feature['gripper'] = _bytes_feature(tf.io.serialize_tensor(gripper))
80
+ if tcp is not None:
81
+ feature['tcp'] = _bytes_feature(tf.io.serialize_tensor(tcp))
82
+ if tcp_base is not None:
83
+ feature['tcp_base'] = _bytes_feature(tf.io.serialize_tensor(tcp_base))
84
+ example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
85
+ return example_proto.SerializeToString()
86
+
87
+ def compress_tfrecord(tfrecord_path):
+     # Materialize all parsed examples BEFORE re-opening the file for writing;
+     # reading and overwriting the same TFRecord simultaneously would corrupt it.
+     parsed_examples = list(tf.data.TFRecordDataset(tfrecord_path).map(_parse_function))
+     if len(parsed_examples) == 0:
+         return
+     if len(parsed_examples[0]['image'].numpy().shape) <= 1:  # already compressed
+         return
+ 
+     # Write to a temporary file first, then atomically replace the original.
+     tmp_path = tfrecord_path + '.tmp'
+     with tf.io.TFRecordWriter(tmp_path) as writer:
+         for features in parsed_examples:
+             _, compressed_image = cv2.imencode('.jpg', features['image'].numpy())
+             compressed_bytes = tf.convert_to_tensor(compressed_image.tobytes(), dtype=tf.string)
+ 
+             feature_dict = {
+                 'joint': _bytes_feature(tf.io.serialize_tensor(features['joint'])),
+                 'image': _bytes_feature(tf.io.serialize_tensor(compressed_bytes)),
+                 'instruction': _bytes_feature(tf.io.serialize_tensor(features['instruction'])),
+                 'terminate_episode': tf.train.Feature(
+                     int64_list=tf.train.Int64List(value=[int(features['terminate_episode'])])),
+                 'gripper': _bytes_feature(tf.io.serialize_tensor(features['gripper'])),
+                 'tcp': _bytes_feature(tf.io.serialize_tensor(features['tcp'])),
+                 'tcp_base': _bytes_feature(tf.io.serialize_tensor(features['tcp_base']))
+             }
+             example_proto = tf.train.Example(features=tf.train.Features(feature=feature_dict))
+             writer.write(example_proto.SerializeToString())
+     os.replace(tmp_path, tfrecord_path)
+     print(f"compressed {tfrecord_path}")
120
+
121
+ def write_task(args):
122
+ task_dir,output_dir = args
123
+
124
+ all_instructions = json.load(open('./instruction.json'))
125
+ instruction = None
126
+ for taskid in list(all_instructions.keys()):
127
+ if taskid in task_dir:
128
+ instruction = all_instructions[taskid]['task_description_english']
129
+ if instruction is None:
130
+ return
131
+
132
+ if not os.path.exists(output_dir):
133
+ os.makedirs(output_dir)
134
+ joints = np.load(os.path.join(task_dir,"transformed/joint.npy"),allow_pickle=True).item()
135
+ if not os.path.exists(os.path.join(task_dir,"transformed/gripper.npy")):
136
+ return
137
+ grippers = np.load(os.path.join(task_dir,"transformed/gripper.npy"),allow_pickle=True).item()
138
+ tcps = np.load(os.path.join(task_dir,"transformed/tcp.npy"),allow_pickle=True).item()
139
+ tcp_bases = np.load(os.path.join(task_dir,"transformed/tcp_base.npy"),allow_pickle=True).item()
140
+
141
+ for camid in joints.keys():
142
+ timesteps = joints[camid]
143
+ if len(timesteps) == 0:
144
+ continue
145
+ tfrecord_path = os.path.join(output_dir,f'cam_{camid}.tfrecord')
146
+ timesteps_file = os.path.join(task_dir,f'cam_{camid}/timestamps.npy')
147
+
148
+ if not os.path.exists(timesteps_file):
149
+ continue
150
+ if os.path.exists(tfrecord_path) and os.path.getsize(tfrecord_path) > 0:
151
+ continue
152
+
153
+ timesteps_file = np.load(timesteps_file,allow_pickle=True).item()
154
+ images = convert_color(os.path.join(task_dir,f'cam_{camid}/color.mp4'),timesteps_file['color'])
155
+ if len(timesteps) != len(images):  # known RH20T issue: timestamp and frame counts can mismatch; skip this camera
156
+ continue
157
+ with tf.io.TFRecordWriter(tfrecord_path) as writer:
158
+ for i,timestep in enumerate(timesteps):
159
+ # image = cv2.imread(os.path.join(img_dir,f"{timestep}.jpg"))
160
+ image = cv2.imencode('.jpg', images[i])[1].tobytes()
161
+ joint_pos = joints[camid][timestep]
162
+ tcp_item = next((item for item in tcps[camid] if item['timestamp'] == timestep), None)
+ tcp_base_item = next((item for item in tcp_bases[camid] if item['timestamp'] == timestep), None)
+ if tcp_item is None or tcp_base_item is None:
+     continue  # skip frames whose TCP record is missing
+ tcp = tcp_item['tcp']
+ tcp_base = tcp_base_item['tcp']
164
+ if timestep not in grippers[camid]:
165
+ gripper_pos = None
166
+ else:
167
+ gripper_pos = grippers[camid][timestep]['gripper_info']
168
+ terminate_episode = i == len(timesteps) - 1
169
+ # read from instruction.json
170
+ serialized_example = serialize_example(joint_pos,gripper_pos,tcp,tcp_base,image,instruction,terminate_episode)
171
+ writer.write(serialized_example)
172
+
173
+
174
+ def write_tfrecords(root_dir,output_dir,num_processes = None):
175
+ if not os.path.exists(output_dir):
176
+ os.makedirs(output_dir)
177
+ if num_processes is None:
178
+ num_processes = cpu_count()
179
+
180
+ num_files = 0
181
+ args = []
182
+ for dirs in os.listdir(root_dir):
183
+ for task in os.listdir(os.path.join(root_dir,dirs)):
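+ # Skip human-demonstration folders; only robot data is converted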
184
+ if 'human' in task:
185
+ continue
186
+ task_dir = os.path.join(root_dir,dirs,task)
187
+ joint_path = os.path.join(task_dir,"transformed/joint.npy")
188
+ if not os.path.exists(joint_path):
189
+ continue
190
+ num_files += 1
191
+ task_out = os.path.join(output_dir,dirs,task)
192
+ os.makedirs(task_out,exist_ok=True)
193
+ args.append((task_dir,task_out))
194
+
195
+ with tqdm(total=num_files, desc="Processing files") as pbar:
196
+ with Pool(num_processes) as pool:
197
+ for _ in pool.imap_unordered(write_task, args):
198
+ pbar.update(1)
199
+
200
+ write_tfrecords('../datasets/rh20t/raw_data/','../datasets/rh20t/tfrecords/')
data/roboset/download.py ADDED
@@ -0,0 +1,42 @@
1
+ import requests
2
+ import os
+ import sys
3
+ from tqdm import tqdm
4
+
5
+ links = []
6
+ with open('links.txt', 'r', encoding='utf-8') as file:
7
+ for line in file:
8
+ links.append(line.strip())
9
+
10
+ download_dir = "../datasets/roboset"
11
+ os.makedirs(download_dir, exist_ok=True)
12
+
13
+ for link in links:
14
+ filename = os.path.basename(link)
15
+ filepath = os.path.join(download_dir, filename)
16
+ print(f"Downloading {filename} from {link}")
17
+
18
+ response = requests.get(link, stream=True)
19
+ total_size_in_bytes = int(response.headers.get('content-length', 0))
20
+ block_size = 1024
21
+
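+ # Skip files that already exist locally with the expected size (crude resume support)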
22
+ if os.path.exists(filepath):
23
+ local_size = os.path.getsize(filepath)
24
+ if local_size == total_size_in_bytes:
25
+ print(f"{filename} already exists")
26
+ continue
27
+
28
+ progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
29
+
30
+ with open(filepath, 'wb') as f:
31
+ for data in response.iter_content(block_size):
32
+ progress_bar.update(len(data))
33
+ f.write(data)
34
+
35
+ progress_bar.close()
36
+
37
+ if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+     print(f"ERROR: incomplete download of {filename}")
+     sys.exit(1)  # exit non-zero so the wrapper script (download.sh) retries
39
+
40
+ print(f"Downloaded {filename}")
41
+
42
+ print("All files processed.")
data/roboset/download.sh ADDED
@@ -0,0 +1,21 @@
1
+ #!/bin/bash
2
+
3
+ while true; do
4
+ python download.py
5
+ EXIT_CODE=$?
6
+ if [ $EXIT_CODE -ne 0 ]; then
7
+ echo "Download exited with code $EXIT_CODE. Restarting..."
8
+ else
9
+ echo "Download exited with code 0. Not restarting."
10
+ break
11
+ fi
12
+ done
13
+
14
+ # Unzip all the files in the ../datasets/roboset/ directory
15
+ cd ../datasets/roboset/
16
+ for file in *.tar.gz; do
17
+ tar -xzvf "$file"
18
+ done
19
+
20
+ ## Convert the dataset to TFRecords
+ # (return to data/roboset first so the conversion script's relative paths resolve correctly)
+ cd -
+ python h5totfrecords.py
data/roboset/h5totfrecords.py ADDED
@@ -0,0 +1,82 @@
1
+ import tensorflow as tf
2
+ import h5py
3
+ import os
4
+ import fnmatch
5
+ from tqdm import tqdm
6
+ from multiprocessing import Pool, cpu_count, current_process
7
+
8
+ def _bytes_feature(value):
9
+ if isinstance(value, type(tf.constant(0))):
10
+ value = value.numpy()
11
+ return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
12
+
13
+ def _bool_feature(value):
14
+ return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
15
+
16
+ def serialize_example(action, action_gripper, qpos, qvel, qpos_gripper, qvel_gripper, rgb_left, rgb_right, rgb_top, terminate_episode):
17
+ feature = {
18
+ 'action': _bytes_feature(tf.io.serialize_tensor(action)),
19
+ 'action_gripper': _bytes_feature(tf.io.serialize_tensor(action_gripper)),
20
+ 'qpos': _bytes_feature(tf.io.serialize_tensor(qpos)),
21
+ 'qvel': _bytes_feature(tf.io.serialize_tensor(qvel)),
22
+ 'qpos_gripper': _bytes_feature(tf.io.serialize_tensor(qpos_gripper)),
23
+ 'qvel_gripper': _bytes_feature(tf.io.serialize_tensor(qvel_gripper)),
24
+ 'rgb_left': _bytes_feature(tf.io.serialize_tensor(rgb_left)),
25
+ 'rgb_right': _bytes_feature(tf.io.serialize_tensor(rgb_right)),
26
+ 'rgb_top': _bytes_feature(tf.io.serialize_tensor(rgb_top)),
27
+ 'terminate_episode': _bool_feature(terminate_episode),
28
+ }
29
+ example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
30
+ return example_proto.SerializeToString()
31
+
32
+ def process_file(params):
33
+ filepath, output_dir = params
34
+ with h5py.File(filepath, 'r') as f:
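+ # Each RoboSet .h5 file contains multiple trials; each trial is written to its own TFRecord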
35
+ for Trial in f.keys():
36
+ data = f[Trial]['data']
37
+ tfrecord_path = os.path.join(output_dir, os.path.basename(filepath).replace('.h5', f'_{Trial}.tfrecord'))
38
+ if os.path.exists(tfrecord_path) and os.path.getsize(tfrecord_path) > 0:
39
+ continue
40
+ with tf.io.TFRecordWriter(tfrecord_path) as writer:
41
+ num_episodes = data['ctrl_arm'].shape[0]
42
+ for i in range(num_episodes):
43
+ action = data['ctrl_arm'][i]
44
+ action_gripper = data['ctrl_ee'][i]
45
+ qpos = data['qp_arm'][i]
46
+ qvel = data['qv_arm'][i]
47
+ qpos_gripper = data['qp_ee'][i]
48
+ qvel_gripper = data['qv_ee'][i]
49
+ rgb_left = data['rgb_left'][i]
50
+ rgb_right = data['rgb_right'][i]
51
+ rgb_top = data['rgb_top'][i]
52
+ terminate_episode = i == num_episodes - 1
53
+ serialized_example = serialize_example(action, action_gripper, qpos, qvel, qpos_gripper, qvel_gripper, rgb_left, rgb_right, rgb_top, terminate_episode)
54
+ writer.write(serialized_example)
55
+
56
+ def write_tfrecords(root_dir, out_dir, num_processes=None):
57
+ if not os.path.exists(out_dir):
58
+ os.makedirs(out_dir)
59
+
60
+ if num_processes is None:
61
+ num_processes = cpu_count()
62
+
63
+ file_list = []
64
+ num_files = 0
65
+ for root, dirs, files in os.walk(root_dir):
66
+ for filename in fnmatch.filter(files, '*.h5'):
67
+ filepath = os.path.join(root, filename)
68
+ output_dir = os.path.join(out_dir, os.path.relpath(os.path.dirname(filepath), root_dir))
69
+ if not os.path.exists(output_dir):
70
+ os.makedirs(output_dir)
71
+ num_files += 1
72
+ file_list.append((filepath, output_dir))
73
+
74
+ with tqdm(total=num_files, desc="Processing files") as pbar:
75
+ with Pool(num_processes) as pool:
76
+ for _ in pool.imap_unordered(process_file, file_list):
77
+ pbar.update(1)
78
+
79
+ root_dir = '../datasets/roboset/'
80
+ output_dir = '../datasets/roboset/tfrecords/'
81
+
82
+ write_tfrecords(root_dir, output_dir)
data/roboset/links.txt ADDED
@@ -0,0 +1,197 @@
1
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_1_Blocks_895/AutonomousRoboSet_Set_1_Blocks_895.tar.gz
2
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_0.tar.gz
3
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_1.tar.gz
4
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_2.tar.gz
5
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_3.tar.gz
6
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_4.tar.gz
7
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_5.tar.gz
8
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_6.tar.gz
9
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_7.tar.gz
10
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_8.tar.gz
11
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_9.tar.gz
12
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_10.tar.gz
13
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_11.tar.gz
14
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_12.tar.gz
15
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_2_SoftToys_12585/Autonomous_RoboSet_Set_2_SoftToys_839_13.tar.gz
16
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_3_Blocks_and_Toys_980/Autonomous_RoboSet_Set_3_Blocks_and_Toys_980.tar.gz
17
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_4_Medium_Block_7/Autonomous_RoboSet_Set_4_Medium_Block_7.tar.gz
18
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_5_Bottle_Cube_14/Autonomous_RoboSet_Set_5_Bottle_Cube_14.tar.gz
19
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_6_Planar_Push_120/Autonomous_RoboSet_Set_6_Planar_Push_120.tar.gz
20
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607_0.tar.gz
21
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607_1.tar.gz
22
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607_2.tar.gz
23
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607_3.tar.gz
24
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607/Autonomous_RoboSet_Set_7_Pick_Orange_Block_607_4.tar.gz
25
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_8_Pick_Bottle_10/Autonomous_RoboSet_Set_8_Pick_Bottle_10.tar.gz
26
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_0.tar.gz
27
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_1.tar.gz
28
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_2.tar.gz
29
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_3.tar.gz
30
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_4.tar.gz
31
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_5.tar.gz
32
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_6.tar.gz
33
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_7.tar.gz
34
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_8.tar.gz
35
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_5420/Autonomous_RoboSet_Set_9_Pick_Wooden_Block_542_9.tar.gz
36
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_0.tar.gz
37
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_1.tar.gz
38
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_2.tar.gz
39
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_3.tar.gz
40
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_4.tar.gz
41
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_5.tar.gz
42
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_6.tar.gz
43
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_7.tar.gz
44
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_8.tar.gz
45
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_9.tar.gz
46
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_10_Pick_Block_Eval_1837/Autonomous_RoboSet_Set_10_Pick_Block_Eval_167_10.tar.gz
47
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_0.tar.gz
48
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_1.tar.gz
49
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_2.tar.gz
50
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_3.tar.gz
51
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_4.tar.gz
52
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_5.tar.gz
53
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_6.tar.gz
54
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_7.tar.gz
55
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_8.tar.gz
56
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_2070/Autonomous_RoboSet_Set_11_Pick_Bottle_Eval_207_9.tar.gz
57
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_0.tar.gz
58
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_1.tar.gz
59
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_2.tar.gz
60
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_3.tar.gz
61
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_4.tar.gz
62
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_5.tar.gz
63
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_6.tar.gz
64
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_7.tar.gz
65
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_8.tar.gz
66
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_12_Pick_Block_Eval_2000/Autonomous_RoboSet_Set_12_Pick_Block_Eval_200_9.tar.gz
67
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_0.tar.gz
68
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_1.tar.gz
69
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_2.tar.gz
70
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_3.tar.gz
71
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_4.tar.gz
72
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_5.tar.gz
73
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_6.tar.gz
74
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_7.tar.gz
75
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_8.tar.gz
76
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_9.tar.gz
77
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_10.tar.gz
78
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_11.tar.gz
79
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_12.tar.gz
80
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_13.tar.gz
81
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_4170/Autonomous_RoboSet_Set_13_Bin_Reorient_Eval_278_14.tar.gz
82
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_0.tar.gz
83
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_1.tar.gz
84
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_2.tar.gz
85
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_3.tar.gz
86
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_4.tar.gz
87
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_5.tar.gz
88
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_6.tar.gz
89
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_7.tar.gz
90
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_8.tar.gz
91
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_9.tar.gz
92
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_10.tar.gz
93
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_11.tar.gz
94
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_12.tar.gz
95
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_13.tar.gz
96
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_14.tar.gz
97
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_15.tar.gz
98
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_16.tar.gz
99
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_17.tar.gz
100
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_18.tar.gz
101
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_14_Bin_Push_Eval_11300/Autonomous_RoboSet_Set_14_Bin_Push_Eval_565_19.tar.gz
102
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_0.tar.gz
103
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_1.tar.gz
104
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_2.tar.gz
105
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_3.tar.gz
106
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_4.tar.gz
107
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_5.tar.gz
108
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_6.tar.gz
109
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_7.tar.gz
110
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_8.tar.gz
111
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_1470/Autonomous_RoboSet_Set_15_Bin_Reorient_Eval_2_147_9.tar.gz
112
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_0.tar.gz
113
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_1.tar.gz
114
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_2.tar.gz
115
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_3.tar.gz
116
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_4.tar.gz
117
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_5.tar.gz
118
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_6.tar.gz
119
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_7.tar.gz
120
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_8.tar.gz
121
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_9.tar.gz
122
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_10.tar.gz
123
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_11.tar.gz
124
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_12.tar.gz
125
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_13.tar.gz
126
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_2115/Autonomous_RoboSet_Set_16_Bin_Reorient_Eval_3_141_14.tar.gz
127
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_0.tar.gz
128
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_1.tar.gz
129
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_2.tar.gz
130
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_3.tar.gz
131
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_4.tar.gz
132
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_5.tar.gz
133
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_6.tar.gz
134
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_7.tar.gz
135
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_8.tar.gz
136
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_9.tar.gz
137
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_10.tar.gz
138
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_11.tar.gz
139
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_12.tar.gz
140
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_13.tar.gz
141
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_14.tar.gz
142
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_15.tar.gz
143
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_16.tar.gz
144
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_17.tar.gz
145
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_18.tar.gz
146
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_19.tar.gz
147
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_20.tar.gz
148
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_21.tar.gz
149
+ https://dl.fbaipublicfiles.com/RoboSet/AutonomousSet/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_10465/Autonomous_RoboSet_Set_17_Plannar_Push_Eval_455_22.tar.gz
150
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_place_in_mug.tar.gz
151
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_place_in_strainer.tar.gz
152
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_from_plate_place_on_table.tar.gz
153
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_from_toaster_place_on_table.tar.gz
154
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_place_on_plate.tar.gz
155
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_banana_place_on_toaster.tar.gz
156
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_place_in_strainer.tar.gz
157
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_place_in_toaster.tar.gz
158
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_from_strainer_place_on_table.tar.gz
159
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_from_plate_place_on_table.tar.gz
160
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_from_toaster_place_on_table.tar.gz
161
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_place_on_plate.tar.gz
162
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/pick_ketchup_place_on_toaster.tar.gz
163
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_mug_backward.tar.gz
164
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_mug_forward.tar.gz
165
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_mug_from_left_to_right.tar.gz
166
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_mug_from_right_to_left.tar.gz
167
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_strainer_backward.tar.gz
168
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_strainer_forward.tar.gz
169
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_strainer_left_to_right.tar.gz
170
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/drag_strainer_right_to_left.tar.gz
171
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/flap_open_toaster_oven.tar.gz
172
+ https://dl.fbaipublicfiles.com/RoboSet/KinestheticSet/Activities/flap_close_toaster_oven.tar.gz
173
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_slide_open_drawer_scene_1.tar.gz
174
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_slide_open_drawer_scene_4.tar.gz
175
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_pick_butter_scene_1.tar.gz
176
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_pick_butter_scene_4.tar.gz
177
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_place_butter_scene_1.tar.gz
178
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_place_butter_scene_4.tar.gz
179
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_slide_close_drawer_scene_1.tar.gz
180
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/baking_prep/baking_prep_slide_close_drawer_scene_4.tar.gz
181
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_pick_lid_scene_3.tar.gz
182
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_cap_lid_scene_3.tar.gz
183
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_slide_close_drawer_scene_3.tar.gz
184
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_flap_close_oven_Scene_3.tar.gz
185
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_pick_towel_scene_3.tar.gz
186
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/clean_kitchen/clean_kitchen_Wipe_Counter_Scene_3.tar.gz
187
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_flap_open_oven_Scene_2.tar.gz
188
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_flap_open_oven_Scene_4.tar.gz
189
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_pick_bowl_scene_2.tar.gz
190
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_pick_bowl_scene_4.tar.gz
191
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_slide_in_bowl_scene_2.tar.gz
192
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_slide_in_bowl_scene_4.tar.gz
193
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/heat_soup/heat_soup_flap_close_oven_scene_2.tar.gz
194
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/make_tea/make_tea_Uncap_Lid_Scene_2.tar.gz
195
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/make_tea/make_tea_place_lid_scene_2.tar.gz
196
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/make_tea/make_tea_pick_tea_scene_2.tar.gz
197
+ http://dl.fbaipublicfiles.com/RoboSet/TeleoperationSet/Activities/make_tea/make_tea_place_tea_scene_2.tar.gz
docs/pretrain.md ADDED
@@ -0,0 +1,270 @@
1
+ # Pipeline of Pre-Training RDT
2
+
3
+ Firstly, you need to install the prerequisites for RDT (see [README](../README.md#installation)). Then, you can install the prerequisites for TensorFlow Dataset (in another Conda environment).
4
+
5
+ ## Installation for TensorFlow Dataset
6
+
7
+ ```bash
8
+ # Under the root directory of this repo
9
+ conda create -n rdt-data python=3.10
10
+ conda activate rdt-data
11
+
12
+ # Install all the prerequisites
13
+ pip install -r requirements_data.txt
14
+ # Or you can manually install each package (please refer to requirements_data.txt for specific versions)
15
+ pip install tfds-nightly gsutil tensorflow Pillow pyyaml opencv-python tensorflow-graphics imageio[ffmpeg]
16
+ # If the download is too slow, you can specify an alternative index (tfds-nightly is not available in the Tsinghua mirror)
17
+ pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gsutil tensorflow Pillow pyyaml opencv-python tensorflow-graphics imageio[ffmpeg]
18
+ ```
19
+
20
+ ## Download and Prepare Datasets
21
+
22
+ Below we describe how to download each of our pre-training datasets. If you plan to pre-train on a subset of them, just download the ones you need. You can also fine-tune RDT through this pipeline, but only if your target dataset is listed below or available in Google Cloud Storage.
23
+
24
+ | Dataset | Sample Percentage (%) |
25
+ | ---- | ---- |
26
+ | RT-1 Dataset | 9.00 |
27
+ | TACO Dataset | 1.99 |
28
+ | JACO Play Dataset | 1.10 |
29
+ | Cable Routing Dataset | 0.27 |
30
+ | NYU Door Opening | 0.33 |
31
+ | Viola | 0.40 |
32
+ | Berkeley UR5 | 1.06 |
33
+ | TOTO | 1.06 |
34
+ | Kuka | 1.66 |
35
+ | Language Table | 3.32 |
36
+ | Columbia Cairlab Pusht Real | 0.40 |
37
+ | Stanford Kuka Multimodal Dataset | 1.83 |
38
+ | Stanford Hydra Dataset | 0.80 |
39
+ | Austin Buds Dataset | 0.23 |
40
+ | Maniskill Dataset | 5.78 |
41
+ | Furniture Bench Dataset | 2.36 |
42
+ | UCSD Kitchen Dataset | 0.40 |
43
+ | UCSD Pick And Place Dataset | 1.23 |
44
+ | Austin Sailor Dataset | 0.50 |
45
+ | Austin Sirius Dataset | 0.80 |
46
+ | BC Z | 6.91 |
47
+ | UTokyo PR2 Opening Fridge | 0.30 |
48
+ | UTokyo PR2 Tabletop Manipulation | 0.50 |
49
+ | UTokyo Xarm Pick And Place | 0.33 |
50
+ | UTokyo Xarm Bimanual | 0.03 |
51
+ | Berkeley MVP | 0.73 |
52
+ | Berkeley RPT | 1.00 |
53
+ | KAIST Nonprehensile | 0.46 |
54
+ | Tokyo U LSMO | 0.23 |
55
+ | DLR Sara Grid Clamp | 0.03 |
56
+ | Robocook | 1.66 |
57
+ | Imperialcollege Sawyer Wrist Cam | 0.43 |
58
+ | Iamlab CMU Pickup Insert | 0.83 |
59
+ | UTAustin Mutex | 1.29 |
60
+ | Fanuc Manipulation | 0.66 |
61
+ | Play Fusion | 0.80 |
62
+ | Droid | 10.06 |
63
+ | FMB | 1.39 |
64
+ | Dobb·E | 1.20 |
65
+ | QUT Dexterous Manipulation | 0.46 |
66
+ | Aloha Dataset | 4.98 |
67
+ | Mobile Aloha Dataset | 4.98 |
68
+ | Roboset | 4.48 |
69
+ | RH20T | 10.99 |
70
+ | Calvin Dataset | 3.32 |
71
+ | Bridgev2 | 7.44 |
72
+
73
+ Before anything else, link the dataset directory on your disk to a subfolder of this repo:
74
+
75
+ ```bash
76
+ ln -s /path/to/dataset /path/to/repo/RoboticsDiffusionTransformer/data/datasets
77
+ ```
78
+
79
+ ### Open X-Embodiment
80
+
81
+ Specify the correct path to the `gsutil` in your Conda in [this file](../data/openx_embod/download.sh#L72).
82
+
83
+ Run the following commands to download our selected datasets for the Open X-Embodiment:
84
+
85
+ ```bash
86
+ # Under the root directory of this repo
87
+ cd data/openx_embod
88
+ # Download all datasets
89
+ bash download_openx_embod.sh
90
+ ```
91
+
92
+ Note: By modifying `download_openx_embod.sh`, you can download any dataset on the Google Cloud (as long as it can be downloaded with `gsutil` and is stored in `TFRecord` format), not just the ones we have listed.
93
+
94
+ ### Mobile ALOHA Dataset
95
+
96
+ Download the Mobile ALOHA Dataset from the [official website](https://mobile-aloha.github.io) to `data/datasets/aloha`, then run:
97
+
98
+ ```bash
99
+ cd data/aloha
100
+ # Convert the dataset to TFRecord
101
+ python hdf5totfrecords.py
102
+ ```
103
+
104
+ ### Bridgev2
105
+
106
+ Run:
107
+
108
+ ```bash
109
+ cd data/bridgev2
110
+ # Download and preprocess the dataset
111
+ sh download.sh
112
+ ```
113
+
114
+ ### Calvin
115
+
116
+ Run:
117
+
118
+ ```bash
119
+ cd data/calvin
120
+ # Download and preprocess the dataset
121
+ sh download.sh
122
+ # Convert the dataset to TFRecord format
123
+ python hdf5totfrecords.py
124
+ ```
125
+
126
+ ### RH20T
127
+
128
+ Download the RH20T Dataset from the [official website](https://rh20t.github.io/#download) to `data/datasets/rh20t`, then run:
129
+
130
+ ```bash
131
+ cd data/rh20t
132
+ # Convert the dataset to TFRecord
133
+ python hdf5totfrecords.py
134
+ ```
135
+
136
+ ### RoboSet
137
+
138
+ Run:
139
+
140
+ ```bash
141
+ cd data/roboset
142
+ # Download and preprocess the dataset
143
+ sh download.sh
144
+ ```
145
+
146
+ ## If You Want to Train on a New Dataset
147
+
148
+
149
+ If you want to train on a new dataset (e.g., `my_pretrain_dataset`) through this pre-training pipeline, you need to modify several files as follows:
150
+
151
+ ##### 1. `configs/dataset_control_freq.json`
152
+
153
+ Add the control frequency of your dataset.
154
+
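+ For example, assuming the file is a flat JSON mapping from dataset name to control frequency in Hz, an entry for a hypothetical dataset recorded at 25 Hz would look like this (both the name and the value are placeholders):
+ 
+ ```json
+ {
+     "my_pretrain_dataset": 25
+ }
+ ```
+ 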
155
+ ##### 2. `data/preprocess_scripts/my_pretrain_dataset.py`
156
+
157
+ If your dataset can be loaded by `tfds.builder_from_directory()`, you only need to download it into the Open X-Embodiment folder `data/datasets/openx_embod` and implement the `process_step()` function. You may also need to specify the tfds loading path at L78 (see [this file](../data/vla_dataset.py#L78)). We refer to `data/preprocess_scripts/droid.py` for an example.
158
+
159
+ If not, you need to first convert it into TFRecords and then implement both `load_dataset()` and `process_step()`. We refer to `data/agilex/hdf5totfrecords.py` and `data/preprocess_scripts/agilex.py` for examples.
160
+
161
+ Here are brief descriptions of these two functions:
162
+
163
+ ##### `load_dataset(seed: int)`
164
+
165
+ - Returns a dataset object that supports iteration and a `repeat` method, seeded by `seed`.
+ - Suggested implementation: use `tf.data.Dataset.from_generator` and `tf.data.TFRecordDataset` (see the sketch after this list).
+ - Each element yielded by the iterator represents one episode and should have the following structure:
+   - `step`: A dataset object (supporting iteration) that contains the frames of this episode.
+     - `observation`: A dictionary containing your images.
+       - `your_first_image_key`: Your observation RGB image keys.
+       - ...
+     - `other_attribute`: Any other relevant attributes.
173
+
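+ Below is a minimal sketch of what `load_dataset()` could look like for a TFRecord-based dataset. Everything in it is illustrative: the file layout, the `_FEATURES` keys, and the `MyPretrainDataset` wrapper are assumptions made for the example, not the actual pipeline code.
+ 
+ ```python
+ import glob
+ import random
+ 
+ import tensorflow as tf
+ 
+ # Hypothetical per-step schema; use whatever keys you serialized into your TFRecords.
+ _FEATURES = {
+     'exterior-cam': tf.io.FixedLenFeature([], tf.string),
+     'arm_joint_pos': tf.io.FixedLenFeature([], tf.string),
+     'terminate_episode': tf.io.FixedLenFeature([], tf.int64),
+ }
+ 
+ def _parse_step(proto):
+     return tf.io.parse_single_example(proto, _FEATURES)
+ 
+ class MyPretrainDataset:
+     """Yields one element per episode; each element's 'step' iterates over its frames."""
+ 
+     def __init__(self, seed: int, do_repeat: bool = False):
+         # Illustrative layout: one TFRecord file per episode.
+         self._files = sorted(glob.glob(
+             'data/datasets/my_pretrain_dataset/tfrecords/*.tfrecord'))
+         self._seed = seed
+         self._do_repeat = do_repeat
+ 
+     def repeat(self):
+         # Return an endlessly repeating view over the same episodes.
+         return MyPretrainDataset(self._seed, do_repeat=True)
+ 
+     def __iter__(self):
+         rng = random.Random(self._seed)
+         while True:
+             files = list(self._files)
+             rng.shuffle(files)
+             for path in files:
+                 # 'step' is a dataset of parsed frames for this episode.
+                 yield {'step': tf.data.TFRecordDataset(path).map(_parse_step)}
+             if not self._do_repeat:
+                 break
+ 
+ def load_dataset(seed: int):
+     return MyPretrainDataset(seed)
+ ```
+ 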
174
+ ##### `process_step(step: dict) -> dict`
175
+
176
+ Processes a single frame and returns a dictionary with the following keys:
177
+
178
+ - `observation`:
+   - `your_first_view_image: tf.Tensor`: Your first-view image.
+   - `arm_concat: tf.Tensor`: Concatenation of the physical states.
+   - `format: tf.constant(string)`: Format of `arm_concat` (e.g., `arm_joint_pos_0,arm_joint_pos_1,arm_joint_pos_2`).
+ - `action`: The action of this frame (leave it empty if there is none).
+   - `arm_concat`: Same as in `observation`.
+   - `format`: Same as in `observation`.
+ - `terminate: tf.Tensor`: A boolean tensor indicating whether the episode ends.
186
+
187
+ **IMPORTANT**: You should only use TensorFlow functions for any branch or loop operations. For example, use `tf.cond` instead of `if`.
188
+
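+ For concreteness, here is a hedged sketch of `process_step()` for a hypothetical single-arm dataset with one exterior camera and seven joints. The `step` keys used here (`exterior-cam`, `arm_joint_pos`, `arm_action`, `terminate_episode`) and the format string are placeholders that must match your own data and `configs/dataset_img_keys.json`:
+ 
+ ```python
+ import tensorflow as tf
+ 
+ def process_step(step: dict) -> dict:
+     # Format string naming each entry of `arm_concat` (placeholder joint names).
+     state_format = tf.constant(
+         'arm_joint_pos_0,arm_joint_pos_1,arm_joint_pos_2,'
+         'arm_joint_pos_3,arm_joint_pos_4,arm_joint_pos_5,arm_joint_pos_6')
+ 
+     return {
+         'observation': {
+             # The image key must match the one declared in configs/dataset_img_keys.json.
+             'exterior-cam': step['observation']['exterior-cam'],
+             'arm_concat': step['observation']['arm_joint_pos'],
+             'format': state_format,
+         },
+         'action': {
+             # Leave this empty if your dataset has no explicit action.
+             'arm_concat': step['arm_action'],
+             'format': state_format,
+         },
+         # Remember: use tf.cond / tf.where instead of Python `if` for any branching.
+         'terminate': tf.cast(step['terminate_episode'], tf.bool),
+     }
+ ```
+ 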
189
+ ##### 3. `configs/dataset_img_keys.json`
190
+
191
+ Add the image keys of your dataset. For example:
192
+
193
+ ```json
194
+ "my_pretrain_dataset": {
195
+ "image_keys": [
196
+ "exterior-cam",
197
+ "right-wrist-cam",
198
+ "left-wrist-cam",
199
+ "left-wrist-cam"
200
+ ],
201
+ "image_mask": [1, 1, 1, 0]
202
+ }
203
+ ```
204
+
205
+ - To make TensorFlow happy, you have to specify four images in this exact order: `exterior-cam, right-wrist-cam, left-wrist-cam, any-cam`. Each key should correspond to an image key in the `observation` dictionary of your `step`.
206
+
207
+ - If you only have a single wrist, just make it a *right* wrist.
208
+
209
+ - The `image_mask` indicates whether each image is valid (1) or not (0).
210
+
211
+ - If you have fewer than four images, simply repeat one of the existing images in the remaining positions and set their masks to 0 (invalid).
212
+
213
+ - The key order is *strict*. If you do not have an exterior camera but have both wrist cameras, pad the exterior position with one of the wrist cameras and mask it out, as in the following:
214
+
215
+ ```json
216
+ "my_pretrain_dataset": {
217
+ "image_keys": [
218
+ "right-wrist-cam",
219
+ "right-wrist-cam",
220
+ "left-wrist-cam",
221
+ "left-wrist-cam"
222
+ ],
223
+ "image_mask": [0, 1, 1, 0]
224
+ }
225
+ ```
226
+
227
+ - During training, only the first *three* cameras will be used.
+ 
+ ##### 4. `configs/dataset_stat.json`
229
+
230
+ Compute the statistics (min, max, mean, and std) for your dataset:
231
+
232
+ ```bash
233
+ # Use -h to see the full usage
234
+ python -m data.compute_dataset_stat --skip_exist
235
+ ```
236
+ This will update the `dataset_stat.json` file with your dataset's statistics.
237
+
238
+ ##### 5. `data/vla_dataset.py`
239
+
240
+ - Add your dataset to `DATASET_NAMES_NOOPENX` if it cannot be loaded by `tfds.builder_from_directory()`.
241
+ - If your dataset only contains action but no proprioception (i.e., robot state), add your dataset to `DATASET_NAMES_NO_STATE` in [this file](../data/preprocess.py).
242
+ - Normally, we treat the state at the next timestep as the action of the current timestep. If you want to use different actions, you will need to implement additional functions. We refer to `flatten_episode_agilex()` in [this file](../data/episode_transform.py) and `_generate_json_state_agilex()` in [this file](../data/preprocess.py) for examples. You may also refer to L318 in [this file](../data/preprocess.py) and L128 in [this file](../data/vla_dataset.py) for how to select your dataset and preprocess it differently.
243
+
244
+ ## Start Pre-Training
245
+
246
+ We employ a producer-consumer framework built on TensorFlow Dataset for fast data loading. Since most of the datasets in the Open X-Embodiment are stored as `TFRecord`s, we convert all pre-training datasets into `TFRecord` format for storage. During pre-training, a producer process decompresses the data from `TFRecord` files and stores it in a buffer on the hard disk, while a consumer process reads data from the buffer in random order and feeds it to model training. This not only decouples the `TensorFlow` and `PyTorch` environments but also alleviates the performance loss caused by a small in-memory shuffling buffer. A toy illustration of the disk shuffle buffer is given below.
247
+
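+ The following toy sketch only illustrates the idea of the disk shuffle buffer; the file layout, names, and pickle serialization are invented for illustration, and the real producer and consumer live in this repo's data-loading code:
+ 
+ ```python
+ import os
+ import pickle
+ import random
+ 
+ BUF_PATH = '/path/to/buffer'   # the `buf_path` configured in configs/base.yaml
+ NUM_CHUNKS = 128               # illustrative values; see the buffer settings in base.yaml
+ CHUNK_SIZE = 128
+ 
+ def produce(sample):
+     # Producer (TensorFlow side): decode a sample from TFRecord and drop it into a
+     # random slot of the on-disk buffer, overwriting slots that were already consumed.
+     chunk, slot = random.randrange(NUM_CHUNKS), random.randrange(CHUNK_SIZE)
+     path = os.path.join(BUF_PATH, f'chunk_{chunk}', f'sample_{slot}.pkl')
+     os.makedirs(os.path.dirname(path), exist_ok=True)
+     with open(path, 'wb') as f:
+         pickle.dump(sample, f)
+ 
+ def consume():
+     # Consumer (PyTorch side): read back a random slot, independently of TensorFlow.
+     chunk, slot = random.randrange(NUM_CHUNKS), random.randrange(CHUNK_SIZE)
+     path = os.path.join(BUF_PATH, f'chunk_{chunk}', f'sample_{slot}.pkl')
+     with open(path, 'rb') as f:
+         return pickle.load(f)
+ ```
+ 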
248
+ [This file](../configs/base.yaml) includes the configurations relevant to the model architecture (such as the number of heads and the hidden dimension) and to data processing. You may need to modify `buf_path` (L22) to your real buffer path; this buffer serves as the disk-based shuffling buffer for data loading.
249
+
250
+ Configurations relevant to training are passed through *Command Line Arguments*. Use `python main.py -h` to see their descriptions. We provide an example pre-training script in [this file](../pretrain.sh) (`pretrain.sh`); you may need to modify some of its parameters, such as `CUTLASS_PATH` and `WANDB_PROJECT`.
251
+
252
+ You may need to modify the list of pre-training datasets in [this file](../configs/pretrain_datasets.json) and their corresponding sampling weights in [this file](../configs/pretrain_sample_weights.json). If you want to fine-tune RDT through this pipeline, remove the datasets you do not need from the list.
253
+
254
+ Before starting pre-training, first launch the data producer process (if you use multiple nodes, run this command on each node):
255
+
256
+ ```bash
257
+ # Under the root directory of this repo
258
+ conda activate rdt-data
259
+ # Use -h to see the full usage
260
+ python -m data.producer --fill_up
261
+ # Please proceed to the next step AFTER finishing the filling up process
262
+ ```
263
+
264
+ Then, we run the pre-training script:
265
+
266
+ ```bash
267
+ source pretrain.sh
268
+ ```
269
+
270
+ Note: You can monitor the training process by observing `loss` (smoothed with a long-window moving average), `overall_avg_sample_mse`, and the sampling MSE of each dataset in [Wandb](https://wandb.ai/site) or [TensorBoard](https://www.tensorflow.org/tensorboard). We have empirically found that the lower the `overall_avg_sample_mse`, the better the model performs.
docs/test_6drot.py ADDED
@@ -0,0 +1,99 @@
1
+ import numpy as np
2
+ from scipy.spatial.transform import Rotation as R
3
+
4
+
5
+ def convert_quaternion_to_euler(quat):
6
+ """
7
+ Convert a quaternion (xyzw) to Euler angles (rpy).
8
+ """
9
+ # Normalize
10
+ quat = quat / np.linalg.norm(quat)
11
+ euler = R.from_quat(quat).as_euler('xyz')
12
+
13
+ return euler
14
+
15
+
16
+ def convert_euler_to_quaternion(euler):
17
+ """
18
+ Convert Euler angles (rpy) to a quaternion (xyzw).
19
+ """
20
+ quat = R.from_euler('xyz', euler).as_quat()
21
+
22
+ return quat
23
+
24
+
25
+ def convert_euler_to_rotation_matrix(euler):
26
+ """
27
+ Convert Euler angles (rpy) to rotation matrix (3x3).
28
+ """
29
+ rotmat = R.from_euler('xyz', euler).as_matrix()
+ 
+ return rotmat
32
+
33
+
34
+ def convert_rotation_matrix_to_euler(rotmat):
35
+ """
36
+ Convert rotation matrix (3x3) to Euler angles (rpy).
37
+ """
38
+ r = R.from_matrix(rotmat)
39
+ euler = r.as_euler('xyz', degrees=False)
40
+
41
+ return euler
42
+
43
+
44
+ def normalize_vector(v):
45
+ v_mag = np.linalg.norm(v, axis=-1, keepdims=True)
46
+ v_mag = np.maximum(v_mag, 1e-8)
47
+ return v / v_mag
48
+
49
+
50
+ def cross_product(u, v):
51
+ i = u[:,1]*v[:,2] - u[:,2]*v[:,1]
52
+ j = u[:,2]*v[:,0] - u[:,0]*v[:,2]
53
+ k = u[:,0]*v[:,1] - u[:,1]*v[:,0]
54
+
55
+ out = np.stack((i, j, k), axis=1)
56
+ return out
57
+
58
+
59
+ def compute_rotation_matrix_from_ortho6d(ortho6d):
60
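+ # Recover the full rotation matrix from the 6D representation with a
+ # Gram-Schmidt-style orthonormalization: normalize the first vector, build
+ # the third axis from cross products, then recompute the second axis so the
+ # three column vectors are orthonormal.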
+ x_raw = ortho6d[:, 0:3]
61
+ y_raw = ortho6d[:, 3:6]
62
+
63
+ x = normalize_vector(x_raw)
64
+ z = cross_product(x, y_raw)
65
+ z = normalize_vector(z)
66
+ y = cross_product(z, x)
67
+
68
+ x = x.reshape(-1, 3, 1)
69
+ y = y.reshape(-1, 3, 1)
70
+ z = z.reshape(-1, 3, 1)
71
+ matrix = np.concatenate((x, y, z), axis=2)
72
+ return matrix
73
+
74
+
75
+ def compute_ortho6d_from_rotation_matrix(matrix):
76
+ # The ortho6d represents the first two column vectors a1 and a2 of the
77
+ # rotation matrix: [ | , |, | ]
78
+ # [ a1, a2, a3]
79
+ # [ | , |, | ]
80
+ ortho6d = matrix[:, :, :2].transpose(0, 2, 1).reshape(matrix.shape[0], -1)
81
+ return ortho6d
82
+
83
+
84
+ # Test
85
+ if __name__ == "__main__":
86
+ # Randomly generate a set of Euler angles (rpy)
87
+ euler = np.random.rand(3) * 2 * np.pi - np.pi
88
+ euler = euler[None, :] # Add batch dimension
89
+ print(f"Input Euler angles: {euler}")
90
+
91
+ # Convert to 6D Rotation
92
+ rotmat = convert_euler_to_rotation_matrix(euler)
93
+ ortho6d = compute_ortho6d_from_rotation_matrix(rotmat)
94
+ print(f"6D Rotation: {ortho6d}")
95
+
96
+ # Convert back to Euler angles
97
+ rotmat_recovered = compute_rotation_matrix_from_ortho6d(ortho6d)
98
+ euler_recovered = convert_rotation_matrix_to_euler(rotmat_recovered)
99
+ print(f"Recovered Euler angles: {euler_recovered}")
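+
+ # The recovered Euler angles need not match the input element-wise, because
+ # Euler angles are not unique (pitch is folded into [-pi/2, pi/2]); both
+ # triplets describe the same rotation. Comparing rotation matrices is the
+ # unambiguous consistency check:
+ assert np.allclose(rotmat, rotmat_recovered, atol=1e-6)
+ print("Rotation matrices match: the 6D round-trip is consistent")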
eval_sim/eval_dp.py ADDED
@@ -0,0 +1,166 @@
1
+ from typing import Callable, List, Type
2
+ import gymnasium as gym
3
+ import numpy as np
4
+ from mani_skill.envs.sapien_env import BaseEnv
5
+ from mani_skill.utils import common, gym_utils
6
+ import argparse
7
+ import yaml
8
+ import torch
9
+ from collections import deque
10
+ from PIL import Image
11
+ import cv2
12
+ import imageio
13
+ from functools import partial
14
+
15
+ from diffusion_policy.workspace.robotworkspace import RobotWorkspace
16
+
17
+ def parse_args(args=None):
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument("-e", "--env-id", type=str, default="PickCube-v1", help="Environment to evaluate on.")
20
+ parser.add_argument("-o", "--obs-mode", type=str, default="rgb", help="Observation mode to use. Usually this is kept as 'none' since observations do not need to be stored; they can be replayed later via the mani_skill.trajectory.replay_trajectory script.")
21
+ parser.add_argument("-n", "--num-traj", type=int, default=25, help="Number of trajectories to generate.")
22
+ parser.add_argument("--only-count-success", action="store_true", help="If true, generates trajectories until num_traj of them are successful and only saves the successful trajectories/videos")
23
+ parser.add_argument("--reward-mode", type=str)
24
+ parser.add_argument("-b", "--sim-backend", type=str, default="auto", help="Which simulation backend to use. Can be 'auto', 'cpu', 'gpu'")
25
+ parser.add_argument("--render-mode", type=str, default="rgb_array", help="can be 'sensors' or 'rgb_array' which only affect what is saved to videos")
26
+ parser.add_argument("--vis", action="store_true", help="whether or not to open a GUI to visualize the solution live")
27
+ parser.add_argument("--save-video", action="store_true", help="whether or not to save videos locally")
28
+ parser.add_argument("--traj-name", type=str, help="The name of the trajectory .h5 file that will be created.")
29
+ parser.add_argument("--shader", default="default", type=str, help="Change shader used for rendering. Default is 'default' which is very fast. Can also be 'rt' for ray tracing and generating photo-realistic renders. Can also be 'rt-fast' for a faster but lower quality ray-traced renderer")
30
+ parser.add_argument("--record-dir", type=str, default="demos", help="where to save the recorded trajectories")
31
+ parser.add_argument("--num-procs", type=int, default=1, help="Number of processes to use to help parallelize the trajectory replay process. This uses CPU multiprocessing and only works with the CPU simulation backend at the moment.")
32
+ parser.add_argument("--random_seed", type=int, default=0, help="Random seed for the environment.")
33
+ parser.add_argument("--pretrained_path", type=str, default=None, help="Path to the pretrained model checkpoint.")
34
+
35
+ return parser.parse_args()
36
+
37
+ task2lang = {
38
+ "PegInsertionSide-v1": "Pick up a orange-white peg and insert the orange end into the box with a hole in it.",
39
+ "PickCube-v1": "Grasp a red cube and move it to a target goal position.",
40
+ "StackCube-v1": "Pick up a red cube and stack it on top of a green cube and let go of the cube without it falling.",
41
+ "PlugCharger-v1": "Pick up one of the misplaced shapes on the board/kit and insert it into the correct empty slot.",
42
+ "PushCube-v1": "Push and move a cube to a goal region in front of it."
43
+ }
44
+ import random
45
+ import os
46
+
47
+ args = parse_args()
48
+ seed = args.random_seed
49
+ random.seed(seed)
50
+ os.environ['PYTHONHASHSEED'] = str(seed)
51
+ np.random.seed(seed)
52
+ torch.manual_seed(seed)
53
+ torch.cuda.manual_seed(seed)
54
+ torch.backends.cudnn.deterministic = True
55
+ torch.backends.cudnn.benchmark = False
56
+
57
+ env_id = args.env_id
58
+ env = gym.make(
59
+ env_id,
60
+ obs_mode=args.obs_mode,
61
+ control_mode="pd_joint_pos",
62
+ render_mode=args.render_mode,
63
+ reward_mode="dense" if args.reward_mode is None else args.reward_mode,
64
+ sensor_configs=dict(shader_pack=args.shader),
65
+ human_render_camera_configs=dict(shader_pack=args.shader),
66
+ viewer_camera_configs=dict(shader_pack=args.shader),
67
+ sim_backend=args.sim_backend
68
+ )
69
+
70
+ from diffusion_policy.workspace.robotworkspace import RobotWorkspace
71
+ import hydra
72
+ import dill
73
+
74
+ checkpoint_path = args.pretrained_path
75
+ print(f"Loading policy from {checkpoint_path}. Task is {task2lang[env_id]}")
76
+
77
+ def get_policy(output_dir, device):
78
+
79
+ # load checkpoint
80
+ payload = torch.load(open(checkpoint_path, 'rb'), pickle_module=dill)
81
+ cfg = payload['cfg']
82
+ cls = hydra.utils.get_class(cfg._target_)
83
+ workspace = cls(cfg, output_dir=output_dir)
84
+ workspace: RobotWorkspace
85
+ workspace.load_payload(payload, exclude_keys=None, include_keys=None)
86
+
87
+ # get policy from workspace
88
+ policy = workspace.model
89
+ if cfg.training.use_ema:
90
+ policy = workspace.ema_model
91
+
92
+ device = torch.device(device)
93
+ policy.to(device)
94
+ policy.eval()
95
+
96
+ return policy
97
+
98
+ policy = get_policy('./', device = 'cuda')
99
+ MAX_EPISODE_STEPS = 400
100
+ total_episodes = args.num_traj
101
+ success_count = 0
102
+ base_seed = 20241201
103
+ instr = task2lang[env_id]
104
+ import tqdm
105
+
106
+ DATA_STAT = {'state_min': [-0.7463043928146362, -0.0801204964518547, -0.4976441562175751, -2.657780647277832, -0.5742632150650024, 1.8309762477874756, -2.2423808574676514, 0.0, 0.0], 'state_max': [0.7645499110221863, 1.4967026710510254, 0.4650936424732208, -0.3866899907588959, 0.5505855679512024, 3.2900545597076416, 2.5737812519073486, 0.03999999910593033, 0.03999999910593033], 'action_min': [-0.7472005486488342, -0.08631071448326111, -0.4995281398296356, -2.658363103866577, -0.5751323103904724, 1.8290787935256958, -2.245187997817993, -1.0], 'action_max': [0.7654682397842407, 1.4984270334243774, 0.46786263585090637, -0.38181185722351074, 0.5517147779464722, 3.291581630706787, 2.575840711593628, 1.0], 'action_std': [0.2199309915304184, 0.18780815601348877, 0.13044124841690063, 0.30669933557510376, 0.1340624988079071, 0.24968451261520386, 0.9589747190475464, 0.9827960729598999], 'action_mean': [-0.00885344110429287, 0.5523102879524231, -0.007564723491668701, -2.0108158588409424, 0.004714342765510082, 2.615924596786499, 0.08461848646402359, -0.19301606714725494]}
107
+
108
+ state_min = torch.tensor(DATA_STAT['state_min']).cuda()
109
+ state_max = torch.tensor(DATA_STAT['state_max']).cuda()
110
+ action_min = torch.tensor(DATA_STAT['action_min']).cuda()
111
+ action_max = torch.tensor(DATA_STAT['action_max']).cuda()
112
+
113
+ for episode in tqdm.trange(total_episodes):
114
+ obs_window = deque(maxlen=2)
115
+ obs, _ = env.reset(seed = episode + base_seed)
116
+
117
+ img = env.render().cuda().float()
118
+ proprio = obs['agent']['qpos'][:].cuda()
119
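+ # Min-max normalize the proprioceptive state to [-1, 1] using the statistics above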
+ proprio = (proprio - state_min) / (state_max - state_min) * 2 - 1
120
+ obs_window.append({
121
+ 'agent_pos': proprio,
122
+ "head_cam": img.permute(0, 3, 1, 2),
123
+ })
124
+ obs_window.append({
125
+ 'agent_pos': proprio,
126
+ "head_cam": img.permute(0, 3, 1, 2),
127
+ })
128
+
129
+ global_steps = 0
130
+ video_frames = []
131
+
132
+ success_time = 0
133
+ done = False
134
+
135
+ while global_steps < MAX_EPISODE_STEPS and not done:
136
+ obs = obs_window[-1]
137
+ actions = policy.predict_action(obs)
138
+ actions = actions['action_pred'].squeeze(0)
139
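+ # Map the policy's [-1, 1] outputs back to the original action range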
+ actions = (actions + 1) / 2 * (action_max - action_min) + action_min
140
+ actions = actions.detach().cpu().numpy()
141
+ actions = actions[:8]
142
+ for idx in range(actions.shape[0]):
143
+ action = actions[idx]
144
+ obs, reward, terminated, truncated, info = env.step(action)
145
+ img = env.render().cuda().float()
146
+ proprio = obs['agent']['qpos'][:].cuda()
147
+ proprio = (proprio - state_min) / (state_max - state_min) * 2 - 1
148
+ obs_window.append({
149
+ 'agent_pos': proprio,
150
+ "head_cam": img.permute(0, 3, 1, 2),
151
+ })
152
+ video_frames.append(env.render().squeeze(0).detach().cpu().numpy())
153
+ global_steps += 1
154
+ if terminated or truncated:
155
+ assert "success" in info, sorted(info.keys())
156
+ if info['success']:
157
+ done = True
158
+ success_count += 1
159
+ break
160
+ print(f"Trial {episode+1} finished, success: {info['success']}, steps: {global_steps}")
161
+
162
+ success_rate = success_count / total_episodes * 100
163
+ print(f"Tested {total_episodes} episodes, success rate: {success_rate:.2f}%")
164
+ log_file = f"results_dp_{checkpoint_path.split('/')[-1].split('.')[0]}.txt"
165
+ with open(log_file, 'a') as f:
166
+ f.write(f"{args.env_id}:{seed}:{success_count}\n")
eval_sim/eval_octo.py ADDED
@@ -0,0 +1,182 @@
1
+ from typing import Callable, List, Type
2
+ import gymnasium as gym
3
+ import numpy as np
4
+ from mani_skill.envs.sapien_env import BaseEnv
5
+ from mani_skill.utils import common, gym_utils
6
+ import argparse
7
+ import yaml
8
+ import torch
9
+ from collections import deque
10
+ from PIL import Image
11
+ import cv2
12
+ from octo.model.octo_model import OctoModel
13
+ from octo.utils.train_callbacks import supply_rng
14
+ import imageio
15
+ import jax
16
+ import jax.numpy as jnp
17
+ from octo.utils.train_callbacks import supply_rng
18
+ from functools import partial
19
+
20
+ def parse_args(args=None):
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument("-e", "--env-id", type=str, default="PickCube-v1", help="Environment to evaluate on.")
23
+ parser.add_argument("-o", "--obs-mode", type=str, default="rgb", help="Observation mode to use. Usually this is kept as 'none' since observations do not need to be stored; they can be replayed later via the mani_skill.trajectory.replay_trajectory script.")
24
+ parser.add_argument("-n", "--num-traj", type=int, default=25, help="Number of trajectories to generate.")
25
+ parser.add_argument("--only-count-success", action="store_true", help="If true, generates trajectories until num_traj of them are successful and only saves the successful trajectories/videos")
26
+ parser.add_argument("--reward-mode", type=str)
27
+ parser.add_argument("-b", "--sim-backend", type=str, default="auto", help="Which simulation backend to use. Can be 'auto', 'cpu', 'gpu'")
28
+ parser.add_argument("--render-mode", type=str, default="rgb_array", help="can be 'sensors' or 'rgb_array' which only affect what is saved to videos")
29
+ parser.add_argument("--vis", action="store_true", help="whether or not to open a GUI to visualize the solution live")
30
+ parser.add_argument("--save-video", action="store_true", help="whether or not to save videos locally")
31
+ parser.add_argument("--traj-name", type=str, help="The name of the trajectory .h5 file that will be created.")
32
+ parser.add_argument("--shader", default="default", type=str, help="Change shader used for rendering. Default is 'default' which is very fast. Can also be 'rt' for ray tracing and generating photo-realistic renders. Can also be 'rt-fast' for a faster but lower quality ray-traced renderer")
33
+ parser.add_argument("--record-dir", type=str, default="demos", help="where to save the recorded trajectories")
34
+ parser.add_argument("--num-procs", type=int, default=1, help="Number of processes to use to help parallelize the trajectory replay process. This uses CPU multiprocessing and only works with the CPU simulation backend at the moment.")
35
+ parser.add_argument("--random_seed", type=int, default=0, help="Random seed for the environment.")
36
+ parser.add_argument("--pretrained_path", type=str, default=None, help="Path to the pretrained model")
37
+ return parser.parse_args()
38
+
39
+ task2lang = {
40
+ "PegInsertionSide-v1": "Pick up a orange-white peg and insert the orange end into the box with a hole in it.",
41
+ "PickCube-v1": "Grasp a red cube and move it to a target goal position.",
42
+ "StackCube-v1": "Pick up a red cube and stack it on top of a green cube and let go of the cube without it falling.",
43
+ "PlugCharger-v1": "Pick up one of the misplaced shapes on the board/kit and insert it into the correct empty slot.",
44
+ "PushCube-v1": "Push and move a cube to a goal region in front of it."
45
+ }
46
+ import random
47
+ import os
48
+
49
+ args = parse_args()
50
+ seed = args.random_seed
51
+ random.seed(seed)
52
+ os.environ['PYTHONHASHSEED'] = str(seed)
53
+ np.random.seed(seed)
54
+ torch.manual_seed(seed)
55
+ torch.cuda.manual_seed(seed)
56
+ torch.backends.cudnn.deterministic = True
57
+ torch.backends.cudnn.benchmark = False
58
+
59
+ env_id = args.env_id
60
+ env = gym.make(
61
+ env_id,
62
+ obs_mode=args.obs_mode,
63
+ control_mode="pd_ee_delta_pose",
64
+ render_mode=args.render_mode,
65
+ reward_mode="dense" if args.reward_mode is None else args.reward_mode,
66
+ sensor_configs=dict(shader_pack=args.shader),
67
+ human_render_camera_configs=dict(shader_pack=args.shader),
68
+ viewer_camera_configs=dict(shader_pack=args.shader),
69
+ sim_backend=args.sim_backend
70
+ )
71
+
72
+ def sample_actions(
73
+ pretrained_model: OctoModel,
74
+ observations,
75
+ tasks,
76
+ rng,
77
+ ):
78
+ # add batch dim to observations
79
+ observations = jax.tree_map(lambda x: x[None], observations)
80
+ actions = pretrained_model.sample_actions(
81
+ observations,
82
+ tasks,
83
+ rng=rng,
84
+ )
85
+ # remove batch dim
86
+ return actions[0]
87
+
88
+ pretrain_path = args.pretrained_path
89
+ step = 1000000
90
+ model = OctoModel.load_pretrained(
91
+ pretrain_path,
92
+ step
93
+ )
94
+
95
+ policy = supply_rng(
96
+ partial(
97
+ sample_actions,
98
+ model,
99
+ )
100
+ )
101
+
102
+
103
+ import tensorflow as tf
104
+ def resize_img(image, size=(256, 256)):
105
+ image_tf = tf.convert_to_tensor(image, dtype=tf.float32)
106
+ image_tf = tf.expand_dims(image_tf, axis=0)
107
+ resized_tf = tf.image.resize(
108
+ image_tf,
109
+ size,
110
+ method=tf.image.ResizeMethod.LANCZOS3,
111
+ antialias=True
112
+ )
113
+ resized_tf = tf.squeeze(resized_tf)
114
+ resized_img = resized_tf.numpy().astype(np.uint8)
115
+ return resized_img
116
+
117
+ MAX_EPISODE_STEPS = 400
118
+ total_episodes = args.num_traj
119
+ success_count = 0
120
+ base_seed = 20241201
121
+ import tqdm
122
+
123
+ for episode in tqdm.trange(total_episodes):
124
+ task = model.create_tasks(texts=[task2lang[env_id]])
125
+ obs_window = deque(maxlen=2)
126
+ obs, _ = env.reset(seed = episode + base_seed)
127
+
128
+ img = env.render().squeeze(0).detach().cpu().numpy()
129
+ proprio = obs['agent']['qpos'][:]
130
+ obs_window.append({
131
+ 'proprio': proprio.detach().cpu().numpy(),
132
+ "image_primary": resize_img(img)[None],
133
+ "timestep_pad_mask": np.zeros((1),dtype = bool)
134
+ })
135
+ obs_window.append({
136
+ 'proprio': proprio.detach().cpu().numpy(),
137
+ "image_primary": resize_img(img)[None],
138
+ "timestep_pad_mask": np.ones((1),dtype = bool)
139
+ })
140
+
141
+ global_steps = 0
142
+ video_frames = []
143
+
144
+ success_time = 0
145
+ done = False
146
+
147
+ while global_steps < MAX_EPISODE_STEPS and not done:
148
+ obs = {
149
+ 'proprio': np.concatenate([obs_window[0]['proprio'], obs_window[1]['proprio']], axis=0),
150
+ "image_primary": np.concatenate([obs_window[0]['image_primary'], obs_window[1]['image_primary']], axis=0),
151
+ "timestep_pad_mask": np.concatenate([obs_window[0]['timestep_pad_mask'], obs_window[1]['timestep_pad_mask']], axis=0)
152
+ }
153
+ actions = policy(obs, task)
154
+ actions = jax.device_put(actions, device=jax.devices('cpu')[0])
155
+ actions = jax.device_get(actions)
156
+ # actions = actions[0:4]
157
+ for idx in range(actions.shape[0]):
158
+ action = actions[idx]
159
+ obs, reward, terminated, truncated, info = env.step(action)
160
+ img = env.render().squeeze(0).detach().cpu().numpy()
161
+ proprio = obs['agent']['qpos'][:]
162
+ obs_window.append({
163
+ 'proprio': proprio.detach().cpu().numpy(),
164
+ "image_primary": resize_img(img)[None],
165
+ "timestep_pad_mask": np.ones((1),dtype = bool)
166
+ })
167
+ video_frames.append(img)
168
+ global_steps += 1
169
+ if terminated or truncated:
170
+ assert "success" in info, sorted(info.keys())
171
+ if info['success']:
172
+ done = True
173
+ success_count += 1
174
+ break
175
+ print(f"Trial {episode+1} finished, success: {info['success']}, steps: {global_steps}")
176
+
177
+ success_rate = success_count / total_episodes * 100
178
+ print(f"Random seed: {seed}, Pretrained_path: {pretrain_path}")
179
+ print(f"Tested {total_episodes} episodes, success rate: {success_rate:.2f}%")
180
+ log_file = "results_octo.log"
181
+ with open(log_file, 'a') as f:
182
+ f.write(f"{seed}:{success_count}\n")
eval_sim/eval_openvla.py ADDED
@@ -0,0 +1,175 @@
1
+ from typing import Callable, List, Type
2
+ import gymnasium as gym
3
+ import numpy as np
4
+ from mani_skill.envs.sapien_env import BaseEnv
5
+ from mani_skill.utils import common, gym_utils
6
+ import argparse
7
+ import yaml
8
+ import torch
9
+ from collections import deque
10
+ from PIL import Image
11
+ import cv2
12
+ import imageio
13
+ from functools import partial
14
+ from torchvision.transforms.functional import center_crop
15
+
16
+ def parse_args(args=None):
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("-e", "--env-id", type=str, default="PickCube-v1", help="Environment to evaluate on.")
19
+ parser.add_argument("-o", "--obs-mode", type=str, default="rgb", help="Observation mode to use. Usually this is kept as 'none' since observations do not need to be stored; they can be replayed later via the mani_skill.trajectory.replay_trajectory script.")
20
+ parser.add_argument("-n", "--num-traj", type=int, default=25, help="Number of trajectories to generate.")
21
+ parser.add_argument("--only-count-success", action="store_true", help="If true, generates trajectories until num_traj of them are successful and only saves the successful trajectories/videos")
22
+ parser.add_argument("--reward-mode", type=str)
23
+ parser.add_argument("-b", "--sim-backend", type=str, default="auto", help="Which simulation backend to use. Can be 'auto', 'cpu', 'gpu'")
24
+ parser.add_argument("--render-mode", type=str, default="rgb_array", help="can be 'sensors' or 'rgb_array' which only affect what is saved to videos")
25
+ parser.add_argument("--vis", action="store_true", help="whether or not to open a GUI to visualize the solution live")
26
+ parser.add_argument("--save-video", action="store_true", help="whether or not to save videos locally")
27
+ parser.add_argument("--traj-name", type=str, help="The name of the trajectory .h5 file that will be created.")
28
+ parser.add_argument("--shader", default="default", type=str, help="Change shader used for rendering. Default is 'default' which is very fast. Can also be 'rt' for ray tracing and generating photo-realistic renders. Can also be 'rt-fast' for a faster but lower quality ray-traced renderer")
29
+ parser.add_argument("--record-dir", type=str, default="demos", help="where to save the recorded trajectories")
30
+ parser.add_argument("--num-procs", type=int, default=1, help="Number of processes to use to help parallelize the trajectory replay process. This uses CPU multiprocessing and only works with the CPU simulation backend at the moment.")
31
+ parser.add_argument("--random_seed", type=int, default=0, help="Random seed for the environment.")
32
+ parser.add_argument("--pretrained_path", type=str, default=None, help="Path to the pretrained model")
33
+ return parser.parse_args()
34
+
35
+ task2lang = {
36
+ "PegInsertionSide-v1": "Pick up a orange-white peg and insert the orange end into the box with a hole in it.",
37
+ "PickCube-v1": "Grasp a red cube and move it to a target goal position.",
38
+ "StackCube-v1": "Pick up a red cube and stack it on top of a green cube and let go of the cube without it falling.",
39
+ "PlugCharger-v1": "Pick up one of the misplaced shapes on the board/kit and insert it into the correct empty slot.",
40
+ "PushCube-v1": "Push and move a cube to a goal region in front of it."
41
+ }
42
+
43
+ import random
44
+ import os
45
+
46
+ args = parse_args()
47
+ seed = args.random_seed
48
+ random.seed(seed)
49
+ os.environ['PYTHONHASHSEED'] = str(seed)
50
+ np.random.seed(seed)
51
+ torch.manual_seed(seed)
52
+ torch.cuda.manual_seed(seed)
53
+ torch.backends.cudnn.deterministic = True
54
+ torch.backends.cudnn.benchmark = False
55
+
56
+ from transformers import AutoModelForVision2Seq, AutoProcessor
57
+
58
+ DATA_STAT = {'mean': [ 0.00263866, 0.01804881, -0.02151551, -0.00384866, 0.00500441,
59
+ -0.00057146, -0.26013601], 'std': [0.06639539, 0.1246438 , 0.09675793, 0.03351422, 0.04930534,
60
+ 0.25787726, 0.96762997], 'max': [0.31303197, 0.77948809, 0.42906255, 0.20186238, 0.63990456,
61
+ 0.99999917, 1. ], 'min': [-0.31464151, -0.64183694, -0.62718982, -0.5888508 , -0.97813392,
62
+ -0.99999928, -1. ], 'q01': [-0.18656027, -0.31995443, -0.24702898, -0.18005923, -0.2164692 ,
63
+ -0.82366071, -1. ], 'q99': [0.18384692, 0.45547636, 0.27452313, 0.03571117, 0.1188747 ,
64
+ 0.85074112, 1. ]}
65
+
66
+ MODEL_PATH = args.pretrained_path
67
+
68
+ def make_policy():
69
+ device = torch.device('cuda')
70
+
71
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
72
+ vla = AutoModelForVision2Seq.from_pretrained(
73
+ MODEL_PATH,
74
+ attn_implementation="flash_attention_2", # [Optional] Requires `flash_attn`
75
+ torch_dtype=torch.bfloat16,
76
+ low_cpu_mem_usage=True,
77
+ trust_remote_code=True
78
+ ).to(device)
79
+ vla.norm_stats["maniskill"] = {
80
+ "action": {
81
+ "min": np.array(DATA_STAT["min"]),
82
+ "max": np.array(DATA_STAT["max"]),
83
+ "mean": np.array(DATA_STAT["mean"]),
84
+ "std": np.array(DATA_STAT["std"]),
85
+ "q01": np.array(DATA_STAT["q01"]),
86
+ "q99": np.array(DATA_STAT["q99"]),
87
+ }
88
+ }
89
+
90
+ vla = vla.eval()
91
+
92
+ return vla, processor
93
+
94
+ vla, processor = make_policy()
95
+ success_counts = {}
96
+
97
+ for env_id in task2lang.keys():
98
+
99
+ env = gym.make(
100
+ env_id,
101
+ obs_mode=args.obs_mode,
102
+ control_mode="pd_ee_delta_pose",
103
+ render_mode=args.render_mode,
104
+ reward_mode="dense" if args.reward_mode is None else args.reward_mode,
105
+ sensor_configs=dict(shader_pack=args.shader),
106
+ human_render_camera_configs=dict(shader_pack=args.shader),
107
+ viewer_camera_configs=dict(shader_pack=args.shader),
108
+ sim_backend=args.sim_backend
109
+ )
110
+
111
+ MAX_EPISODE_STEPS = 400
112
+ total_episodes = args.num_traj
113
+ success_count = 0
114
+ base_seed = 20241201
115
+ import tqdm
116
+
117
+ for episode in tqdm.trange(total_episodes):
118
+ obs_window = deque(maxlen=2)
119
+ obs, _ = env.reset(seed = base_seed + episode)
120
+
121
+ img = env.render().squeeze(0).detach().cpu().numpy()
122
+ obs_window.append(img)
123
+
124
+ global_steps = 0
125
+ video_frames = []
126
+
127
+ success_time = 0
128
+ done = False
129
+
130
+ while global_steps < MAX_EPISODE_STEPS and not done:
131
+ obs = obs_window[-1]
132
+ image_arrs = [
133
+ obs_window[-1]
134
+ ]
135
+ images = [Image.fromarray(arr) for arr in image_arrs]
136
+ original_size = images[0].size
137
+ crop_scale = 0.9
138
+ sqrt_crop_scale = crop_scale
139
+ sqrt_crop_scale = np.sqrt(crop_scale)
140
+ images = [
141
+ center_crop(
142
+ img, output_size=(
143
+ int(sqrt_crop_scale * img.size[1]),
144
+ int(sqrt_crop_scale * img.size[0])
145
+ )
146
+ ) for img in images
147
+ ]
148
+ images = [img.resize(original_size, Image.Resampling.BILINEAR) for img in images]
149
+ # de-capitalize and remove trailing period
150
+ instruction = task2lang[env_id].lower()
151
+ prompt = f"In: What action should the robot take to {instruction}?\nOut:"
152
+ inputs = processor(prompt, images).to("cuda:0", dtype=torch.bfloat16)
153
+ actions = vla.predict_action(**inputs, unnorm_key="maniskill", do_sample=False)[None]
154
+ for idx in range(actions.shape[0]):
155
+ action = actions[idx]
156
+ # print(action)
157
+ # action = action * (np.array(DATA_STAT['std']) + 1e-8) + np.array(DATA_STAT['mean'])
158
+ obs, reward, terminated, truncated, info = env.step(action)
159
+ img = env.render().squeeze(0).detach().cpu().numpy()
160
+ obs_window.append(img)
161
+ video_frames.append(img)
162
+ global_steps += 1
163
+ if terminated or truncated:
164
+ assert "success" in info, sorted(info.keys())
165
+ if info['success']:
166
+ success_count += 1
167
+ done = True
168
+ break
169
+ print(f"Trial {episode+1} finished, success: {info['success']}, steps: {global_steps}")
170
+ success_counts[env_id] = success_count
171
+ print(f"Task {env_id} finished, success: {success_count}/{total_episodes}")
172
+
173
+ log_file = "results_ovla_all.log"
174
+ with open(log_file, 'a') as f:
175
+ f.write(f"{seed}:{success_counts}\n")
eval_sim/eval_rdt_maniskill.py ADDED
@@ -0,0 +1,137 @@
1
+ from typing import Callable, List, Type
2
+ import sys
3
+ sys.path.append('/')
4
+ import gymnasium as gym
5
+ import numpy as np
6
+ from mani_skill.envs.sapien_env import BaseEnv
7
+ from mani_skill.utils import common, gym_utils
8
+ import argparse
9
+ import yaml
10
+ from scripts.maniskill_model import create_model, RoboticDiffusionTransformerModel
11
+ import torch
12
+ from collections import deque
13
+ from PIL import Image
14
+ import cv2
15
+
16
+ def parse_args(args=None):
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("-e", "--env-id", type=str, default="PickCube-v1", help="Environment to evaluate on.")
19
+ parser.add_argument("-o", "--obs-mode", type=str, default="rgb", help="Observation mode to use. Usually this is kept as 'none' since observations do not need to be stored; they can be replayed later via the mani_skill.trajectory.replay_trajectory script.")
20
+ parser.add_argument("-n", "--num-traj", type=int, default=25, help="Number of trajectories to test.")
21
+ parser.add_argument("--only-count-success", action="store_true", help="If true, generates trajectories until num_traj of them are successful and only saves the successful trajectories/videos")
22
+ parser.add_argument("--reward-mode", type=str)
23
+ parser.add_argument("-b", "--sim-backend", type=str, default="auto", help="Which simulation backend to use. Can be 'auto', 'cpu', 'gpu'")
24
+ parser.add_argument("--render-mode", type=str, default="rgb_array", help="can be 'sensors' or 'rgb_array' which only affect what is saved to videos")
25
+ parser.add_argument("--shader", default="default", type=str, help="Change shader used for rendering. Default is 'default' which is very fast. Can also be 'rt' for ray tracing and generating photo-realistic renders. Can also be 'rt-fast' for a faster but lower quality ray-traced renderer")
26
+ parser.add_argument("--num-procs", type=int, default=1, help="Number of processes to use to help parallelize the trajectory replay process. This uses CPU multiprocessing and only works with the CPU simulation backend at the moment.")
27
+ parser.add_argument("--pretrained_path", type=str, default=None, help="Path to the pretrained model")
28
+ parser.add_argument("--random_seed", type=int, default=0, help="Random seed for the environment.")
29
+ return parser.parse_args()
30
+
31
+ import random
32
+ import os
33
+
34
+ # set cuda
35
+ args = parse_args()
36
+ # set random seeds
37
+ seed = args.random_seed
38
+ random.seed(seed)
39
+ os.environ['PYTHONHASHSEED'] = str(seed)
40
+ np.random.seed(seed)
41
+ torch.manual_seed(seed)
42
+ torch.cuda.manual_seed(seed)
43
+ torch.backends.cudnn.deterministic = True
44
+ torch.backends.cudnn.benchmark = False
45
+
46
+ task2lang = {
47
+ "PegInsertionSide-v1": "Pick up a orange-white peg and insert the orange end into the box with a hole in it.",
48
+ "PickCube-v1": "Grasp a red cube and move it to a target goal position.",
49
+ "StackCube-v1": "Pick up a red cube and stack it on top of a green cube and let go of the cube without it falling.",
50
+ "PlugCharger-v1": "Pick up one of the misplaced shapes on the board/kit and insert it into the correct empty slot.",
51
+ "PushCube-v1": "Push and move a cube to a goal region in front of it."
52
+ }
53
+
54
+ env_id = args.env_id
55
+ env = gym.make(
56
+ env_id,
57
+ obs_mode=args.obs_mode,
58
+ control_mode="pd_joint_pos",
59
+ render_mode=args.render_mode,
60
+ reward_mode="dense" if args.reward_mode is None else args.reward_mode,
61
+ sensor_configs=dict(shader_pack=args.shader),
62
+ human_render_camera_configs=dict(shader_pack=args.shader),
63
+ viewer_camera_configs=dict(shader_pack=args.shader),
64
+ sim_backend=args.sim_backend
65
+ )
66
+
67
+ config_path = 'configs/base.yaml'
68
+ with open(config_path, "r") as fp:
69
+ config = yaml.safe_load(fp)
70
+ pretrained_text_encoder_name_or_path = "google/t5-v1_1-xxl"
71
+ pretrained_vision_encoder_name_or_path = "google/siglip-so400m-patch14-384"
72
+ pretrained_path = args.pretrained_path
73
+ policy = create_model(
74
+ args=config,
75
+ dtype=torch.bfloat16,
76
+ pretrained=pretrained_path,
77
+ pretrained_text_encoder_name_or_path=pretrained_text_encoder_name_or_path,
78
+ pretrained_vision_encoder_name_or_path=pretrained_vision_encoder_name_or_path
79
+ )
80
+
81
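+ # Cache the instruction embedding on disk so repeated evaluations skip re-encoding it with the T5 text encoder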
+ if os.path.exists(f'text_embed_{env_id}.pt'):
82
+ text_embed = torch.load(f'text_embed_{env_id}.pt')
83
+ else:
84
+ text_embed = policy.encode_instruction(task2lang[env_id])
85
+ torch.save(text_embed, f'text_embed_{env_id}.pt')
86
+
87
+ MAX_EPISODE_STEPS = 400
88
+ total_episodes = args.num_traj
89
+ success_count = 0
90
+
91
+ base_seed = 20241201
92
+ import tqdm
93
+ for episode in tqdm.trange(total_episodes):
94
+ obs_window = deque(maxlen=2)
95
+ obs, _ = env.reset(seed = episode + base_seed)
96
+ policy.reset()
97
+
98
+ img = env.render().squeeze(0).detach().cpu().numpy()
99
+ obs_window.append(None)
100
+ obs_window.append(np.array(img))
101
+ proprio = obs['agent']['qpos'][:, :-1]
102
+
103
+ global_steps = 0
104
+ video_frames = []
105
+
106
+ success_time = 0
107
+ done = False
108
+
109
+ while global_steps < MAX_EPISODE_STEPS and not done:
110
+ image_arrs = []
111
+ for window_img in obs_window:
112
+ image_arrs.append(window_img)
113
+ image_arrs.append(None)
114
+ image_arrs.append(None)
115
+ images = [Image.fromarray(arr) if arr is not None else None
116
+ for arr in image_arrs]
117
+ actions = policy.step(proprio, images, text_embed).squeeze(0).cpu().numpy()
118
+ # Take 8 steps since RDT is trained to predict interpolated 64 steps(actual 14 steps)
119
+ actions = actions[::4, :]
120
+ for idx in range(actions.shape[0]):
121
+ action = actions[idx]
122
+ obs, reward, terminated, truncated, info = env.step(action)
123
+ img = env.render().squeeze(0).detach().cpu().numpy()
124
+ obs_window.append(img)
125
+ proprio = obs['agent']['qpos'][:, :-1]
126
+ video_frames.append(img)
127
+ global_steps += 1
128
+ if terminated or truncated:
129
+ assert "success" in info, sorted(info.keys())
130
+ if info['success']:
131
+ success_count += 1
132
+ done = True
133
+ break
134
+ print(f"Trial {episode+1} finished, success: {info['success']}, steps: {global_steps}")
135
+
136
+ success_rate = success_count / total_episodes * 100
137
+ print(f"Success rate: {success_rate}%")
lang_embed/aloha_dish_drainer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:903018f06a23f7d8b97480b5bf304442f0593a4d854dc9e0e0fd70822c52b82e
3
+ size 99667
lang_embed/aloha_handover_box.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a343452e908910230df6ca0045320e121f2f59969314c6a1a09af88192b5e81
3
+ size 91475
lang_embed/aloha_lift_box.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e2953aa4aad84e687f08f5819f3b3e6c3d4671cbb1d95539dacf4dcad2e9142
3
+ size 83263
lang_embed/aloha_shoes_table.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f606c5cbb856de1c7a362d7ab437d098abc76e78b2b731f77ded048851b87900
3
+ size 132494
lang_embed/anubis_brush_to_pan.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1155d325416a6d1be4a70bb49d7b15272ef0d75dc8a0437a42a9f4660c607a49
3
+ size 66904
lang_embed/anubis_carrot_to_bag.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394c01aa62fbfd6fdf3d6b53684b10e377820b0a4d0c9425e08b549862beedb6
3
+ size 83293
lang_embed/anubis_towel_kirby.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e180dc03d6b6841810bafa499aeee1c43d45a9206b6401426b105bad1fd966
3
+ size 83283
scripts/agilex_inference.py ADDED
@@ -0,0 +1,658 @@
1
+ #!/home/lin/software/miniconda3/envs/aloha/bin/python
2
+ # -- coding: UTF-8
3
+ """
4
+ #!/usr/bin/python3
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import threading
10
+ import time
11
+ import yaml
12
+ from collections import deque
13
+
14
+ import numpy as np
15
+ import rospy
16
+ import torch
17
+ from cv_bridge import CvBridge
18
+ from geometry_msgs.msg import Twist
19
+ from nav_msgs.msg import Odometry
20
+ from PIL import Image as PImage
21
+ from sensor_msgs.msg import Image, JointState
22
+ from std_msgs.msg import Header
23
+ import cv2
24
+
25
+ from scripts.agilex_model import create_model
26
+
27
+ # sys.path.append("./")
28
+
29
+ CAMERA_NAMES = ['cam_high', 'cam_right_wrist', 'cam_left_wrist']
30
+
31
+ observation_window = None
32
+
33
+ lang_embeddings = None
34
+
35
+ # debug
36
+ preload_images = None
37
+
38
+
39
+ # Initialize the model
40
+ def make_policy(args):
41
+ with open(args.config_path, "r") as fp:
42
+ config = yaml.safe_load(fp)
43
+ args.config = config
44
+
45
+ # pretrained_text_encoder_name_or_path = "google/t5-v1_1-xxl"
46
+ pretrained_vision_encoder_name_or_path = "google/siglip-so400m-patch14-384"
47
+ model = create_model(
48
+ args=args.config,
49
+ dtype=torch.bfloat16,
50
+ pretrained=args.pretrained_model_name_or_path,
51
+ # pretrained_text_encoder_name_or_path=pretrained_text_encoder_name_or_path,
52
+ pretrained_vision_encoder_name_or_path=pretrained_vision_encoder_name_or_path,
53
+ control_frequency=args.ctrl_freq,
54
+ )
55
+
56
+ return model
57
+
58
+
59
+ def set_seed(seed):
60
+ torch.manual_seed(seed)
61
+ np.random.seed(seed)
62
+
63
+
64
+ # Interpolate the actions to make the robot move smoothly
65
+ def interpolate_action(args, prev_action, cur_action):
66
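+ # Linearly interpolate from prev_action to cur_action so that no joint moves
+ # more than arm_steps_length within a single published step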
+ steps = np.concatenate((np.array(args.arm_steps_length), np.array(args.arm_steps_length)), axis=0)
67
+ diff = np.abs(cur_action - prev_action)
68
+ step = np.ceil(diff / steps).astype(int)
69
+ step = np.max(step)
70
+ if step <= 1:
71
+ return cur_action[np.newaxis, :]
72
+ new_actions = np.linspace(prev_action, cur_action, step + 1)
73
+ return new_actions[1:]
74
+
75
+
76
+ def get_config(args):
77
+ config = {
78
+ 'episode_len': args.max_publish_step,
79
+ 'state_dim': 14,
80
+ 'chunk_size': args.chunk_size,
81
+ 'camera_names': CAMERA_NAMES,
82
+ }
83
+ return config
84
+
85
+
86
+ # Get the observation from the ROS topic
87
+ def get_ros_observation(args,ros_operator):
88
+ rate = rospy.Rate(args.publish_rate)
89
+ print_flag = True
90
+
91
+ while True and not rospy.is_shutdown():
92
+ result = ros_operator.get_frame()
93
+ if not result:
94
+ if print_flag:
95
+ print("sync failed in get_ros_observation")
96
+ print_flag = False
97
+ rate.sleep()
98
+ continue
99
+ print_flag = True
100
+ (img_front, img_left, img_right, img_front_depth, img_left_depth, img_right_depth,
101
+ puppet_arm_left, puppet_arm_right, robot_base) = result
102
+ # print(f"sync success when get_ros_observation")
103
+ return (img_front, img_left, img_right,
104
+ puppet_arm_left, puppet_arm_right)
105
+
106
+
107
+ # Update the observation window buffer
108
+ def update_observation_window(args, config, ros_operator):
109
+ # JPEG transformation
110
+ # Align with training
111
+ def jpeg_mapping(img):
112
+ img = cv2.imencode('.jpg', img)[1].tobytes()
113
+ img = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR)
114
+ return img
115
+
116
+ global observation_window
117
+ if observation_window is None:
118
+ observation_window = deque(maxlen=2)
119
+
120
+ # Append the first dummy image
121
+ observation_window.append(
122
+ {
123
+ 'qpos': None,
124
+ 'images':
125
+ {
126
+ config["camera_names"][0]: None,
127
+ config["camera_names"][1]: None,
128
+ config["camera_names"][2]: None,
129
+ },
130
+ }
131
+ )
132
+
133
+ img_front, img_left, img_right, puppet_arm_left, puppet_arm_right = get_ros_observation(args,ros_operator)
134
+ img_front = jpeg_mapping(img_front)
135
+ img_left = jpeg_mapping(img_left)
136
+ img_right = jpeg_mapping(img_right)
137
+
138
+ qpos = np.concatenate(
139
+ (np.array(puppet_arm_left.position), np.array(puppet_arm_right.position)), axis=0)
140
+ qpos = torch.from_numpy(qpos).float().cuda()
141
+ observation_window.append(
142
+ {
143
+ 'qpos': qpos,
144
+ 'images':
145
+ {
146
+ config["camera_names"][0]: img_front,
147
+ config["camera_names"][1]: img_right,
148
+ config["camera_names"][2]: img_left,
149
+ },
150
+ }
151
+ )
152
+
153
+
154
+ # RDT inference
155
+ def inference_fn(args, config, policy, t):
156
+ global observation_window
157
+ global lang_embeddings
158
+
159
+ # print(f"Start inference_thread_fn: t={t}")
160
+ while True and not rospy.is_shutdown():
161
+ time1 = time.time()
162
+
163
+ # fetch images in sequence [front, right, left]
164
+ image_arrs = [
165
+ observation_window[-2]['images'][config['camera_names'][0]],
166
+ observation_window[-2]['images'][config['camera_names'][1]],
167
+ observation_window[-2]['images'][config['camera_names'][2]],
168
+
169
+ observation_window[-1]['images'][config['camera_names'][0]],
170
+ observation_window[-1]['images'][config['camera_names'][1]],
171
+ observation_window[-1]['images'][config['camera_names'][2]]
172
+ ]
173
+
174
+ # fetch debug images in sequence [front, right, left]
175
+ # image_arrs = [
176
+ # preload_images[config['camera_names'][0]][max(t - 1, 0)],
177
+ # preload_images[config['camera_names'][2]][max(t - 1, 0)],
178
+ # preload_images[config['camera_names'][1]][max(t - 1, 0)],
179
+ # preload_images[config['camera_names'][0]][t],
180
+ # preload_images[config['camera_names'][2]][t],
181
+ # preload_images[config['camera_names'][1]][t]
182
+ # ]
183
+ # # encode the images
184
+ # for i in range(len(image_arrs)):
185
+ # image_arrs[i] = cv2.imdecode(np.frombuffer(image_arrs[i], np.uint8), cv2.IMREAD_COLOR)
186
+ # proprio = torch.from_numpy(preload_images['qpos'][t]).float().cuda()
187
+
188
+ images = [PImage.fromarray(arr) if arr is not None else None
189
+ for arr in image_arrs]
190
+
191
+ # for i, pos in enumerate(['f', 'r', 'l'] * 2):
192
+ # images[i].save(f'{t}-{i}-{pos}.png')
193
+
194
+ # get last qpos in shape [14, ]
195
+ proprio = observation_window[-1]['qpos']
196
+ # unsqueeze to [1, 14]
197
+ proprio = proprio.unsqueeze(0)
198
+
199
+ # actions shaped as [1, 64, 14] in format [left, right]
200
+ actions = policy.step(
201
+ proprio=proprio,
202
+ images=images,
203
+ text_embeds=lang_embeddings
204
+ ).squeeze(0).cpu().numpy()
205
+ # print(f"inference_actions: {actions.squeeze()}")
206
+
207
+ print(f"Model inference time: {time.time() - time1} s")
208
+
209
+ # print(f"Finish inference_thread_fn: t={t}")
210
+ return actions
211
+
212
+
213
+ # Main loop for the manipulation task
214
+ def model_inference(args, config, ros_operator):
215
+ global lang_embeddings
216
+
217
+ # Load rdt model
218
+ policy = make_policy(args)
219
+
220
+ lang_dict = torch.load(args.lang_embeddings_path)
221
+ print(f"Running with instruction: \"{lang_dict['instruction']}\" from \"{lang_dict['name']}\"")
222
+ lang_embeddings = lang_dict["embeddings"]
223
+
224
+ max_publish_step = config['episode_len']
225
+ chunk_size = config['chunk_size']
226
+
227
+ # Initialize position of the puppet arm
228
+ left0 = [-0.00133514404296875, 0.00209808349609375, 0.01583099365234375, -0.032616615295410156, -0.00286102294921875, 0.00095367431640625, 3.557830810546875]
229
+ right0 = [-0.00133514404296875, 0.00438690185546875, 0.034523963928222656, -0.053597450256347656, -0.00476837158203125, -0.00209808349609375, 3.557830810546875]
230
+ left1 = [-0.00133514404296875, 0.00209808349609375, 0.01583099365234375, -0.032616615295410156, -0.00286102294921875, 0.00095367431640625, -0.3393220901489258]
231
+ right1 = [-0.00133514404296875, 0.00247955322265625, 0.01583099365234375, -0.032616615295410156, -0.00286102294921875, 0.00095367431640625, -0.3397035598754883]
232
+ ros_operator.puppet_arm_publish_continuous(left0, right0)
233
+ input("Press enter to continue")
234
+ ros_operator.puppet_arm_publish_continuous(left1, right1)
235
+ # Initialize the previous action to be the initial robot state
236
+ pre_action = np.zeros(config['state_dim'])
237
+ pre_action[:14] = np.array(
238
+ [-0.00133514404296875, 0.00209808349609375, 0.01583099365234375, -0.032616615295410156, -0.00286102294921875, 0.00095367431640625, -0.3393220901489258] +
239
+ [-0.00133514404296875, 0.00247955322265625, 0.01583099365234375, -0.032616615295410156, -0.00286102294921875, 0.00095367431640625, -0.3397035598754883]
240
+ )
241
+ action = None
242
+ # Inference loop
243
+ with torch.inference_mode():
244
+ while True and not rospy.is_shutdown():
245
+ # The current time step
246
+ t = 0
247
+ rate = rospy.Rate(args.publish_rate)
248
+
249
+ action_buffer = np.zeros([chunk_size, config['state_dim']])
250
+
251
+ while t < max_publish_step and not rospy.is_shutdown():
252
+ # Update observation window
253
+ update_observation_window(args, config, ros_operator)
254
+
255
+ # When coming to the end of the action chunk
256
+ if t % chunk_size == 0:
257
+ # Start inference
258
+ action_buffer = inference_fn(args, config, policy, t).copy()
259
+
260
+ raw_action = action_buffer[t % chunk_size]
261
+ action = raw_action
262
+ # Interpolate the original action sequence
263
+ if args.use_actions_interpolation:
264
+ # print(f"Time {t}, pre {pre_action}, act {action}")
265
+ interp_actions = interpolate_action(args, pre_action, action)
266
+ else:
267
+ interp_actions = action[np.newaxis, :]
268
+ # Execute the interpolated actions one by one
269
+ for act in interp_actions:
270
+ left_action = act[:7]
271
+ right_action = act[7:14]
272
+
273
+ if not args.disable_puppet_arm:
274
+ ros_operator.puppet_arm_publish(left_action, right_action) # puppet_arm_publish_continuous_thread
275
+
276
+ if args.use_robot_base:
277
+ vel_action = act[14:16]
278
+ ros_operator.robot_base_publish(vel_action)
279
+ rate.sleep()
280
+ # print(f"doing action: {act}")
281
+ t += 1
282
+
283
+ print("Published Step", t)
284
+ pre_action = action.copy()
285
+
286
+
287
+ # ROS operator class
288
+ class RosOperator:
289
+ def __init__(self, args):
290
+ self.robot_base_deque = None
291
+ self.puppet_arm_right_deque = None
292
+ self.puppet_arm_left_deque = None
293
+ self.img_front_deque = None
294
+ self.img_right_deque = None
295
+ self.img_left_deque = None
296
+ self.img_front_depth_deque = None
297
+ self.img_right_depth_deque = None
298
+ self.img_left_depth_deque = None
299
+ self.bridge = None
300
+ self.puppet_arm_left_publisher = None
301
+ self.puppet_arm_right_publisher = None
302
+ self.robot_base_publisher = None
303
+ self.puppet_arm_publish_thread = None
304
+ self.puppet_arm_publish_lock = None
305
+ self.args = args
306
+ self.init()
307
+ self.init_ros()
308
+
309
+ def init(self):
310
+ self.bridge = CvBridge()
311
+ self.img_left_deque = deque()
312
+ self.img_right_deque = deque()
313
+ self.img_front_deque = deque()
314
+ self.img_left_depth_deque = deque()
315
+ self.img_right_depth_deque = deque()
316
+ self.img_front_depth_deque = deque()
317
+ self.puppet_arm_left_deque = deque()
318
+ self.puppet_arm_right_deque = deque()
319
+ self.robot_base_deque = deque()
320
+ self.puppet_arm_publish_lock = threading.Lock()
321
+ self.puppet_arm_publish_lock.acquire()
322
+
323
+ def puppet_arm_publish(self, left, right):
324
+ joint_state_msg = JointState()
325
+ joint_state_msg.header = Header()
326
+ joint_state_msg.header.stamp = rospy.Time.now() # Set the timestamp
327
+ joint_state_msg.name = ['joint0', 'joint1', 'joint2', 'joint3', 'joint4', 'joint5', 'joint6'] # Set the joint names
328
+ joint_state_msg.position = left
329
+ self.puppet_arm_left_publisher.publish(joint_state_msg)
330
+ joint_state_msg.position = right
331
+ self.puppet_arm_right_publisher.publish(joint_state_msg)
332
+
333
+ def robot_base_publish(self, vel):
334
+ vel_msg = Twist()
335
+ vel_msg.linear.x = vel[0]
336
+ vel_msg.linear.y = 0
337
+ vel_msg.linear.z = 0
338
+ vel_msg.angular.x = 0
339
+ vel_msg.angular.y = 0
340
+ vel_msg.angular.z = vel[1]
341
+ self.robot_base_publisher.publish(vel_msg)
342
+
343
+ def puppet_arm_publish_continuous(self, left, right):
344
+ rate = rospy.Rate(self.args.publish_rate)
345
+ left_arm = None
346
+ right_arm = None
347
+ while True and not rospy.is_shutdown():
348
+ if len(self.puppet_arm_left_deque) != 0:
349
+ left_arm = list(self.puppet_arm_left_deque[-1].position)
350
+ if len(self.puppet_arm_right_deque) != 0:
351
+ right_arm = list(self.puppet_arm_right_deque[-1].position)
352
+ if left_arm is None or right_arm is None:
353
+ rate.sleep()
354
+ continue
355
+ else:
356
+ break
357
+ left_symbol = [1 if left[i] - left_arm[i] > 0 else -1 for i in range(len(left))]
358
+ right_symbol = [1 if right[i] - right_arm[i] > 0 else -1 for i in range(len(right))]
359
+ flag = True
360
+ step = 0
361
+ while flag and not rospy.is_shutdown():
362
+ if self.puppet_arm_publish_lock.acquire(False):
363
+ return
364
+ left_diff = [abs(left[i] - left_arm[i]) for i in range(len(left))]
365
+ right_diff = [abs(right[i] - right_arm[i]) for i in range(len(right))]
366
+ flag = False
367
+ for i in range(len(left)):
368
+ if left_diff[i] < self.args.arm_steps_length[i]:
369
+ left_arm[i] = left[i]
370
+ else:
371
+ left_arm[i] += left_symbol[i] * self.args.arm_steps_length[i]
372
+ flag = True
373
+ for i in range(len(right)):
374
+ if right_diff[i] < self.args.arm_steps_length[i]:
375
+ right_arm[i] = right[i]
376
+ else:
377
+ right_arm[i] += right_symbol[i] * self.args.arm_steps_length[i]
378
+ flag = True
379
+ joint_state_msg = JointState()
380
+ joint_state_msg.header = Header()
381
+ joint_state_msg.header.stamp = rospy.Time.now() # Set the timestamp
382
+ joint_state_msg.name = ['joint0', 'joint1', 'joint2', 'joint3', 'joint4', 'joint5', 'joint6'] # Set the joint names
383
+ joint_state_msg.position = left_arm
384
+ self.puppet_arm_left_publisher.publish(joint_state_msg)
385
+ joint_state_msg.position = right_arm
386
+ self.puppet_arm_right_publisher.publish(joint_state_msg)
387
+ step += 1
388
+ print("puppet_arm_publish_continuous:", step)
389
+ rate.sleep()
390
+
391
+ def puppet_arm_publish_linear(self, left, right):
392
+ num_step = 100
393
+ rate = rospy.Rate(200)
394
+
395
+ left_arm = None
396
+ right_arm = None
397
+
398
+ while True and not rospy.is_shutdown():
399
+ if len(self.puppet_arm_left_deque) != 0:
400
+ left_arm = list(self.puppet_arm_left_deque[-1].position)
401
+ if len(self.puppet_arm_right_deque) != 0:
402
+ right_arm = list(self.puppet_arm_right_deque[-1].position)
403
+ if left_arm is None or right_arm is None:
404
+ rate.sleep()
405
+ continue
406
+ else:
407
+ break
408
+
409
+ traj_left_list = np.linspace(left_arm, left, num_step)
410
+ traj_right_list = np.linspace(right_arm, right, num_step)
411
+
412
+ for i in range(len(traj_left_list)):
413
+ traj_left = traj_left_list[i]
414
+ traj_right = traj_right_list[i]
415
+ traj_left[-1] = left[-1]
416
+ traj_right[-1] = right[-1]
417
+ joint_state_msg = JointState()
418
+ joint_state_msg.header = Header()
419
+ joint_state_msg.header.stamp = rospy.Time.now() # Set the timestamp
420
+ joint_state_msg.name = ['joint0', 'joint1', 'joint2', 'joint3', 'joint4', 'joint5', 'joint6'] # Set the joint names
421
+ joint_state_msg.position = traj_left
422
+ self.puppet_arm_left_publisher.publish(joint_state_msg)
423
+ joint_state_msg.position = traj_right
424
+ self.puppet_arm_right_publisher.publish(joint_state_msg)
425
+ rate.sleep()
426
+
427
+ def puppet_arm_publish_continuous_thread(self, left, right):
428
+ if self.puppet_arm_publish_thread is not None:
429
+ self.puppet_arm_publish_lock.release()
430
+ self.puppet_arm_publish_thread.join()
431
+ self.puppet_arm_publish_lock.acquire(False)
432
+ self.puppet_arm_publish_thread = None
433
+ self.puppet_arm_publish_thread = threading.Thread(target=self.puppet_arm_publish_continuous, args=(left, right))
434
+ self.puppet_arm_publish_thread.start()
435
+
436
+ def get_frame(self):
437
+ if len(self.img_left_deque) == 0 or len(self.img_right_deque) == 0 or len(self.img_front_deque) == 0 or \
438
+ (self.args.use_depth_image and (len(self.img_left_depth_deque) == 0 or len(self.img_right_depth_deque) == 0 or len(self.img_front_depth_deque) == 0)):
439
+ return False
440
+ if self.args.use_depth_image:
441
+ frame_time = min([self.img_left_deque[-1].header.stamp.to_sec(), self.img_right_deque[-1].header.stamp.to_sec(), self.img_front_deque[-1].header.stamp.to_sec(),
442
+ self.img_left_depth_deque[-1].header.stamp.to_sec(), self.img_right_depth_deque[-1].header.stamp.to_sec(), self.img_front_depth_deque[-1].header.stamp.to_sec()])
443
+ else:
444
+ frame_time = min([self.img_left_deque[-1].header.stamp.to_sec(), self.img_right_deque[-1].header.stamp.to_sec(), self.img_front_deque[-1].header.stamp.to_sec()])
445
+
446
+ if len(self.img_left_deque) == 0 or self.img_left_deque[-1].header.stamp.to_sec() < frame_time:
447
+ return False
448
+ if len(self.img_right_deque) == 0 or self.img_right_deque[-1].header.stamp.to_sec() < frame_time:
449
+ return False
450
+ if len(self.img_front_deque) == 0 or self.img_front_deque[-1].header.stamp.to_sec() < frame_time:
451
+ return False
452
+ if len(self.puppet_arm_left_deque) == 0 or self.puppet_arm_left_deque[-1].header.stamp.to_sec() < frame_time:
453
+ return False
454
+ if len(self.puppet_arm_right_deque) == 0 or self.puppet_arm_right_deque[-1].header.stamp.to_sec() < frame_time:
455
+ return False
456
+ if self.args.use_depth_image and (len(self.img_left_depth_deque) == 0 or self.img_left_depth_deque[-1].header.stamp.to_sec() < frame_time):
457
+ return False
458
+ if self.args.use_depth_image and (len(self.img_right_depth_deque) == 0 or self.img_right_depth_deque[-1].header.stamp.to_sec() < frame_time):
459
+ return False
460
+ if self.args.use_depth_image and (len(self.img_front_depth_deque) == 0 or self.img_front_depth_deque[-1].header.stamp.to_sec() < frame_time):
461
+ return False
462
+ if self.args.use_robot_base and (len(self.robot_base_deque) == 0 or self.robot_base_deque[-1].header.stamp.to_sec() < frame_time):
463
+ return False
464
+
465
+ while self.img_left_deque[0].header.stamp.to_sec() < frame_time:
466
+ self.img_left_deque.popleft()
467
+ img_left = self.bridge.imgmsg_to_cv2(self.img_left_deque.popleft(), 'passthrough')
468
+
469
+ while self.img_right_deque[0].header.stamp.to_sec() < frame_time:
470
+ self.img_right_deque.popleft()
471
+ img_right = self.bridge.imgmsg_to_cv2(self.img_right_deque.popleft(), 'passthrough')
472
+
473
+ while self.img_front_deque[0].header.stamp.to_sec() < frame_time:
474
+ self.img_front_deque.popleft()
475
+ img_front = self.bridge.imgmsg_to_cv2(self.img_front_deque.popleft(), 'passthrough')
476
+
477
+ while self.puppet_arm_left_deque[0].header.stamp.to_sec() < frame_time:
478
+ self.puppet_arm_left_deque.popleft()
479
+ puppet_arm_left = self.puppet_arm_left_deque.popleft()
480
+
481
+ while self.puppet_arm_right_deque[0].header.stamp.to_sec() < frame_time:
482
+ self.puppet_arm_right_deque.popleft()
483
+ puppet_arm_right = self.puppet_arm_right_deque.popleft()
484
+
485
+ img_left_depth = None
486
+ if self.args.use_depth_image:
487
+ while self.img_left_depth_deque[0].header.stamp.to_sec() < frame_time:
488
+ self.img_left_depth_deque.popleft()
489
+ img_left_depth = self.bridge.imgmsg_to_cv2(self.img_left_depth_deque.popleft(), 'passthrough')
490
+
491
+ img_right_depth = None
492
+ if self.args.use_depth_image:
493
+ while self.img_right_depth_deque[0].header.stamp.to_sec() < frame_time:
494
+ self.img_right_depth_deque.popleft()
495
+ img_right_depth = self.bridge.imgmsg_to_cv2(self.img_right_depth_deque.popleft(), 'passthrough')
496
+
497
+ img_front_depth = None
498
+ if self.args.use_depth_image:
499
+ while self.img_front_depth_deque[0].header.stamp.to_sec() < frame_time:
500
+ self.img_front_depth_deque.popleft()
501
+ img_front_depth = self.bridge.imgmsg_to_cv2(self.img_front_depth_deque.popleft(), 'passthrough')
502
+
503
+ robot_base = None
504
+ if self.args.use_robot_base:
505
+ while self.robot_base_deque[0].header.stamp.to_sec() < frame_time:
506
+ self.robot_base_deque.popleft()
507
+ robot_base = self.robot_base_deque.popleft()
508
+
509
+ return (img_front, img_left, img_right, img_front_depth, img_left_depth, img_right_depth,
510
+ puppet_arm_left, puppet_arm_right, robot_base)
511
+
512
+ def img_left_callback(self, msg):
513
+ if len(self.img_left_deque) >= 2000:
514
+ self.img_left_deque.popleft()
515
+ self.img_left_deque.append(msg)
516
+
517
+ def img_right_callback(self, msg):
518
+ if len(self.img_right_deque) >= 2000:
519
+ self.img_right_deque.popleft()
520
+ self.img_right_deque.append(msg)
521
+
522
+ def img_front_callback(self, msg):
523
+ if len(self.img_front_deque) >= 2000:
524
+ self.img_front_deque.popleft()
525
+ self.img_front_deque.append(msg)
526
+
527
+ def img_left_depth_callback(self, msg):
528
+ if len(self.img_left_depth_deque) >= 2000:
529
+ self.img_left_depth_deque.popleft()
530
+ self.img_left_depth_deque.append(msg)
531
+
532
+ def img_right_depth_callback(self, msg):
533
+ if len(self.img_right_depth_deque) >= 2000:
534
+ self.img_right_depth_deque.popleft()
535
+ self.img_right_depth_deque.append(msg)
536
+
537
+ def img_front_depth_callback(self, msg):
538
+ if len(self.img_front_depth_deque) >= 2000:
539
+ self.img_front_depth_deque.popleft()
540
+ self.img_front_depth_deque.append(msg)
541
+
542
+ def puppet_arm_left_callback(self, msg):
543
+ if len(self.puppet_arm_left_deque) >= 2000:
544
+ self.puppet_arm_left_deque.popleft()
545
+ self.puppet_arm_left_deque.append(msg)
546
+
547
+ def puppet_arm_right_callback(self, msg):
548
+ if len(self.puppet_arm_right_deque) >= 2000:
549
+ self.puppet_arm_right_deque.popleft()
550
+ self.puppet_arm_right_deque.append(msg)
551
+
552
+ def robot_base_callback(self, msg):
553
+ if len(self.robot_base_deque) >= 2000:
554
+ self.robot_base_deque.popleft()
555
+ self.robot_base_deque.append(msg)
556
+
557
+ def init_ros(self):
558
+ rospy.init_node('joint_state_publisher', anonymous=True)
559
+ rospy.Subscriber(self.args.img_left_topic, Image, self.img_left_callback, queue_size=1000, tcp_nodelay=True)
560
+ rospy.Subscriber(self.args.img_right_topic, Image, self.img_right_callback, queue_size=1000, tcp_nodelay=True)
561
+ rospy.Subscriber(self.args.img_front_topic, Image, self.img_front_callback, queue_size=1000, tcp_nodelay=True)
562
+ if self.args.use_depth_image:
563
+ rospy.Subscriber(self.args.img_left_depth_topic, Image, self.img_left_depth_callback, queue_size=1000, tcp_nodelay=True)
564
+ rospy.Subscriber(self.args.img_right_depth_topic, Image, self.img_right_depth_callback, queue_size=1000, tcp_nodelay=True)
565
+ rospy.Subscriber(self.args.img_front_depth_topic, Image, self.img_front_depth_callback, queue_size=1000, tcp_nodelay=True)
566
+ rospy.Subscriber(self.args.puppet_arm_left_topic, JointState, self.puppet_arm_left_callback, queue_size=1000, tcp_nodelay=True)
567
+ rospy.Subscriber(self.args.puppet_arm_right_topic, JointState, self.puppet_arm_right_callback, queue_size=1000, tcp_nodelay=True)
568
+ rospy.Subscriber(self.args.robot_base_topic, Odometry, self.robot_base_callback, queue_size=1000, tcp_nodelay=True)
569
+ self.puppet_arm_left_publisher = rospy.Publisher(self.args.puppet_arm_left_cmd_topic, JointState, queue_size=10)
570
+ self.puppet_arm_right_publisher = rospy.Publisher(self.args.puppet_arm_right_cmd_topic, JointState, queue_size=10)
571
+ self.robot_base_publisher = rospy.Publisher(self.args.robot_base_cmd_topic, Twist, queue_size=10)
572
+
573
+
574
+ def get_arguments():
575
+ parser = argparse.ArgumentParser()
576
+ parser.add_argument('--max_publish_step', action='store', type=int,
577
+ help='Maximum number of action publishing steps', default=10000, required=False)
578
+ parser.add_argument('--seed', action='store', type=int,
579
+ help='Random seed', default=None, required=False)
580
+
581
+ parser.add_argument('--img_front_topic', action='store', type=str, help='img_front_topic',
582
+ default='/camera_f/color/image_raw', required=False)
583
+ parser.add_argument('--img_left_topic', action='store', type=str, help='img_left_topic',
584
+ default='/camera_l/color/image_raw', required=False)
585
+ parser.add_argument('--img_right_topic', action='store', type=str, help='img_right_topic',
586
+ default='/camera_r/color/image_raw', required=False)
587
+
588
+ parser.add_argument('--img_front_depth_topic', action='store', type=str, help='img_front_depth_topic',
589
+ default='/camera_f/depth/image_raw', required=False)
590
+ parser.add_argument('--img_left_depth_topic', action='store', type=str, help='img_left_depth_topic',
591
+ default='/camera_l/depth/image_raw', required=False)
592
+ parser.add_argument('--img_right_depth_topic', action='store', type=str, help='img_right_depth_topic',
593
+ default='/camera_r/depth/image_raw', required=False)
594
+
595
+ parser.add_argument('--puppet_arm_left_cmd_topic', action='store', type=str, help='puppet_arm_left_cmd_topic',
596
+ default='/master/joint_left', required=False)
597
+ parser.add_argument('--puppet_arm_right_cmd_topic', action='store', type=str, help='puppet_arm_right_cmd_topic',
598
+ default='/master/joint_right', required=False)
599
+ parser.add_argument('--puppet_arm_left_topic', action='store', type=str, help='puppet_arm_left_topic',
600
+ default='/puppet/joint_left', required=False)
601
+ parser.add_argument('--puppet_arm_right_topic', action='store', type=str, help='puppet_arm_right_topic',
602
+ default='/puppet/joint_right', required=False)
603
+
604
+ parser.add_argument('--robot_base_topic', action='store', type=str, help='robot_base_topic',
605
+ default='/odom_raw', required=False)
606
+ parser.add_argument('--robot_base_cmd_topic', action='store', type=str, help='robot_base_cmd_topic',
607
+ default='/cmd_vel', required=False)
608
+ parser.add_argument('--use_robot_base', action='store_true',
609
+ help='Whether to use the robot base to move around',
610
+ default=False, required=False)
611
+ parser.add_argument('--publish_rate', action='store', type=int,
612
+ help='The rate at which to publish the actions',
613
+ default=30, required=False)
614
+ parser.add_argument('--ctrl_freq', action='store', type=int,
615
+ help='The control frequency of the robot',
616
+ default=25, required=False)
617
+
618
+ parser.add_argument('--chunk_size', action='store', type=int,
619
+ help='Action chunk size',
620
+ default=64, required=False)
621
+ parser.add_argument('--arm_steps_length', action='store', type=float,
622
+ help='The maximum change allowed for each joint per timestep',
623
+ default=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.2], required=False)
624
+
625
+ parser.add_argument('--use_actions_interpolation', action='store_true',
626
+ help='Whether to interpolate the actions if the difference is too large',
627
+ default=False, required=False)
628
+ parser.add_argument('--use_depth_image', action='store_true',
629
+ help='Whether to use depth images',
630
+ default=False, required=False)
631
+
632
+ parser.add_argument('--disable_puppet_arm', action='store_true',
633
+ help='Whether to disable the puppet arm. This is useful for safe debugging', default=False)
634
+
635
+ parser.add_argument('--config_path', type=str, default="configs/base.yaml",
636
+ help='Path to the config file')
637
+ # parser.add_argument('--cfg_scale', type=float, default=2.0,
638
+ # help='the scaling factor used to modify the magnitude of the control features during denoising')
639
+ parser.add_argument('--pretrained_model_name_or_path', type=str, required=True, help='Name or path to the pretrained model')
640
+
641
+ parser.add_argument('--lang_embeddings_path', type=str, required=True,
642
+ help='Path to the pre-encoded language instruction embeddings')
643
+
644
+ args = parser.parse_args()
645
+ return args
646
+
647
+
648
+ def main():
649
+ args = get_arguments()
650
+ ros_operator = RosOperator(args)
651
+ if args.seed is not None:
652
+ set_seed(args.seed)
653
+ config = get_config(args)
654
+ model_inference(args, config, ros_operator)
655
+
656
+
657
+ if __name__ == '__main__':
658
+ main()
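
The get_frame routine above synchronizes several ROS message deques: it takes the oldest of the newest timestamps across the required streams as the frame time, then pops stale messages from the head of each deque before reading one aligned sample per stream. A minimal sketch of that alignment idea (not part of the commit), using plain float timestamps instead of ROS message headers:

from collections import deque

def align_streams(streams):
    """streams: dict name -> deque of (timestamp, payload), newest at the right."""
    if any(len(d) == 0 for d in streams.values()):
        return None
    # Oldest of the newest timestamps: every stream has data up to this time
    frame_time = min(d[-1][0] for d in streams.values())
    aligned = {}
    for name, d in streams.items():
        # Drop messages older than the frame time, then take the aligned one
        while d[0][0] < frame_time:
            d.popleft()
        aligned[name] = d.popleft()[1]
    return aligned

streams = {
    "img_front": deque([(0.98, "f0"), (1.00, "f1")]),
    "img_left": deque([(0.99, "l0"), (1.01, "l1")]),
    "puppet_arm_left": deque([(1.00, "q0")]),
}
print(align_streams(streams))  # {'img_front': 'f1', 'img_left': 'l1', 'puppet_arm_left': 'q0'}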
scripts/agilex_model.py ADDED
@@ -0,0 +1,313 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ import torch
5
+ from PIL import Image
6
+ from torchvision import transforms
7
+
8
+ from configs.state_vec import STATE_VEC_IDX_MAPPING
9
+ from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
10
+ from models.multimodal_encoder.t5_encoder import T5Embedder
11
+ from models.rdt_runner import RDTRunner
12
+
13
+
14
+ # The indices that the raw vector should be mapped to in the unified action vector
15
+ AGILEX_STATE_INDICES = [
16
+ STATE_VEC_IDX_MAPPING[f"left_arm_joint_{i}_pos"] for i in range(6)
17
+ ] + [
18
+ STATE_VEC_IDX_MAPPING["left_gripper_open"]
19
+ ] + [
20
+ STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)
21
+ ] + [
22
+ STATE_VEC_IDX_MAPPING[f"right_gripper_open"]
23
+ ]
24
+ TABLETOP_6D_INDICES_NAMES = [
25
+ 'left_eef_pos_x','left_eef_pos_y','left_eef_pos_z','left_eef_angle_0','left_eef_angle_1','left_eef_angle_2','left_eef_angle_3','left_eef_angle_4','left_eef_angle_5','left_gripper_open','right_eef_pos_x','right_eef_pos_y','right_eef_pos_z','right_eef_angle_0','right_eef_angle_1','right_eef_angle_2','right_eef_angle_3','right_eef_angle_4','right_eef_angle_5','right_gripper_open']
26
+ TABLETOP_6D_INDICES = [STATE_VEC_IDX_MAPPING[n] for n in TABLETOP_6D_INDICES_NAMES]
27
+
28
+ # Create the RDT model
29
+ def create_model(args, **kwargs):
30
+ model = RoboticDiffusionTransformerModel(args, **kwargs)
31
+ pretrained = kwargs.get("pretrained", None)
32
+ if (
33
+ pretrained is not None
34
+ and os.path.isfile(pretrained)
35
+ ):
36
+ model.load_pretrained_weights(pretrained)
37
+ return model
38
+
39
+
40
+ class RoboticDiffusionTransformerModel(object):
41
+ """A wrapper for the RDT model, which handles
42
+ 1. Model initialization
43
+ 2. Encodings of instructions
44
+ 3. Model inference
45
+ """
46
+ def __init__(
47
+ self, args,
48
+ device='cuda',
49
+ dtype=torch.bfloat16,
50
+ image_size=None,
51
+ control_frequency=25,
52
+ pretrained=None,
53
+ pretrained_vision_encoder_name_or_path=None,
54
+ pretrained_text_encoder_name_or_path=None
55
+ ):
56
+ self.args = args
57
+ self.dtype = dtype
58
+ self.image_size = image_size
59
+ self.device = device
60
+ self.control_frequency = control_frequency
61
+ # Load the text encoder so that instructions can be encoded on the fly at inference time
62
+ self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
63
+ self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
64
+ self.policy = self.get_policy(pretrained)
65
+
66
+ self.reset()
67
+
68
+ def get_policy(self, pretrained):
69
+ """Initialize the model."""
70
+ # Initialize model with arguments
71
+ if (
72
+ pretrained is None
73
+ or os.path.isfile(pretrained)
74
+ ):
75
+ img_cond_len = (self.args["common"]["img_history_size"]
76
+ * self.args["common"]["num_cameras"]
77
+ * self.vision_model.num_patches)
78
+
79
+ _model = RDTRunner(
80
+ action_dim=self.args["common"]["state_dim"],
81
+ pred_horizon=self.args["common"]["action_chunk_size"],
82
+ config=self.args["model"],
83
+ lang_token_dim=self.args["model"]["lang_token_dim"],
84
+ img_token_dim=self.args["model"]["img_token_dim"],
85
+ state_token_dim=self.args["model"]["state_token_dim"],
86
+ max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
87
+ img_cond_len=img_cond_len,
88
+ img_pos_embed_config=[
89
+ # No initial pos embed in the last grid size
90
+ # since we've already done in ViT
91
+ ("image", (self.args["common"]["img_history_size"],
92
+ self.args["common"]["num_cameras"],
93
+ -self.vision_model.num_patches)),
94
+ ],
95
+ lang_pos_embed_config=[
96
+ # Similarly, no initial pos embed for language
97
+ ("lang", -self.args["dataset"]["tokenizer_max_length"]),
98
+ ],
99
+ dtype=self.dtype,
100
+ )
101
+ else:
102
+ _model = RDTRunner.from_pretrained(pretrained)
103
+
104
+ return _model
105
+
106
+ def get_text_encoder(self, pretrained_text_encoder_name_or_path):
107
+ text_embedder = T5Embedder(from_pretrained=pretrained_text_encoder_name_or_path,
108
+ model_max_length=self.args["dataset"]["tokenizer_max_length"],
109
+ device=self.device)
110
+ tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
111
+ return tokenizer, text_encoder
112
+
113
+ def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
114
+ vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
115
+ image_processor = vision_encoder.image_processor
116
+ return image_processor, vision_encoder
117
+
118
+ def reset(self):
119
+ """Set model to evaluation mode.
120
+ """
121
+ device = self.device
122
+ weight_dtype = self.dtype
123
+ self.policy.eval()
124
+ # self.text_model.eval()
125
+ self.vision_model.eval()
126
+
127
+ self.policy = self.policy.to(device, dtype=weight_dtype)
128
+ # self.text_model = self.text_model.to(device, dtype=weight_dtype)
129
+ self.vision_model = self.vision_model.to(device, dtype=weight_dtype)
130
+
131
+ def load_pretrained_weights(self, pretrained=None):
132
+ if pretrained is None:
133
+ return
134
+ print(f'Loading weights from {pretrained}')
135
+ filename = os.path.basename(pretrained)
136
+ if filename.endswith('.pt'):
137
+ checkpoint = torch.load(pretrained)
138
+ self.policy.load_state_dict(checkpoint["module"])
139
+ elif filename.endswith('.safetensors'):
140
+ from safetensors.torch import load_model
141
+ load_model(self.policy, pretrained)
142
+ else:
143
+ raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")
144
+
145
+ def encode_instruction(self, instruction, device="cuda"):
146
+ """Encode string instruction to latent embeddings.
147
+
148
+ Args:
149
+ instruction: a string of instruction
150
+ device: a string of device
151
+
152
+ Returns:
153
+ pred: a tensor of latent embeddings of shape (text_max_length, 512)
154
+ """
155
+ tokens = self.text_tokenizer(
156
+ instruction, return_tensors="pt",
157
+ padding="longest",
158
+ truncation=True
159
+ )["input_ids"].to(device)
160
+
161
+ tokens = tokens.view(1, -1)
162
+ with torch.no_grad():
163
+ pred = self.text_model(tokens).last_hidden_state.detach()
164
+
165
+ return pred
166
+
167
+ def _format_joint_to_state(self, joints):
168
+ """
169
+ Format the joint proprioception into the unified action vector.
170
+
171
+ Args:
172
+ joints (torch.Tensor): The 6D EEF proprioception to be formatted.
173
+ qpos ([B, N, 20]).
174
+
175
+ Returns:
176
+ state (torch.Tensor): The formatted vector for RDT ([B, N, 128]).
177
+ """
178
+ # Rescale the gripper to the range of [0, 1] (the all-ones divisor below is currently an identity)
179
+ joints = joints / torch.tensor(
180
+ [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
181
+ device=joints.device, dtype=joints.dtype
182
+ )
183
+
184
+ B, N, _ = joints.shape
185
+ state = torch.zeros(
186
+ (B, N, self.args["model"]["state_token_dim"]),
187
+ device=joints.device, dtype=joints.dtype
188
+ )
189
+ # Fill into the unified state vector
190
+ state[:, :, TABLETOP_6D_INDICES] = joints
191
+ # Assemble the mask indicating each dimension's availability
192
+ state_elem_mask = torch.zeros(
193
+ (B, self.args["model"]["state_token_dim"]),
194
+ device=joints.device, dtype=joints.dtype
195
+ )
196
+ state_elem_mask[:,TABLETOP_6D_INDICES] = 1
197
+ return state, state_elem_mask
198
+
199
+ def _unformat_action_to_joint(self, action):
200
+ """
201
+ Unformat the unified action vector into the joint action to be executed.
202
+
203
+ Args:
204
+ action (torch.Tensor): The unified action vector to be unformatted.
205
+ ([B, N, 128])
206
+
207
+ Returns:
208
+ joints (torch.Tensor): The unformatted robot joint action.
209
+ qpos ([B, N, 20]).
210
+ """
211
+ action_indices = TABLETOP_6D_INDICES
212
+ joints = action[:, :, action_indices]
213
+
214
+ # Rescale the gripper back to the action range
215
+ # Note that the action range and proprioception range are different
216
+ # for Mobile ALOHA robot
217
+ joints = joints * torch.tensor(
218
+ [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],
219
+ device=joints.device, dtype=joints.dtype
220
+ )
221
+
222
+ return joints
223
+
224
+ @torch.no_grad()
225
+ def step(self, proprio, images, instruction):
226
+ """
227
+ Predict the next action chunk given the
228
+ proprioceptive states, images, and instruction embeddings.
229
+
230
+ Args:
231
+ proprio: proprioceptive states
232
+ images: RGB images, the order should be
233
+ [ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1},
234
+ ext_{t}, right_wrist_{t}, left_wrist_{t}]
235
+ instruction: a string of instruction (encoded on the fly with the text encoder)
236
+
237
+ Returns:
238
+ action: predicted action
239
+ """
240
+ device = self.device
241
+ dtype = self.dtype
242
+
243
+ # The background image used for padding
244
+ background_color = np.array([
245
+ int(x*255) for x in self.image_processor.image_mean
246
+ ], dtype=np.uint8).reshape(1, 1, 3)
247
+ background_image = np.ones((
248
+ self.image_processor.size["height"],
249
+ self.image_processor.size["width"], 3), dtype=np.uint8
250
+ ) * background_color
251
+
252
+ # Preprocess the images by order and encode them
253
+ image_tensor_list = []
254
+ for image in images:
255
+ if image is None:
256
+ # Replace it with the background image
257
+ image = Image.fromarray(background_image)
258
+
259
+ if self.image_size is not None:
260
+ image = transforms.Resize(self.image_size)(image)
261
+
262
+ if self.args["dataset"].get("auto_adjust_image_brightness", False):
263
+ pixel_values = list(image.getdata())
264
+ average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
265
+ if average_brightness <= 0.15:
266
+ image = transforms.ColorJitter(brightness=(1.75,1.75))(image)
267
+
268
+ if self.args["dataset"].get("image_aspect_ratio", "pad") == 'pad':
269
+ def expand2square(pil_img, background_color):
270
+ width, height = pil_img.size
271
+ if width == height:
272
+ return pil_img
273
+ elif width > height:
274
+ result = Image.new(pil_img.mode, (width, width), background_color)
275
+ result.paste(pil_img, (0, (width - height) // 2))
276
+ return result
277
+ else:
278
+ result = Image.new(pil_img.mode, (height, height), background_color)
279
+ result.paste(pil_img, ((height - width) // 2, 0))
280
+ return result
281
+ image = expand2square(image, tuple(int(x*255) for x in self.image_processor.image_mean))
282
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
283
+ image_tensor_list.append(image)
284
+
285
+ image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
286
+
287
+ image_embeds = self.vision_model(image_tensor).detach()
288
+ image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
289
+
290
+ # Prepare the proprioception states and the control frequency
291
+ joints = proprio.to(device).unsqueeze(0) # (1, 1, 20)
292
+ states, state_elem_mask = self._format_joint_to_state(joints) # (1, 1, 128), (1, 128)
293
+ states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
294
+ states = states[:, -1:, :] # (1, 1, 128)
295
+ ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
296
+
297
+ # text_embeds = text_embeds.to(device, dtype=dtype)
298
+ text_embeds = self.encode_instruction(instruction=instruction)
299
+
300
+ # Predict the next action chunk given the inputs
301
+ trajectory = self.policy.predict_action(
302
+ lang_tokens=text_embeds,
303
+ lang_attn_mask=torch.ones(
304
+ text_embeds.shape[:2], dtype=torch.bool,
305
+ device=text_embeds.device),
306
+ img_tokens=image_embeds,
307
+ state_tokens=states,
308
+ action_mask=state_elem_mask.unsqueeze(1),
309
+ ctrl_freqs=ctrl_freqs
310
+ )
311
+ trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
312
+
313
+ return trajectory
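
A hedged usage sketch (not part of the commit) for the wrapper above: load configs/base.yaml, build the model with create_model, and query an action chunk with step. The checkpoint path is a placeholder, the SigLIP identifier is an assumption, the T5 identifier follows the one used in scripts/encode_lang_batch.py below, and imports assume the repository root is on PYTHONPATH.

import yaml
import torch
from scripts.agilex_model import create_model

with open("configs/base.yaml", "r") as fp:
    config = yaml.safe_load(fp)

policy = create_model(
    config,
    pretrained="checkpoints/rdt/pytorch_model.pt",  # placeholder checkpoint path
    pretrained_vision_encoder_name_or_path="google/siglip-so400m-patch14-384",  # assumed SigLIP weights
    pretrained_text_encoder_name_or_path="google/t5-v1_1-xxl",
    control_frequency=25,
)

# Image order per the step() docstring:
# [ext_{t-1}, right_wrist_{t-1}, left_wrist_{t-1}, ext_t, right_wrist_t, left_wrist_t]
images = [None] * 6              # None entries are replaced by the background image
proprio = torch.zeros(1, 20)     # TABLETOP_6D layout: eef position + 6D rotation + gripper, both arms
actions = policy.step(proprio, images, "pick up the red block")
print(actions.shape)             # expected (1, 64, 20) with the default action_chunk_size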
scripts/encode_lang_batch.py ADDED
@@ -0,0 +1,76 @@
1
+ import os
2
+ import json
3
+
4
+ import torch
5
+ import yaml
6
+ from tqdm import tqdm
7
+
8
+ from models.multimodal_encoder.t5_encoder import T5Embedder
9
+
10
+
11
+ GPU = 0
12
+ MODEL_PATH = "google/t5-v1_1-xxl"
13
+ CONFIG_PATH = "configs/base.yaml"
14
+ # Modify the TARGET_DIR to your dataset path
15
+ TARGET_DIR = "data/datasets/openx_embod/singlevla_benchmark_ee"
16
+
17
+ # Note: if your GPU VRAM is less than 24GB,
18
+ # it is recommended to enable offloading by specifying an offload directory.
19
+ OFFLOAD_DIR = None # Specify your offload directory here, ensuring the directory exists.
20
+
21
+ def main():
22
+ with open(CONFIG_PATH, "r") as fp:
23
+ config = yaml.safe_load(fp)
24
+
25
+ device = torch.device(f"cuda:{GPU}")
26
+ text_embedder = T5Embedder(
27
+ from_pretrained=MODEL_PATH,
28
+ model_max_length=config["dataset"]["tokenizer_max_length"],
29
+ device=device,
30
+ use_offload_folder=OFFLOAD_DIR
31
+ )
32
+ tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
33
+
34
+ # Get all the task paths
35
+ task_paths = []
36
+ for sub_dir in os.listdir(TARGET_DIR):
37
+ middle_dir = os.path.join(TARGET_DIR, sub_dir)
38
+ if os.path.isdir(middle_dir):
39
+ for task_dir in os.listdir(middle_dir):
40
+ task_path = os.path.join(middle_dir, task_dir)
41
+ if os.path.isdir(task_path):
42
+ task_paths.append(task_path)
43
+
44
+ # For each task, encode the instructions
45
+ for task_path in tqdm(task_paths):
46
+ # Load the instructions corresponding to the task from the directory
47
+ with open(os.path.join(task_path, 'expanded_instruction_gpt-4-turbo.json'), 'r') as f_instr:
48
+ instruction_dict = json.load(f_instr)
49
+ instructions = [instruction_dict['instruction']] + instruction_dict['simplified_instruction'] + \
50
+ instruction_dict['expanded_instruction']
51
+
52
+ # Encode the instructions
53
+ tokenized_res = tokenizer(
54
+ instructions, return_tensors="pt",
55
+ padding="longest",
56
+ truncation=True
57
+ )
58
+ tokens = tokenized_res["input_ids"].to(device)
59
+ attn_mask = tokenized_res["attention_mask"].to(device)
60
+
61
+ with torch.no_grad():
62
+ text_embeds = text_encoder(
63
+ input_ids=tokens,
64
+ attention_mask=attn_mask
65
+ )["last_hidden_state"].detach().cpu()
66
+
67
+ attn_mask = attn_mask.cpu().bool()
68
+
69
+ # Save the embeddings for training use
70
+ for i in range(len(instructions)):
71
+ text_embed = text_embeds[i][attn_mask[i]]
72
+ save_path = os.path.join(task_path, f"lang_embed_{i}.pt")
73
+ torch.save(text_embed, save_path)
74
+
75
+ if __name__ == "__main__":
76
+ main()
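
The script above writes one lang_embed_{i}.pt tensor per instruction into each task directory. A small sketch (not part of the commit) of how such a precomputed embedding can be loaded and batched for a policy that consumes text embeddings directly, e.g. the ManiSkill wrapper added below; the task path is a placeholder.

import torch

# Placeholder path: any task directory processed by the script above
embed_path = "data/datasets/openx_embod/singlevla_benchmark_ee/sub_dir/task_dir/lang_embed_0.pt"
text_embed = torch.load(embed_path)    # (num_tokens, hidden_dim), padding tokens already stripped
text_embeds = text_embed.unsqueeze(0)  # add a batch dimension -> (1, num_tokens, hidden_dim)
# text_embeds can now be passed as the text_embeds argument of a step() call
# that expects precomputed instruction embeddings.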
scripts/maniskill_model.py ADDED
@@ -0,0 +1,277 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ import torch
5
+ from PIL import Image
6
+ from torchvision import transforms
7
+
8
+ from configs.state_vec import STATE_VEC_IDX_MAPPING
9
+ from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
10
+ from models.multimodal_encoder.t5_encoder import T5Embedder
11
+ from models.rdt_runner import RDTRunner
12
+
13
+
14
+ MANISKILL_INDICES = [
15
+ STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(7)
16
+ ] + [
17
+ STATE_VEC_IDX_MAPPING[f"right_gripper_open"]
18
+ ]
19
+
20
+
21
+ def create_model(args, pretrained, **kwargs):
22
+ model = RoboticDiffusionTransformerModel(args, **kwargs)
23
+ if pretrained is not None:
24
+ model.load_pretrained_weights(pretrained)
25
+ return model
26
+
27
+
28
+ DATA_STAT = {'state_min': [-0.7463043928146362, -0.0801204964518547, -0.4976441562175751, -2.657780647277832, -0.5742632150650024, 1.8309762477874756, -2.2423808574676514, 0.0], 'state_max': [0.7645499110221863, 1.4967026710510254, 0.4650936424732208, -0.3866899907588959, 0.5505855679512024, 3.2900545597076416, 2.5737812519073486, 0.03999999910593033], 'action_min': [-0.7472005486488342, -0.08631071448326111, -0.4995281398296356, -2.658363103866577, -0.5751323103904724, 1.8290787935256958, -2.245187997817993, -1.0], 'action_max': [0.7654682397842407, 1.4984270334243774, 0.46786263585090637, -0.38181185722351074, 0.5517147779464722, 3.291581630706787, 2.575840711593628, 1.0]}
29
+
30
+ class RoboticDiffusionTransformerModel(object):
31
+ """A wrapper for the RDT model, which handles
32
+ 1. Model initialization
33
+ 2. Encodings of instructions
34
+ 3. Model inference
35
+ """
36
+ def __init__(
37
+ self, args,
38
+ device='cuda',
39
+ dtype=torch.bfloat16,
40
+ image_size=None,
41
+ control_frequency=25,
42
+ pretrained_text_encoder_name_or_path=None,
43
+ pretrained_vision_encoder_name_or_path=None,
44
+ ):
45
+ self.args = args
46
+ self.dtype = dtype
47
+ self.image_size = image_size
48
+ self.device = device
49
+ self.control_frequency = control_frequency
50
+ self.text_tokenizer, self.text_model = self.get_text_encoder(pretrained_text_encoder_name_or_path)
51
+ self.image_processor, self.vision_model = self.get_vision_encoder(pretrained_vision_encoder_name_or_path)
52
+ self.policy = self.get_policy()
53
+
54
+ self.state_min = torch.tensor(DATA_STAT['state_min']).to(device)
55
+ self.state_max = torch.tensor(DATA_STAT['state_max']).to(device)
56
+ self.action_min = torch.tensor(DATA_STAT['action_min']).to(device)
57
+ self.action_max = torch.tensor(DATA_STAT['action_max']).to(device)
58
+
59
+ self.reset()
60
+
61
+ def get_policy(self):
62
+ """Initialize the model."""
63
+ # Initialize model with arguments
64
+ img_cond_len = (self.args["common"]["img_history_size"]
65
+ * self.args["common"]["num_cameras"]
66
+ * self.vision_model.num_patches)
67
+
68
+ _model = RDTRunner(
69
+ action_dim=self.args["common"]["state_dim"],
70
+ pred_horizon=self.args["common"]["action_chunk_size"],
71
+ config=self.args["model"],
72
+ lang_token_dim=self.args["model"]["lang_token_dim"],
73
+ img_token_dim=self.args["model"]["img_token_dim"],
74
+ state_token_dim=self.args["model"]["state_token_dim"],
75
+ max_lang_cond_len=self.args["dataset"]["tokenizer_max_length"],
76
+ img_cond_len=img_cond_len,
77
+ img_pos_embed_config=[
78
+ # No initial pos embed in the last grid size
79
+ # since we've already done in ViT
80
+ ("image", (self.args["common"]["img_history_size"],
81
+ self.args["common"]["num_cameras"],
82
+ -self.vision_model.num_patches)),
83
+ ],
84
+ lang_pos_embed_config=[
85
+ # Similarly, no initial pos embed for language
86
+ ("lang", -self.args["dataset"]["tokenizer_max_length"]),
87
+ ],
88
+ dtype=self.dtype,
89
+ )
90
+
91
+ return _model
92
+
93
+ def get_text_encoder(self, pretrained_text_encoder_name_or_path):
94
+ text_embedder = T5Embedder(from_pretrained=pretrained_text_encoder_name_or_path,
95
+ model_max_length=self.args["dataset"]["tokenizer_max_length"],
96
+ device=self.device)
97
+ tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model
98
+ return tokenizer, text_encoder
99
+
100
+ def get_vision_encoder(self, pretrained_vision_encoder_name_or_path):
101
+ vision_encoder = SiglipVisionTower(vision_tower=pretrained_vision_encoder_name_or_path, args=None)
102
+ image_processor = vision_encoder.image_processor
103
+ return image_processor, vision_encoder
104
+
105
+ def reset(self):
106
+ """Set model to evaluation mode.
107
+ """
108
+ device = self.device
109
+ weight_dtype = self.dtype
110
+ self.policy.eval()
111
+ self.text_model.eval()
112
+ self.vision_model.eval()
113
+
114
+ self.policy = self.policy.to(device, dtype=weight_dtype)
115
+ self.text_model = self.text_model.to(device, dtype=weight_dtype)
116
+ self.vision_model = self.vision_model.to(device, dtype=weight_dtype)
117
+
118
+ def load_pretrained_weights(self, pretrained=None):
119
+ if pretrained is None:
120
+ return
121
+ print(f'Loading weights from {pretrained}')
122
+ filename = os.path.basename(pretrained)
123
+ if filename.endswith('.pt'):
124
+ checkpoint = torch.load(pretrained)
125
+ self.policy.load_state_dict(checkpoint["module"])
126
+ elif filename.endswith('.safetensors'):
127
+ from safetensors.torch import load_model
128
+ load_model(self.policy, pretrained)
129
+ else:
130
+ raise NotImplementedError(f"Unknown checkpoint format: {pretrained}")
131
+
132
+ def encode_instruction(self, instruction, device="cuda"):
133
+ """Encode string instruction to latent embeddings.
134
+
135
+ Args:
136
+ instruction: a string of instruction
137
+ device: a string of device
138
+
139
+ Returns:
140
+ pred: a tensor of latent embeddings of shape (text_max_length, 512)
141
+ """
142
+ tokens = self.text_tokenizer(
143
+ instruction, return_tensors="pt",
144
+ padding="longest",
145
+ truncation=True
146
+ )["input_ids"].to(device)
147
+
148
+ tokens = tokens.view(1, -1)
149
+ with torch.no_grad():
150
+ pred = self.text_model(tokens).last_hidden_state.detach()
151
+
152
+ return pred
153
+
154
+ def _format_joint_to_state(self, joints):
155
+ """
156
+ Format the robot joint state into the unified state vector.
157
+
158
+ Args:
159
+ joints (torch.Tensor): The joint state to be formatted.
160
+ qpos ([B, N, 14]).
161
+
162
+ Returns:
163
+ state (torch.Tensor): The formatted state for RDT ([B, N, 128]).
164
+ """
165
+ # Rescale the gripper
166
+ # joints = joints / torch.tensor(
167
+ # [[[1, 1, 1, 1, 1, 1, 4.7908, 1, 1, 1, 1, 1, 1, 4.7888]]],
168
+ # device=joints.device, dtype=joints.dtype
169
+ # )
170
+
171
+ # normalize to -1,1
172
+ joints = (joints - self.state_min) / (self.state_max - self.state_min) * 2 - 1
173
+ B, N, _ = joints.shape
174
+ state = torch.zeros(
175
+ (B, N, self.args["model"]["state_token_dim"]),
176
+ device=joints.device, dtype=joints.dtype
177
+ )
178
+ # assemble the unifed state vector
179
+ state[:, :, MANISKILL_INDICES] = joints
180
+ state_elem_mask = torch.zeros(
181
+ (B, self.args["model"]["state_token_dim"]),
182
+ device=joints.device, dtype=joints.dtype
183
+ )
184
+ state_elem_mask[:, MANISKILL_INDICES] = 1
185
+ return state, state_elem_mask
186
+
187
+ def _unformat_action_to_joint(self, action):
188
+ action_indices = MANISKILL_INDICES
189
+ joints = action[:, :, action_indices]
190
+
191
+ # denormalize to action space
192
+
193
+ joints = (joints + 1) / 2 * (self.action_max - self.action_min) + self.action_min
194
+
195
+ return joints
196
+
197
+ @torch.no_grad()
198
+ def step(self, proprio, images, text_embeds):
199
+ """
200
+ Args:
201
+ proprio: proprioceptive states
202
+ images: RGB images
203
+ text_embeds: instruction embeddings
204
+
205
+ Returns:
206
+ action: predicted action
207
+ """
208
+ device = self.device
209
+ dtype = self.dtype
210
+
211
+ background_color = np.array([
212
+ int(x*255) for x in self.image_processor.image_mean
213
+ ], dtype=np.uint8).reshape(1, 1, 3)
214
+ background_image = np.ones((
215
+ self.image_processor.size["height"],
216
+ self.image_processor.size["width"], 3), dtype=np.uint8
217
+ ) * background_color
218
+
219
+ image_tensor_list = []
220
+ for image in images:
221
+ if image is None:
222
+ # Replace it with the background image
223
+ image = Image.fromarray(background_image)
224
+
225
+ if self.image_size is not None:
226
+ image = transforms.Resize(self.data_args.image_size)(image)
227
+
228
+ if self.args["dataset"].get("auto_adjust_image_brightness", False):
229
+ pixel_values = list(image.getdata())
230
+ average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
231
+ if average_brightness <= 0.15:
232
+ image = transforms.ColorJitter(brightness=(1.75,1.75))(image)
233
+
234
+ if self.args["dataset"].get("image_aspect_ratio", "pad") == 'pad':
235
+ def expand2square(pil_img, background_color):
236
+ width, height = pil_img.size
237
+ if width == height:
238
+ return pil_img
239
+ elif width > height:
240
+ result = Image.new(pil_img.mode, (width, width), background_color)
241
+ result.paste(pil_img, (0, (width - height) // 2))
242
+ return result
243
+ else:
244
+ result = Image.new(pil_img.mode, (height, height), background_color)
245
+ result.paste(pil_img, ((height - width) // 2, 0))
246
+ return result
247
+ image = expand2square(image, tuple(int(x*255) for x in self.image_processor.image_mean))
248
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
249
+ image_tensor_list.append(image)
250
+
251
+ image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype)
252
+
253
+ image_embeds = self.vision_model(image_tensor).detach()
254
+ image_embeds = image_embeds.reshape(-1, self.vision_model.hidden_size).unsqueeze(0)
255
+
256
+ # history of actions
257
+ joints = proprio.to(device).unsqueeze(0) # (1, 1, 14)
258
+ states, state_elem_mask = self._format_joint_to_state(joints) # (1, 1, 128), (1, 128)
259
+ states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to(device, dtype=dtype)
260
+ states = states[:, -1:, :] # (1, 1, 128)
261
+ ctrl_freqs = torch.tensor([self.control_frequency]).to(device)
262
+
263
+ text_embeds = text_embeds.to(device, dtype=dtype)
264
+
265
+ trajectory = self.policy.predict_action(
266
+ lang_tokens=text_embeds,
267
+ lang_attn_mask=torch.ones(
268
+ text_embeds.shape[:2], dtype=torch.bool,
269
+ device=text_embeds.device),
270
+ img_tokens=image_embeds,
271
+ state_tokens=states,
272
+ action_mask=state_elem_mask.unsqueeze(1),
273
+ ctrl_freqs=ctrl_freqs
274
+ )
275
+ trajectory = self._unformat_action_to_joint(trajectory).to(torch.float32)
276
+
277
+ return trajectory
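
The ManiSkill wrapper min-max normalizes the proprioception into [-1, 1] with the DATA_STAT bounds and maps predicted actions back through the inverse transform. A minimal, self-contained sketch (not part of the commit) of that round trip with toy bounds:

import torch

state_min = torch.tensor([-1.0, 0.0])   # toy bounds, stand-ins for the DATA_STAT entries
state_max = torch.tensor([1.0, 0.04])

def normalize(x):     # raw -> [-1, 1]
    return (x - state_min) / (state_max - state_min) * 2 - 1

def denormalize(x):   # [-1, 1] -> raw
    return (x + 1) / 2 * (state_max - state_min) + state_min

raw = torch.tensor([0.5, 0.02])
assert torch.allclose(denormalize(normalize(raw)), raw)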
train/dataset.py ADDED
@@ -0,0 +1,467 @@
1
+ import traceback
2
+ import time
3
+ import os
4
+ import json
5
+ import math
6
+ import random
7
+ from typing import Dict, Sequence
8
+
9
+ import numpy as np
10
+ import torch
11
+ from torch.utils.data import Dataset
12
+ from torchvision import transforms
13
+ from PIL import Image
14
+ import transformers
15
+
16
+ from data.filelock import FileLock
17
+ from data.hdf5_vla_dataset import TabletopHDF5VLADataset, AnubisHDF5VLADataset
18
+ from train.image_corrupt import image_corrupt
19
+
20
+
21
+ def get_clean_item(chunk_dir):
22
+ """
23
+ Get indexes of clean items in a chunk.
24
+ """
25
+ dirty_bit = read_dirty_bit(chunk_dir)
26
+ return np.where(1 - dirty_bit)[0].tolist()
27
+
28
+
29
+ def save_dirty_bit(chunk_dir, dirty_bit):
30
+ """
31
+ Save the dirty bit to the chunk directory.
32
+ """
33
+ time_stmp = time.time()
34
+ while time.time() - time_stmp < 10.0:
35
+ try:
36
+ file_path = os.path.join(chunk_dir, "dirty_bit")
37
+ lock = FileLock(file_path)
38
+ lock.acquire_write_lock()
39
+ with open(file_path, 'wb') as file:
40
+ file.write(dirty_bit.tobytes())
41
+ lock.release_lock()
42
+ return
43
+ except KeyboardInterrupt:
44
+ lock.release_lock()
45
+ raise KeyboardInterrupt
46
+ except BaseException:
47
+ lock.release_lock()
48
+ continue
49
+ raise RuntimeError("Failed to save dirty bit.")
50
+
51
+
52
+ def read_dirty_bit(chunk_dir):
53
+ """
54
+ Read the dirty bit from the chunk directory.
55
+ """
56
+ # If error occurs, retry
57
+ time_stmp = time.time()
58
+ while time.time() - time_stmp < 10.0:
59
+ try:
60
+ file_path = os.path.join(chunk_dir, "dirty_bit")
61
+ lock = FileLock(file_path)
62
+ lock.acquire_read_lock()
63
+ with open(file_path, 'rb') as file:
64
+ dirty_bit = np.frombuffer(file.read(), dtype=np.uint8).copy()
65
+ lock.release_lock()
66
+ assert len(dirty_bit) > 0
67
+ return dirty_bit
68
+ except KeyboardInterrupt:
69
+ lock.release_lock()
70
+ raise KeyboardInterrupt
71
+ except BaseException:
72
+ lock.release_lock()
73
+ continue
74
+ raise RuntimeError("Failed to read dirty bit.")
75
+
76
+
77
+ class VLAConsumerDataset(Dataset):
78
+ """A vision-languange-action Dataset for supervised training.
79
+ This dataset will load data from the buffer directory.
80
+ """
81
+
82
+ def __init__(
83
+ self,
84
+ config,
85
+ tokenizer,
86
+ image_processor,
87
+ num_cameras,
88
+ img_history_size,
89
+ image_size=None,
90
+ auto_adjust_image_brightness=False,
91
+ image_aug=False,
92
+ dataset_type='pretrain',
93
+ cond_mask_prob=0.1,
94
+ cam_ext_mask_prob=-1.0,
95
+ state_noise_snr=None,
96
+ use_hdf5=False,
97
+ use_precomp_lang_embed=False,
98
+ task_name=None
99
+ ):
100
+ super(VLAConsumerDataset, self).__init__()
101
+
102
+ # Load the control frequency for each dataset
103
+ with open("configs/dataset_control_freq.json", 'r') as fp:
104
+ self.control_freq = json.load(fp)
105
+ # Load the dataset names
106
+ dataset_names_cfg = 'configs/pretrain_datasets.json' \
107
+ if dataset_type == 'pretrain' else 'configs/finetune_datasets.json'
108
+ with open(dataset_names_cfg, 'r') as file:
109
+ DATASET_NAMES = json.load(file)
110
+ # Create the mapping between dataset name and id
111
+ # self.dataset_name2id = {name: i for i, name in enumerate(DATASET_NAMES)}
112
+ # self.dataset_id2name = {i: name for i, name in enumerate(DATASET_NAMES)}
113
+ self.dataset_name2id = {task_name: 0}
114
+ self.dataset_id2name = {0: task_name}
115
+
116
+ self.image_processor = image_processor
117
+
118
+ self.buffer_dir = config["buf_path"]
119
+ self.num_chunks = config["buf_num_chunks"]
120
+ self.chunk_size = config["buf_chunk_size"]
121
+ self.tokenizer_max_length = config["tokenizer_max_length"]
122
+ self.image_aspect_ratio = config["image_aspect_ratio"]
123
+ self.state_noise_snr = state_noise_snr
124
+ self.num_cameras = num_cameras
125
+ self.img_history_size = img_history_size
126
+ self.cond_mask_prob = cond_mask_prob
127
+ self.cam_ext_mask_prob = cam_ext_mask_prob
128
+ self.use_hdf5 = use_hdf5
129
+ self.hdf5_dataset = None
130
+ if use_hdf5:
131
+ self.hdf5_dataset = AnubisHDF5VLADataset(task_name)
132
+ self.use_precomp_lang_embed = use_precomp_lang_embed
133
+ if use_precomp_lang_embed:
134
+ self.empty_lang_embed = torch.load("data/empty_lang_embed.pt")
135
+
136
+ # Load dataset stat
137
+ with open("configs/dataset_stat.json", 'r') as f:
138
+ dataset_stat = json.load(f)
139
+ self.dataset_stat = dataset_stat
140
+
141
+ self.tokenizer = tokenizer
142
+ self.image_size = image_size
143
+ self.auto_adjust_image_brightness = auto_adjust_image_brightness
144
+ self.image_aug = image_aug
145
+
146
+ self.last_content = None
147
+ self.last_meta = None
148
+
149
+ def get_dataset_name2id(self):
150
+ return self.dataset_name2id
151
+
152
+ def get_dataset_id2name(self):
153
+ return self.dataset_id2name
154
+
155
+ @staticmethod
156
+ def pairwise(iterable):
157
+ a = iter(iterable)
158
+ return zip(a, a)
159
+
160
+ @staticmethod
161
+ def _load_data_from_chunk(chunk_dir, chunk_item_idx):
162
+ # If error occurs, retry
163
+ time_stmp = time.time()
164
+ while time.time() - time_stmp < 10.0:
165
+ try:
166
+ locks = []
167
+ file_path = os.path.join(chunk_dir, f"json_content_{chunk_item_idx}.json")
168
+ lock = FileLock(file_path)
169
+ locks.append(lock)
170
+ lock.acquire_read_lock()
171
+ with open(file_path, 'r') as file:
172
+ json_content = json.load(file)
173
+ lock.release_lock()
174
+ file_path = os.path.join(chunk_dir, f"sample_{chunk_item_idx}.npz")
175
+ lock = FileLock(file_path)
176
+ locks.append(lock)
177
+ lock.acquire_read_lock()
178
+ with open(file_path, 'rb') as file:
179
+ sample_dict = np.load(file)
180
+ meta = tuple(sample_dict.values())
181
+ lock.release_lock()
182
+ return json_content, meta
183
+ except KeyboardInterrupt:
184
+ for lock in locks:
185
+ lock.release_lock()
186
+ raise KeyboardInterrupt
187
+ except BaseException:
188
+ for lock in locks:
189
+ lock.release_lock()
190
+ continue
191
+ raise RuntimeError("Failed to load sample.")
192
+
193
+ def __len__(self) -> int:
194
+ if self.use_hdf5:
195
+ return len(self.hdf5_dataset)
196
+ else:
197
+ return self.num_chunks * self.chunk_size
198
+
199
+ def _safe_load(self, index):
200
+ read_chunk_item_indices = []
201
+ # Start searching from a random chunk
202
+ read_chunk_idx = index // self.chunk_size
203
+ while len(read_chunk_item_indices) == 0:
204
+ read_chunk_dir = os.path.join(self.buffer_dir, f"chunk_{read_chunk_idx}")
205
+ try:
206
+ read_chunk_item_indices = get_clean_item(read_chunk_dir)
207
+ except BaseException as e:
208
+ # Print the error info
209
+ print("Error catched when searching a clean chunk:", e)
210
+ traceback.print_exc()
211
+ read_chunk_item_indices = []
212
+ read_chunk_idx = (read_chunk_idx + 1) % self.num_chunks
213
+
214
+ # read_chunk_item_index = random.choice(read_chunk_item_indices)
215
+ # read_chunk_item_index = read_chunk_item_indices.pop()
216
+ random_item_index = index % len(read_chunk_item_indices)
217
+ read_chunk_item_index = read_chunk_item_indices[random_item_index]
218
+
219
+ # Modify the dirty bit
220
+ try:
221
+ dirty_bit = read_dirty_bit(read_chunk_dir)
222
+ dirty_bit[read_chunk_item_index] = 1
223
+ save_dirty_bit(read_chunk_dir, dirty_bit)
224
+ except BaseException as e:
225
+ # Print the error info
226
+ print("Error catched when modifying the dirty bit:", e)
227
+ traceback.print_exc()
228
+
229
+ # load the sample
230
+ try:
231
+ content, meta = self._load_data_from_chunk(read_chunk_dir, read_chunk_item_index)
232
+ self.last_content, self.last_meta = content, meta
233
+ except BaseException as e:
234
+ # Print the error info
235
+ print("Error catched when loading sample:", e)
236
+ traceback.print_exc()
237
+
238
+ # If failed to load the data, return the last loaded data for robustness
239
+ content, meta = self.last_content, self.last_meta
240
+
241
+ return (content, *meta)
242
+
243
+ def __getitem__(self, index):
244
+ # For robustness, we will try to load the data until we succeed
245
+ while True:
246
+ data_dict = None
247
+ try:
248
+ if self.use_hdf5:
249
+ res = self.hdf5_dataset.get_item()
250
+ content = res['meta']
251
+ states = res['state']
252
+ actions = res['actions']
253
+ state_elem_mask = res['state_indicator']
254
+ image_metas = [
255
+ res['cam_high'], res['cam_high_mask'],
256
+ res['cam_right_wrist'], res['cam_right_wrist_mask'],
257
+ res['cam_left_wrist'], res['cam_left_wrist_mask'],
258
+ ]
259
+ state_std = res['state_std']
260
+ state_mean = res['state_mean']
261
+ state_norm = res['state_norm']
262
+ else:
263
+ (content, _, states, _, actions, _,
264
+ state_elem_mask, *image_metas,
265
+ state_std, state_mean, state_norm) = self._safe_load(index)
266
+
267
+ data_dict = {}
268
+ data_dict['dataset_name'] = content['dataset_name']
269
+ data_dict['data_idx'] = self.dataset_name2id[data_dict['dataset_name']]
270
+ data_dict['ctrl_freq'] = self.control_freq[data_dict['dataset_name']] \
271
+ if random.random() > self.cond_mask_prob else 0
272
+
273
+ if self.state_noise_snr is not None:
274
+ states += np.random.normal(
275
+ 0.0, state_std / np.sqrt(10 ** (self.state_noise_snr / 10)),
276
+ states.shape)
277
+ ds_state_mean = np.array(self.dataset_stat[data_dict['dataset_name']]['state_mean'])
278
+ ds_state_mean = np.tile(ds_state_mean[None], (states.shape[0], 1))
279
+ # Randomly mask the states by the mean state
280
+ data_dict["states"] = states \
281
+ if random.random() > self.cond_mask_prob else ds_state_mean
282
+ data_dict["actions"] = actions
283
+ data_dict["state_elem_mask"] = state_elem_mask \
284
+ if random.random() > self.cond_mask_prob else np.zeros_like(state_elem_mask)
285
+
286
+ # Stat for the episode that the step belongs to
287
+ data_dict["state_norm"] = state_norm
288
+
289
+ # We replace the invalid images with the background image
290
+ # and also randomly mask images by the background image
291
+ background_color = np.array([
292
+ int(x*255) for x in self.image_processor.image_mean
293
+ ], dtype=np.uint8).reshape(1, 1, 3)
294
+ background_image = np.ones((
295
+ self.image_processor.size["height"],
296
+ self.image_processor.size["width"], 3), dtype=np.uint8
297
+ ) * background_color
298
+
299
+ image_metas = list(self.pairwise(image_metas))
300
+ mask_probs = [self.cond_mask_prob] * self.num_cameras
301
+ if self.cam_ext_mask_prob >= 0.0:
302
+ mask_probs[0] = self.cam_ext_mask_prob
303
+ rearranged_images = []
304
+ for i in range(self.img_history_size):
305
+ for j in range(self.num_cameras):
306
+ images, image_mask = image_metas[j]
307
+ image, valid = images[i], image_mask[i]
308
+ if valid and (math.prod(image.shape) > 0) and \
309
+ (random.random() > mask_probs[j]):
310
+ rearranged_images.append((image, True))
311
+ else:
312
+ rearranged_images.append((background_image.copy(), False))
313
+
314
+ preprocessed_images = []
315
+ processor = self.image_processor
316
+ for image, valid in rearranged_images:
317
+ image = Image.fromarray(image)
318
+ if self.image_size is not None:
319
+ image = transforms.Resize(self.image_size)(image) # (1008, 336)
320
+ # assert image.height == 336, "We haven't prepare for training with images of different resolutions."
321
+
322
+ if valid and self.auto_adjust_image_brightness:
323
+ pixel_values = list(image.getdata())
324
+ average_brightness = sum(sum(pixel) for pixel in pixel_values) / (len(pixel_values) * 255.0 * 3)
325
+ if average_brightness <= 0.15:
326
+ image = transforms.ColorJitter(brightness=(1.75,1.75))(image)
327
+
328
+ # Only apply image augmentation to 50% of the images
329
+ if valid and self.image_aug and (random.random() > 0.5):
330
+ aug_type = random.choice([
331
+ "corrput_only", "color_only", "both"])
332
+ if aug_type != "corrput_only":
333
+ image = transforms.ColorJitter(
334
+ brightness=0.3, contrast=0.4, saturation=0.5, hue=0.03)(image)
335
+ if aug_type != "color_only":
336
+ image = image_corrupt(image)
337
+
338
+ if self.image_aspect_ratio == 'pad':
339
+ def expand2square(pil_img, background_color):
340
+ width, height = pil_img.size
341
+ if width == height:
342
+ return pil_img
343
+ elif width > height:
344
+ result = Image.new(pil_img.mode, (width, width), background_color)
345
+ result.paste(pil_img, (0, (width - height) // 2))
346
+ return result
347
+ else:
348
+ result = Image.new(pil_img.mode, (height, height), background_color)
349
+ result.paste(pil_img, ((height - width) // 2, 0))
350
+ return result
351
+ image = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
352
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
353
+ preprocessed_images.append(image)
354
+ data_dict["images"] = preprocessed_images
355
+
356
+ if self.use_precomp_lang_embed:
357
+ if content["instruction"][-1] == ".":
358
+ content["instruction"] = content["instruction"][:-1]
359
+ data_dict["lang_embed"] = torch.load(content["instruction"])['embeddings'][0] \
360
+ if random.random() > self.cond_mask_prob else self.empty_lang_embed ##FIXED
361
+ else:
362
+ instruction = content["instruction"] \
363
+ if random.random() > self.cond_mask_prob else ""
364
+ data_dict["input_ids"] = self.tokenizer(
365
+ instruction,
366
+ return_tensors="pt",
367
+ padding="longest",
368
+ truncation=False,
369
+ ).input_ids[0]
370
+
371
+ assert len(data_dict["input_ids"]) <= self.tokenizer_max_length, \
372
+ f"Instruction length {len(data_dict['input_ids'])} exceeds the maximum length {self.tokenizer_max_length}."
373
+
374
+ for k, v in data_dict.items():
375
+ if isinstance(v, np.ndarray):
376
+ data_dict[k] = torch.from_numpy(v)
377
+
378
+ for k, v in data_dict.items():
379
+ assert not isinstance(v, np.ndarray), f"key: {k}, value: {v}"
380
+ # data_dict[k] = torch.from_numpy(v)
381
+
382
+ return data_dict
383
+ except BaseException as e:
384
+ # Print the error info
385
+ if data_dict is not None:
386
+ print(f"Error catched when processing sample from {data_dict.get('dataset_name')}:", e)
387
+ else:
388
+ print(f"Error catched when processing sample:", e)
389
+ traceback.print_exc()
390
+ # Try incresing the index
391
+ index = (index + 1) % len(self)
392
+
393
+
394
+ class DataCollatorForVLAConsumerDataset(object):
395
+ """Collate examples for supervised training."""
396
+
397
+ def __init__(self, tokenizer: transformers.PreTrainedTokenizer) -> None:
398
+ self.tokenizer = tokenizer
399
+
400
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
401
+ batch = {
402
+ "states": [],
403
+ "actions": [],
404
+ "state_elem_mask": [],
405
+ "state_norm": [],
406
+ "images": [],
407
+ "data_indices": [],
408
+ "ctrl_freqs": []
409
+ }
410
+ input_ids = []
411
+ lang_embeds = []
412
+ lang_embed_lens = []
413
+
414
+ for instance in instances:
415
+ # Convert all the numpy arrays to tensor
416
+ keys_to_check = [
417
+ 'states', 'actions',
418
+ 'state_elem_mask', 'state_norm',
419
+ ]
420
+ for key in keys_to_check:
421
+ if isinstance(instance[key], torch.Tensor):
422
+ item = instance[key]
423
+ else:
424
+ item = torch.from_numpy(instance[key])
425
+ batch[key].append(item)
426
+
427
+ if "input_ids" in instance:
428
+ input_ids.append(instance["input_ids"])
429
+ else:
430
+ lang_embeds.append(instance["lang_embed"])
431
+ lang_embed_lens.append(instance["lang_embed"].shape[0])
432
+
433
+ batch["images"].append(torch.stack(instance["images"], dim=0))
434
+ batch["data_indices"].append(instance["data_idx"])
435
+ batch["ctrl_freqs"].append(instance["ctrl_freq"])
436
+
437
+ keys_to_stack = [
438
+ 'states', 'actions',
439
+ 'state_elem_mask', 'state_norm',
440
+ "images"
441
+ ]
442
+ for key in keys_to_stack:
443
+ batch[key] = torch.stack(batch[key], dim=0)
444
+
445
+ batch["ctrl_freqs"] = torch.tensor(batch["ctrl_freqs"])
446
+
447
+ if len(input_ids) > 0:
448
+ input_ids = torch.nn.utils.rnn.pad_sequence(
449
+ input_ids,
450
+ batch_first=True,
451
+ padding_value=self.tokenizer.pad_token_id)
452
+ batch["input_ids"] = input_ids
453
+ batch["lang_attn_mask"] = input_ids.ne(self.tokenizer.pad_token_id)
454
+ else:
455
+ lang_embeds = torch.nn.utils.rnn.pad_sequence(
456
+ lang_embeds,
457
+ batch_first=True,
458
+ padding_value=0)
459
+ input_lang_attn_mask = torch.zeros(
460
+ lang_embeds.shape[0], lang_embeds.shape[1], dtype=torch.bool)
461
+ for i, l in enumerate(lang_embed_lens):
462
+ input_lang_attn_mask[i, :l] = True
463
+ batch["lang_embeds"] = lang_embeds
464
+ batch["lang_attn_mask"] = input_lang_attn_mask
465
+
466
+
467
+ return batch
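
The buffer protocol used by VLAConsumerDataset relies on a per-chunk dirty-bit array: the producer fills a chunk and clears the bits, the consumer picks a clean slot, reads it, and sets its bit so the producer knows that slot can be replaced. A minimal sketch (not part of the commit) of that bookkeeping without the file locks:

import numpy as np

dirty_bit = np.zeros(8, dtype=np.uint8)     # one flag per sample slot in a chunk, 0 = clean
clean = np.where(1 - dirty_bit)[0].tolist() # indices of unread slots
item = clean[0]                             # consumer picks a clean slot ...
dirty_bit[item] = 1                         # ... and marks it dirty after reading
print(np.where(1 - dirty_bit)[0].tolist())  # remaining clean slots: [1, 2, 3, 4, 5, 6, 7]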
train/image_corrupt.py ADDED
@@ -0,0 +1,44 @@
1
+ import warnings
2
+ warnings.simplefilter(action='ignore', category=FutureWarning)
3
+
4
+ import numpy as np
5
+ np.bool = np.bool_
6
+ import imgaug.augmenters as iaa
7
+ from PIL import Image
8
+
9
+
10
+ # Define our sequence of augmentation steps that will be applied to every image.
11
+ seq = iaa.Sequential(
12
+ [
13
+ # Execute one of the following noise augmentations
14
+ iaa.OneOf([
15
+ iaa.AdditiveGaussianNoise(
16
+ loc=0, scale=(0.0, 0.05*255), per_channel=0.5
17
+ ),
18
+ iaa.AdditiveLaplaceNoise(scale=(0.0, 0.05*255), per_channel=0.5),
19
+ iaa.AdditivePoissonNoise(lam=(0.0, 0.05*255), per_channel=0.5)
20
+ ]),
21
+
22
+ # Execute one or none of the following blur augmentations
23
+ iaa.SomeOf((0, 1), [
24
+ iaa.OneOf([
25
+ iaa.GaussianBlur((0, 3.0)),
26
+ iaa.AverageBlur(k=(2, 7)),
27
+ iaa.MedianBlur(k=(3, 11)),
28
+ ]),
29
+ iaa.MotionBlur(k=(3, 36)),
30
+ ]),
31
+ ],
32
+ # do all of the above augmentations in random order
33
+ random_order=True
34
+ )
35
+
36
+
37
+ def image_corrupt(image: Image):
38
+ image_arr = np.array(image)
39
+ image_arr = image_arr[None, ...]
40
+
41
+ image_arr = seq(images=image_arr)
42
+
43
+ image = Image.fromarray(image_arr[0])
44
+ return image
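
A short usage sketch (not part of the commit) for the augmentation pipeline above, applied to a synthetic RGB image; it assumes imgaug is installed and the repository root is on PYTHONPATH:

import numpy as np
from PIL import Image
from train.image_corrupt import image_corrupt

img = Image.fromarray(np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8))
corrupted = image_corrupt(img)   # PIL image with random noise and/or blur applied
corrupted.save("corrupted_preview.png")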
train/sample.py ADDED
@@ -0,0 +1,99 @@
from collections import defaultdict

import torch
import torch.nn.functional as F


@torch.no_grad()
def log_sample_res(
    text_encoder, vision_encoder, rdt, args,
    accelerator, weight_dtype, dataset_id2name, dataloader, logger
):
    logger.info(
        f"Running sampling for {args.num_sample_batches} batches..."
    )

    rdt.eval()

    loss_for_log = defaultdict(float)
    loss_counter = defaultdict(int)
    for step, batch in enumerate(dataloader):
        if step >= args.num_sample_batches:
            break

        data_indices = batch["data_indices"]
        ctrl_freqs = batch["ctrl_freqs"]
        state_norm = batch["state_norm"].to(dtype=weight_dtype)
        images = batch["images"].to(dtype=weight_dtype)
        states = batch["states"].to(dtype=weight_dtype)
        # We only use the last state as input
        states = states[:, -1:, :]
        actions = batch["actions"].to(dtype=weight_dtype)
        state_elem_mask = batch["state_elem_mask"].to(dtype=weight_dtype)

        batch_size, _, C, H, W = images.shape
        image_embeds = vision_encoder(images.reshape(-1, C, H, W)).detach()
        image_embeds = image_embeds.reshape((batch_size, -1, vision_encoder.hidden_size))

        lang_attn_mask = batch["lang_attn_mask"]
        text_embeds = batch["lang_embeds"].to(dtype=weight_dtype) \
            if args.precomp_lang_embed \
            else text_encoder(
                input_ids=batch["input_ids"],
                attention_mask=lang_attn_mask
            )["last_hidden_state"].detach()

        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            pred_actions = rdt.predict_action(
                lang_tokens=text_embeds,
                lang_attn_mask=lang_attn_mask,
                img_tokens=image_embeds,
                state_tokens=states,
                action_mask=state_elem_mask.unsqueeze(1),
                ctrl_freqs=ctrl_freqs
            )

        num_steps = pred_actions.shape[1]
        expanded_state_elem_mask = state_elem_mask.unsqueeze(1).tile((1, num_steps, 1)).float()
        expanded_state_norm = state_norm.unsqueeze(1).tile((1, num_steps, 1)).float()

        loss = F.mse_loss(pred_actions, actions, reduction='none').float()

        mse_loss_per_entry = ((loss * expanded_state_elem_mask).reshape((batch_size, -1)).sum(1)
                              / expanded_state_elem_mask.reshape((batch_size, -1)).sum(1))
        l2_loss_per_entry = loss.sqrt() / (expanded_state_norm + 1e-3)
        l2_loss_per_entry = ((l2_loss_per_entry * expanded_state_elem_mask).reshape((batch_size, -1)).sum(1)
                             / expanded_state_elem_mask.reshape((batch_size, -1)).sum(1))

        dataset_indices, mse_losses, l2_losses = accelerator.gather_for_metrics(
            (torch.LongTensor(data_indices).to(device=pred_actions.device),
             mse_loss_per_entry, l2_loss_per_entry),
        )
        dataset_indices = dataset_indices.tolist()
        if accelerator.is_main_process:
            for loss_suffix, losses in zip(["_sample_mse", "_sample_l2err"], [mse_losses, l2_losses]):
                for dataset_idx, loss_tensor in zip(dataset_indices, losses):
                    loss_name = dataset_id2name[dataset_idx] + loss_suffix
                    loss_for_log[loss_name] += loss_tensor.item()
                    loss_counter[loss_name] += 1

        mse_loss = (loss * expanded_state_elem_mask).sum() / expanded_state_elem_mask.sum()
        mse_loss_scaler = accelerator.gather(mse_loss).mean().item()
        loss_for_log["overall_avg_sample_mse"] += mse_loss_scaler

        l2_loss = loss.sqrt() / (expanded_state_norm + 1e-3)
        l2_loss = (l2_loss * expanded_state_elem_mask).sum() / expanded_state_elem_mask.sum()
        l2_loss_scaler = accelerator.gather(l2_loss).mean().item()
        loss_for_log["overall_avg_sample_l2err"] += l2_loss_scaler

    for name in loss_for_log:
        if name in ["overall_avg_sample_mse", "overall_avg_sample_l2err"]:
            loss_scaler = loss_for_log[name]
            loss_for_log[name] = round(loss_scaler / (args.num_sample_batches), 4)
        else:
            loss_for_log[name] = round(loss_for_log[name] / loss_counter[name], 4)

    rdt.train()
    torch.cuda.empty_cache()

    return dict(loss_for_log)
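For reference, the two per-sample metrics that log_sample_res aggregates are a masked MSE and an L2 error normalized by the per-dimension state norm, both averaged only over valid action dimensions. A minimal self-contained sketch of the same masked averaging on toy tensors (all shapes and values are made up; only PyTorch is required):

import torch
import torch.nn.functional as F

B, T, D = 2, 4, 3                                    # toy batch, horizon, action dim
pred = torch.randn(B, T, D)                          # stands in for pred_actions
gt = torch.randn(B, T, D)                            # stands in for actions
mask = torch.tensor([[1., 1., 0.], [1., 0., 0.]])    # stands in for state_elem_mask (B, D)
norm = torch.ones(B, D)                              # stands in for state_norm (B, D)

loss = F.mse_loss(pred, gt, reduction='none')        # (B, T, D), no reduction
mask_t = mask.unsqueeze(1).tile((1, T, 1))           # expand the mask over the horizon
norm_t = norm.unsqueeze(1).tile((1, T, 1))

mse_per_sample = (loss * mask_t).reshape(B, -1).sum(1) / mask_t.reshape(B, -1).sum(1)
l2err_per_sample = ((loss.sqrt() / (norm_t + 1e-3)) * mask_t).reshape(B, -1).sum(1) \
    / mask_t.reshape(B, -1).sum(1)
print(mse_per_sample, l2err_per_sample)              # one value per sample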
train/train.py ADDED
@@ -0,0 +1,509 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import math
import os
from pathlib import Path

import diffusers
import torch
import torch.utils.checkpoint
import transformers
import yaml
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin, ProjectConfiguration, set_seed
from diffusers.optimization import get_scheduler
from diffusers.utils import is_wandb_available
from huggingface_hub import create_repo, upload_folder
from tqdm.auto import tqdm
from safetensors.torch import load_model

from models.ema_model import EMAModel
from models.multimodal_encoder.siglip_encoder import SiglipVisionTower
from models.multimodal_encoder.t5_encoder import T5Embedder
from models.rdt_runner import RDTRunner
from train.dataset import DataCollatorForVLAConsumerDataset, VLAConsumerDataset
from train.sample import log_sample_res


if is_wandb_available():
    import wandb


def save_model_card(repo_id: str, base_model: str = None, repo_folder=None):
    yaml = f"""
---
license: mit
base_model: {base_model}
language:
- en
pipeline_tag: robotics
library_name: transformers
tags:
- robotics
- pytorch
- multimodal
- pretraining
- vla
- diffusion
- rdt
---
    """
    model_card = f"""
# RDT - {repo_id}

This is an RDT model derived from {base_model}. The weights were trained using [RDT](https://rdt-robotics.github.io/rdt-robotics/).
    """
    with open(os.path.join(repo_folder, "README.md"), "w") as f:
        f.write(yaml + model_card)


def train(args, logger):
    # Read the config
    with open(args.config_path, "r") as fp:
        config = yaml.safe_load(fp)

    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit)
    accelerator = Accelerator(
        deepspeed_plugin=DeepSpeedPlugin(
            hf_ds_config=args.deepspeed
        ) if args.deepspeed is not None else None,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_dir=logging_dir,
        project_config=accelerator_project_config,
    )

    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
            repo_id = create_repo(
                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
            ).repo_id

    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    if args.precomp_lang_embed:
        tokenizer, text_encoder = None, None
    else:
        text_embedder = T5Embedder(from_pretrained=args.pretrained_text_encoder_name_or_path,
                                   model_max_length=config["dataset"]["tokenizer_max_length"], device=accelerator.device)
        tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    vision_encoder = SiglipVisionTower(vision_tower=args.pretrained_vision_encoder_name_or_path, args=None)
    image_processor = vision_encoder.image_processor

    # Load from a pretrained checkpoint
    if (
        args.pretrained_model_name_or_path is not None
        and not os.path.isfile(args.pretrained_model_name_or_path)
    ):
        logger.info("Constructing model from pretrained checkpoint.")
        rdt = RDTRunner.from_pretrained(args.pretrained_model_name_or_path)
    else:
        logger.info("Constructing model from provided config.")
        # Calculate the image condition length
        img_cond_len = (config["common"]["img_history_size"]
                        * config["common"]["num_cameras"]
                        * vision_encoder.num_patches)
        rdt = RDTRunner(
            action_dim=config["common"]["state_dim"],
            pred_horizon=config["common"]["action_chunk_size"],
            config=config["model"],
            lang_token_dim=config["model"]["lang_token_dim"],
            img_token_dim=config["model"]["img_token_dim"],
            state_token_dim=config["model"]["state_token_dim"],
            max_lang_cond_len=config["dataset"]["tokenizer_max_length"],
            img_cond_len=img_cond_len,
            img_pos_embed_config=[
                # No initial pos embed in the last grid size
                # since we've already done it in ViT
                ("image", (config["common"]["img_history_size"],
                           config["common"]["num_cameras"],
                           -vision_encoder.num_patches)),
            ],
            lang_pos_embed_config=[
                # Similarly, no initial pos embed for language
                ("lang", -config["dataset"]["tokenizer_max_length"]),
            ],
            dtype=weight_dtype,
        )


    ema_rdt = copy.deepcopy(rdt)
    ema_model = EMAModel(
        ema_rdt,
        update_after_step=config["model"]["ema"]["update_after_step"],
        inv_gamma=config["model"]["ema"]["inv_gamma"],
        power=config["model"]["ema"]["power"],
        min_value=config["model"]["ema"]["min_value"],
        max_value=config["model"]["ema"]["max_value"]
    )

    # Create a custom saving hook so that `accelerator.save_state(...)` serializes in a nice format,
    # which ensures the model is saved in HuggingFace format (config.json + pytorch_model.bin)
    def save_model_hook(models, weights, output_dir):
        if accelerator.is_main_process:
            for model in models:
                model_to_save = model.module if hasattr(model, "module") else model  # type: ignore
                if isinstance(model_to_save, type(accelerator.unwrap_model(rdt))):
                    model_to_save.save_pretrained(output_dir)

    accelerator.register_save_state_pre_hook(save_model_hook)

    if args.gradient_checkpointing:
        # TODO:
        raise NotImplementedError("Gradient checkpointing is not yet implemented.")

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # Optimizer creation
    params_to_optimize = rdt.parameters()
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Dataset and DataLoaders creation:
    train_dataset = VLAConsumerDataset(
        config=config["dataset"],
        tokenizer=tokenizer,
        image_processor=image_processor,
        num_cameras=config["common"]["num_cameras"],
        img_history_size=config["common"]["img_history_size"],
        dataset_type=args.dataset_type,
        image_aug=args.image_aug,
        cond_mask_prob=args.cond_mask_prob,
        cam_ext_mask_prob=args.cam_ext_mask_prob,
        state_noise_snr=args.state_noise_snr,
        use_hdf5=args.load_from_hdf5,
        use_precomp_lang_embed=args.precomp_lang_embed,
        task_name=args.dataset_name,
    )
    sample_dataset = VLAConsumerDataset(
        config=config["dataset"],
        tokenizer=tokenizer,
        image_processor=image_processor,
        num_cameras=config["common"]["num_cameras"],
        img_history_size=config["common"]["img_history_size"],
        dataset_type=args.dataset_type,
        image_aug=False,
        cond_mask_prob=0,
        cam_ext_mask_prob=-1,
        state_noise_snr=None,
        use_hdf5=args.load_from_hdf5,
        use_precomp_lang_embed=args.precomp_lang_embed,
        task_name=args.dataset_name,
    )

    data_collator = DataCollatorForVLAConsumerDataset(tokenizer)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        collate_fn=data_collator,
        num_workers=args.dataloader_num_workers,
        pin_memory=True,
        persistent_workers=True
    )
    sample_dataloader = torch.utils.data.DataLoader(
        sample_dataset,
        batch_size=args.sample_batch_size,
        shuffle=True,
        collate_fn=data_collator,
        num_workers=args.dataloader_num_workers,
        pin_memory=True,
        persistent_workers=True
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
        num_cycles=args.lr_num_cycles,
        power=args.lr_power,
    )

    # Prepare everything with our `accelerator`.
    rdt, optimizer, train_dataloader, sample_dataloader, lr_scheduler = accelerator.prepare(
        rdt, optimizer, train_dataloader, sample_dataloader, lr_scheduler
    )

    ema_rdt.to(accelerator.device, dtype=weight_dtype)

    if text_encoder is not None:
        text_encoder.to(accelerator.device, dtype=weight_dtype)

    if vision_encoder is not None:
        vision_encoder.vision_tower.to(accelerator.device, dtype=weight_dtype)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initialize automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers("roboticDiffusionTransformer", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    global_step = 0
    first_epoch = 0

    # Load from a pretrained checkpoint
    if (
        args.resume_from_checkpoint is None
        and args.pretrained_model_name_or_path is not None
        and os.path.isfile(args.pretrained_model_name_or_path)
    ):
        # Since EMA is deprecated, we do not load EMA from the pretrained checkpoint
        logger.info("Loading from a pretrained checkpoint.")
        checkpoint = torch.load(args.pretrained_model_name_or_path)
        rdt.module.load_state_dict(checkpoint["module"])

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(
                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
            )
            args.resume_from_checkpoint = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            try:
                accelerator.load_state(os.path.join(args.output_dir, path))  # load_module_strict=False
            except:
                # load deepspeed's state_dict
                logger.info("Resuming training state failed. Attempting to only load from model checkpoint.")
                checkpoint = torch.load(os.path.join(args.output_dir, path, "pytorch_model", "mp_rank_00_model_states.pt"))
                rdt.module.load_state_dict(checkpoint["module"])

            load_model(ema_rdt, os.path.join(args.output_dir, path, "ema", "model.safetensors"))
            global_step = int(path.split("-")[1])

            resume_global_step = global_step * args.gradient_accumulation_steps
            first_epoch = global_step // num_update_steps_per_epoch
            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    loss_for_log = {}
    for epoch in range(first_epoch, args.num_train_epochs):

        rdt.train()

        # Set the progress_bar to the correct position
        if args.resume_from_checkpoint and epoch == first_epoch:
            progress_bar.update(resume_step // args.gradient_accumulation_steps)

        # Forward and backward...
        for batch in train_dataloader:
            with accelerator.accumulate(rdt):
                images = batch["images"].to(dtype=weight_dtype)
                states = batch["states"].to(dtype=weight_dtype)  # (B, T, D_a)
                # We only use the last state as input
                states = states[:, -1:, :]
                actions = batch["actions"].to(dtype=weight_dtype)
                state_elem_mask = batch["state_elem_mask"].to(dtype=weight_dtype)
                ctrl_freqs = batch["ctrl_freqs"]

                with torch.no_grad():
                    batch_size, _, C, H, W = images.shape
                    image_embeds = vision_encoder(images.reshape(-1, C, H, W)).detach()
                    image_embeds = image_embeds.reshape((batch_size, -1, vision_encoder.hidden_size))

                    lang_attn_mask = batch["lang_attn_mask"]
                    text_embeds = batch["lang_embeds"].to(dtype=weight_dtype) \
                        if args.precomp_lang_embed \
                        else text_encoder(
                            input_ids=batch["input_ids"],
                            attention_mask=lang_attn_mask
                        )["last_hidden_state"].detach()

                state_elem_mask = state_elem_mask.unsqueeze(1)
                loss = rdt(
                    lang_tokens=text_embeds,
                    lang_attn_mask=lang_attn_mask,
                    img_tokens=image_embeds,
                    state_tokens=states,
                    action_gt=actions,
                    action_mask=state_elem_mask,
                    ctrl_freqs=ctrl_freqs
                )

                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    params_to_clip = rdt.parameters()
                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad(set_to_none=args.set_grads_to_none)

            ema_model.step(accelerator.unwrap_model(rdt))

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

                if global_step % args.checkpointing_period == 0:
                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                    accelerator.save_state(save_path)
                    ema_save_path = os.path.join(save_path, "ema")
                    accelerator.save_model(ema_rdt, ema_save_path)
                    logger.info(f"Saved state to {save_path}")

                if args.sample_period > 0 and global_step % args.sample_period == 0:
                    sample_loss_for_log = log_sample_res(
                        text_encoder,
                        vision_encoder,
                        rdt,  # We do not use EMA currently
                        args,
                        accelerator,
                        weight_dtype,
                        sample_dataset.get_dataset_id2name(),
                        sample_dataloader,
                        logger,
                    )
                    logger.info(sample_loss_for_log)
                    accelerator.log(sample_loss_for_log, step=global_step)

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            logs.update(loss_for_log)
            # logger.info(logs)
            accelerator.log(logs, step=global_step)

            if global_step >= args.max_train_steps:
                break

    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        accelerator.unwrap_model(rdt).save_pretrained(args.output_dir)
        ema_save_path = os.path.join(args.output_dir, "ema")
        accelerator.save_model(ema_rdt, ema_save_path)

        logger.info(f"Saved Model to {args.output_dir}")

        if args.push_to_hub:
            save_model_card(
                repo_id,
                base_model=args.pretrained_model_name_or_path,
                repo_folder=args.output_dir,
            )
            upload_folder(
                repo_id=repo_id,
                folder_path=args.output_dir,
                commit_message="End of training",
                token=args.hub_token,
                allow_patterns=["pytorch_model.bin", "*.json", "*.md"],
                # ignore_patterns=["step_*", "epoch_*"],
            )

    accelerator.end_training()
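One detail in train() that is easy to misread is the resume bookkeeping after accelerator.load_state: global_step counts optimizer steps, while the inner dataloader loop counts micro-batches. A small worked example of the same arithmetic with made-up numbers (not taken from any real run):

# Hypothetical run: 1000 optimizer steps per epoch, gradient_accumulation_steps = 4,
# resuming from "checkpoint-2500", i.e. global_step = 2500.
num_update_steps_per_epoch = 1000
gradient_accumulation_steps = 4
global_step = 2500

resume_global_step = global_step * gradient_accumulation_steps        # 10000 micro-batches already seen
first_epoch = global_step // num_update_steps_per_epoch               # resume inside epoch 2
resume_step = resume_global_step % (num_update_steps_per_epoch
                                    * gradient_accumulation_steps)    # 2000 micro-batches into epoch 2

# The progress bar is then advanced by resume_step // gradient_accumulation_steps,
# i.e. the 500 optimizer steps already completed within epoch 2.
assert resume_step // gradient_accumulation_steps == 500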