preview_dataset

Sleeping

App Files Files Community

Emil Ernerfeldt commited on Apr 18, 2024

Commit

23aef68

1 Parent(s): d63b041

More general support for any HuggingFace dataset, with streaming

Browse files

Files changed (3) hide show

README.md +10 -0
main.py +37 -33
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -15,5 +15,15 @@ pip install -r requirements.txt
 python main.py
 ```
 ## Note for the maintainer
 You can update this repository with the latest changes from https://github.com/rerun-io/rerun_template by running `scripts/template_update.py update --languages python`.

 python main.py
 ```
+Example datasets to explore (use `python main.py --dataset`):
+* `lerobot/aloha_sim_insertion_human`
+* `lerobot/aloha_sim_insertion_scripted`
+* `lerobot/aloha_sim_transfer_cube_human`
+* `lerobot/aloha_sim_transfer_cube_scripted`
+* `lerobot/pusht`
+* `lerobot/xarm_lift_medium`
+* `nateraw/kitti`
+* `sayakpaul/nyu_depth_v2`
 ## Note for the maintainer
 You can update this repository with the latest changes from https://github.com/rerun-io/rerun_template by running `scripts/template_update.py update --languages python`.

main.py CHANGED Viewed

@@ -3,12 +3,37 @@
 from __future__ import annotations
 import argparse
 import rerun as rr
 from datasets import load_dataset
 from PIL import Image
 from tqdm import tqdm
 def log_dataset_to_rerun(dataset) -> None:
     # Special time-like columns
@@ -17,10 +42,7 @@ def log_dataset_to_rerun(dataset) -> None:
     # Ignore these columns
     IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
-    num_rows = len(dataset)
-    for row_nr in tqdm(range(num_rows)):
-        row = dataset[row_nr]
         # Handle time-like columns first, since they set a state (time is an index in Rerun):
         for column_name in TIME_LIKE:
             if column_name in row:
@@ -32,50 +54,32 @@ def log_dataset_to_rerun(dataset) -> None:
                 else:
                     print(f"Unknown time-like column {column_name} with value {cell}")
-        # Now log actual data columns
-        for column_name in dataset.column_names:
             if column_name in TIME_LIKE or column_name in IGNORE:
                 continue
-            cell = row[column_name]
-            if isinstance(cell, Image.Image):
-                rr.log(column_name, rr.Image(cell))
-            elif isinstance(cell, list):
-                rr.log(column_name, rr.BarChart(cell))
-            elif isinstance(cell, float) or isinstance(cell, int):
-                rr.log(column_name, rr.Scalar(cell))
-            else:
-                # TODO(emilk): check if it is a tensor and then log it using rr.Tensor
-                rr.log(column_name, rr.TextDocument(str(cell)))
 def main():
-    # Define the available datasets
-    available_datasets = [
-        "lerobot/aloha_sim_insertion_human",
-        "lerobot/aloha_sim_insertion_scripted",
-        "lerobot/aloha_sim_transfer_cube_human",
-        "lerobot/aloha_sim_transfer_cube_scripted",
-        "lerobot/pusht",
-        "lerobot/xarm_lift_medium",
-    ]
-    # Create the parser
     parser = argparse.ArgumentParser(description="Log a HuggingFace dataset to Rerun.")
-    parser.add_argument("--dataset", choices=available_datasets, default="pusht", help="The name of the dataset to load")
     parser.add_argument("--episode-id", default=1, help="Which episode to select")
-    # Parse the arguments
     args = parser.parse_args()
     print("Loading dataset…")
-    dataset = load_dataset(args.dataset, split="train")
     print(f"Selecting episode {args.episode_id}…")
-    ds_subset = dataset.filter(lambda frame: frame["episode_id"] == args.episode_id)
     print("Starting Rerun…")
-    rr.init("rerun_example_lerobot", spawn=True)
     print("Logging to Rerun…")
     log_dataset_to_rerun(ds_subset)

 from __future__ import annotations
 import argparse
+import logging
+from typing import Any
+import numpy as np
 import rerun as rr
 from datasets import load_dataset
 from PIL import Image
 from tqdm import tqdm
+logger = logging.getLogger(__name__)
+def to_rerun(column_name: str, value: Any) -> Any:
+    """Do our best to interpret the value and convert it to a Rerun-compatible archetype."""
+    if isinstance(value, Image.Image):
+        if "depth" in column_name:
+            return rr.DepthImage(value)
+        else:
+            return rr.Image(value)
+    elif isinstance(value, np.ndarray):
+        return rr.Tensor(value)
+    elif isinstance(value, list):
+        if isinstance(value[0], float):
+            return rr.BarChart(value)
+        else:
+            return rr.TextDocument(str(value))  # Fallback to text
+    elif isinstance(value, float) or isinstance(value, int):
+        return rr.Scalar(value)
+    else:
+        return rr.TextDocument(str(value))  # Fallback to text
 def log_dataset_to_rerun(dataset) -> None:
     # Special time-like columns
     # Ignore these columns
     IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
+    for row in tqdm(dataset):
         # Handle time-like columns first, since they set a state (time is an index in Rerun):
         for column_name in TIME_LIKE:
             if column_name in row:
                 else:
                     print(f"Unknown time-like column {column_name} with value {cell}")
+        # Now log actual data columns:
+        for column_name, cell in row.items():
             if column_name in TIME_LIKE or column_name in IGNORE:
                 continue
+            rr.log(column_name, to_rerun(column_name, cell))
 def main():
+    # Ensure the logging gets written to stderr:
+    logging.getLogger().addHandler(logging.StreamHandler())
+    logging.getLogger().setLevel(logging.INFO)
     parser = argparse.ArgumentParser(description="Log a HuggingFace dataset to Rerun.")
+    parser.add_argument("--dataset", default="lerobot/pusht", help="The name of the dataset to load")
     parser.add_argument("--episode-id", default=1, help="Which episode to select")
     args = parser.parse_args()
     print("Loading dataset…")
+    dataset = load_dataset(args.dataset, split="train", streaming=True)
     print(f"Selecting episode {args.episode_id}…")
+    ds_subset = dataset.filter(lambda frame: "episode_id" not in frame or frame["episode_id"] == args.episode_id)
     print("Starting Rerun…")
+    rr.init(f"rerun_example_lerobot {args.dataset}", spawn=True)
     print("Logging to Rerun…")
     log_dataset_to_rerun(ds_subset)

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 datasets
-Pillow
 rerun-sdk>=0.15.0,<0.16.0
 tqdm

 datasets
+h5py
+pillow
 rerun-sdk>=0.15.0,<0.16.0
 tqdm