Emil Ernerfeldt committed on
Commit
23aef68
·
1 Parent(s): d63b041

More general support for any HuggingFace dataset, with streaming

Browse files
Files changed (3) hide show
  1. README.md +10 -0
  2. main.py +37 -33
  3. requirements.txt +2 -1
README.md CHANGED
@@ -15,5 +15,15 @@ pip install -r requirements.txt
15
  python main.py
16
  ```
17
 
 
 
 
 
 
 
 
 
 
 
18
  ## Note for the maintainer
19
  You can update this repository with the latest changes from https://github.com/rerun-io/rerun_template by running `scripts/template_update.py update --languages python`.
 
15
  python main.py
16
  ```
17
 
18
+ Example datasets to explore (pass one with `python main.py --dataset <name>`):
19
+ * `lerobot/aloha_sim_insertion_human`
20
+ * `lerobot/aloha_sim_insertion_scripted`
21
+ * `lerobot/aloha_sim_transfer_cube_human`
22
+ * `lerobot/aloha_sim_transfer_cube_scripted`
23
+ * `lerobot/pusht`
24
+ * `lerobot/xarm_lift_medium`
25
+ * `nateraw/kitti`
26
+ * `sayakpaul/nyu_depth_v2`
27
+
28
  ## Note for the maintainer
29
  You can update this repository with the latest changes from https://github.com/rerun-io/rerun_template by running `scripts/template_update.py update --languages python`.
main.py CHANGED
@@ -3,12 +3,37 @@
3
  from __future__ import annotations
4
 
5
  import argparse
 
 
6
 
 
7
  import rerun as rr
8
  from datasets import load_dataset
9
  from PIL import Image
10
  from tqdm import tqdm
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def log_dataset_to_rerun(dataset) -> None:
14
  # Special time-like columns
@@ -17,10 +42,7 @@ def log_dataset_to_rerun(dataset) -> None:
17
  # Ignore these columns
18
  IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
19
 
20
- num_rows = len(dataset)
21
- for row_nr in tqdm(range(num_rows)):
22
- row = dataset[row_nr]
23
-
24
  # Handle time-like columns first, since they set a state (time is an index in Rerun):
25
  for column_name in TIME_LIKE:
26
  if column_name in row:
@@ -32,50 +54,32 @@ def log_dataset_to_rerun(dataset) -> None:
32
  else:
33
  print(f"Unknown time-like column {column_name} with value {cell}")
34
 
35
- # Now log actual data columns
36
- for column_name in dataset.column_names:
37
  if column_name in TIME_LIKE or column_name in IGNORE:
38
  continue
39
 
40
- cell = row[column_name]
41
- if isinstance(cell, Image.Image):
42
- rr.log(column_name, rr.Image(cell))
43
- elif isinstance(cell, list):
44
- rr.log(column_name, rr.BarChart(cell))
45
- elif isinstance(cell, float) or isinstance(cell, int):
46
- rr.log(column_name, rr.Scalar(cell))
47
- else:
48
- # TODO(emilk): check if it is a tensor and then log it using rr.Tensor
49
- rr.log(column_name, rr.TextDocument(str(cell)))
50
 
51
 
52
  def main():
53
- # Define the available datasets
54
- available_datasets = [
55
- "lerobot/aloha_sim_insertion_human",
56
- "lerobot/aloha_sim_insertion_scripted",
57
- "lerobot/aloha_sim_transfer_cube_human",
58
- "lerobot/aloha_sim_transfer_cube_scripted",
59
- "lerobot/pusht",
60
- "lerobot/xarm_lift_medium",
61
- ]
62
-
63
- # Create the parser
64
  parser = argparse.ArgumentParser(description="Log a HuggingFace dataset to Rerun.")
65
- parser.add_argument("--dataset", choices=available_datasets, default="pusht", help="The name of the dataset to load")
66
  parser.add_argument("--episode-id", default=1, help="Which episode to select")
67
-
68
- # Parse the arguments
69
  args = parser.parse_args()
70
 
71
  print("Loading dataset…")
72
- dataset = load_dataset(args.dataset, split="train")
73
 
74
  print(f"Selecting episode {args.episode_id}…")
75
- ds_subset = dataset.filter(lambda frame: frame["episode_id"] == args.episode_id)
76
 
77
  print("Starting Rerun…")
78
- rr.init("rerun_example_lerobot", spawn=True)
79
 
80
  print("Logging to Rerun…")
81
  log_dataset_to_rerun(ds_subset)
 
3
  from __future__ import annotations
4
 
5
  import argparse
6
+ import logging
7
+ from typing import Any
8
 
9
+ import numpy as np
10
  import rerun as rr
11
  from datasets import load_dataset
12
  from PIL import Image
13
  from tqdm import tqdm
14
 
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def to_rerun(column_name: str, value: Any) -> Any:
    """Do our best to interpret the value and convert it to a Rerun-compatible archetype.

    Args:
        column_name: Name of the dataset column the value came from. A PIL
            image whose column name contains "depth" is logged as a depth image.
        value: The cell value to convert.

    Returns:
        One of `rr.DepthImage`, `rr.Image`, `rr.Tensor`, `rr.BarChart`,
        `rr.Scalar` or, as a fallback for anything else, `rr.TextDocument`
        holding `str(value)`.
    """
    if isinstance(value, Image.Image):
        if "depth" in column_name:
            return rr.DepthImage(value)
        return rr.Image(value)

    if isinstance(value, np.ndarray):
        return rr.Tensor(value)

    if isinstance(value, list):
        # Only plot non-empty lists whose elements are *all* plain numbers.
        # Checking every element (rather than just the first) avoids an
        # IndexError on empty lists and keeps mixed lists out of the bar chart.
        # `bool` is a subclass of `int`, so it is excluded explicitly.
        is_numeric = bool(value) and all(
            isinstance(item, (int, float)) and not isinstance(item, bool) for item in value
        )
        if is_numeric:
            return rr.BarChart(value)
        return rr.TextDocument(str(value))  # Fallback to text

    if isinstance(value, (float, int)):
        return rr.Scalar(value)

    return rr.TextDocument(str(value))  # Fallback to text
36
+
37
 
38
  def log_dataset_to_rerun(dataset) -> None:
39
  # Special time-like columns
 
42
  # Ignore these columns
43
  IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
44
 
45
+ for row in tqdm(dataset):
 
 
 
46
  # Handle time-like columns first, since they set a state (time is an index in Rerun):
47
  for column_name in TIME_LIKE:
48
  if column_name in row:
 
54
  else:
55
  print(f"Unknown time-like column {column_name} with value {cell}")
56
 
57
+ # Now log actual data columns:
58
+ for column_name, cell in row.items():
59
  if column_name in TIME_LIKE or column_name in IGNORE:
60
  continue
61
 
62
+ rr.log(column_name, to_rerun(column_name, cell))
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  def main():
66
+ # Ensure the logging gets written to stderr:
67
+ logging.getLogger().addHandler(logging.StreamHandler())
68
+ logging.getLogger().setLevel(logging.INFO)
69
+
 
 
 
 
 
 
 
70
  parser = argparse.ArgumentParser(description="Log a HuggingFace dataset to Rerun.")
71
+ parser.add_argument("--dataset", default="lerobot/pusht", help="The name of the dataset to load")
72
  parser.add_argument("--episode-id", default=1, help="Which episode to select")
 
 
73
  args = parser.parse_args()
74
 
75
  print("Loading dataset…")
76
+ dataset = load_dataset(args.dataset, split="train", streaming=True)
77
 
78
  print(f"Selecting episode {args.episode_id}…")
79
+ ds_subset = dataset.filter(lambda frame: "episode_id" not in frame or frame["episode_id"] == args.episode_id)
80
 
81
  print("Starting Rerun…")
82
+ rr.init(f"rerun_example_lerobot {args.dataset}", spawn=True)
83
 
84
  print("Logging to Rerun…")
85
  log_dataset_to_rerun(ds_subset)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  datasets
2
- Pillow
 
3
  rerun-sdk>=0.15.0,<0.16.0
4
  tqdm
 
1
  datasets
2
+ h5py
3
+ pillow
4
  rerun-sdk>=0.15.0,<0.16.0
5
  tqdm