mertunsall committed on
Commit
8dc7996
·
1 Parent(s): 2a35e5e
Files changed (1) hide show
  1. app.py +46 -30
app.py CHANGED
@@ -26,26 +26,18 @@ def _extract_top_level(repo_id: str) -> tuple[list[str], list[str]]:
26
  return top_level_dirs, top_level_files
27
 
28
 
29
- def _summarize_directory(repo_id: str, directory: str) -> dict:
30
- """Return a lightweight summary of the contents of a top-level directory."""
31
  if not directory:
32
- return {}
33
 
34
  files = [path for path in _list_repo_files(repo_id) if path.startswith(f"{directory}/")]
35
  relative_paths = [path[len(directory) + 1 :] for path in files]
36
-
 
37
  child_dirs = sorted({rel.split("/", 1)[0] for rel in relative_paths if "/" in rel})
38
- child_files = sorted(rel for rel in relative_paths if rel and "/" not in rel)
39
-
40
- sample_files = child_files[:10]
41
- has_more_files = len(child_files) > len(sample_files)
42
-
43
- return {
44
- "folder": directory,
45
- "total_files": len(files),
46
- "direct_subdirectories": child_dirs,
47
- "sample_files": sample_files + (["..."] if has_more_files else []),
48
- }
49
 
50
 
51
  def refresh_repo(repo_id: str):
@@ -55,17 +47,17 @@ def refresh_repo(repo_id: str):
55
  print(f"[refresh_repo] Hub HTTP error for {repo_id}: {error}", flush=True)
56
  print(traceback.format_exc(), flush=True)
57
  return (
 
58
  gr.update(choices=[], value=None, interactive=False),
59
  gr.update(value=f"❌ Unable to load repo `{repo_id}`: {error}"),
60
- {}
61
  )
62
  except Exception as error: # pragma: no cover - network and auth edge cases
63
  print(f"[refresh_repo] Unexpected error for {repo_id}: {error}", flush=True)
64
  print(traceback.format_exc(), flush=True)
65
  return (
 
66
  gr.update(choices=[], value=None, interactive=False),
67
  gr.update(value=f"❌ Unexpected error loading `{repo_id}`: {error}"),
68
- {}
69
  )
70
 
71
  status_lines = [
@@ -79,26 +71,50 @@ def refresh_repo(repo_id: str):
79
  status_lines.append("• No sub-folders found at root.")
80
 
81
  dropdown_value = top_dirs[0] if top_dirs else None
82
- dropdown_update = gr.update(
 
 
 
 
 
83
  choices=top_dirs,
84
  value=dropdown_value,
85
  interactive=bool(top_dirs),
86
  label="Top-level folders",
87
  info="Choose a folder to explore"
88
  )
 
 
 
 
 
 
 
 
89
 
90
- folder_summary = _summarize_directory(repo_id, dropdown_value) if dropdown_value else {}
91
-
92
- return dropdown_update, gr.update(value="\n".join(status_lines)), folder_summary
93
 
94
 
95
- def update_directory(repo_id: str, directory: str):
 
96
  try:
97
- return _summarize_directory(repo_id, directory)
 
 
 
 
 
 
 
 
 
 
 
 
98
  except Exception as error:
99
- print(f"[update_directory] Error for {repo_id}/{directory}: {error}", flush=True)
100
  print(traceback.format_exc(), flush=True)
101
- return {}
102
 
103
 
104
  with gr.Blocks(title="HF Dataset Explorer") as demo:
@@ -119,24 +135,24 @@ Provide a dataset repository ID (e.g. `org/dataset`) to list its top-level folde
119
 
120
  status_display = gr.Markdown()
121
  folder_dropdown = gr.Dropdown(label="Top-level folders", interactive=False)
122
- folder_details = gr.JSON(label="Folder summary")
123
 
124
  reload_button.click(
125
  refresh_repo,
126
  inputs=repo_id_input,
127
- outputs=[folder_dropdown, status_display, folder_details],
128
  )
129
 
130
  folder_dropdown.change(
131
- update_directory,
132
  inputs=[repo_id_input, folder_dropdown],
133
- outputs=folder_details,
134
  )
135
 
136
  demo.load(
137
  refresh_repo,
138
  inputs=repo_id_input,
139
- outputs=[folder_dropdown, status_display, folder_details],
140
  )
141
 
142
  if __name__ == "__main__":
 
26
  return top_level_dirs, top_level_files
27
 
28
 
29
+ def _get_subdirectories(repo_id: str, directory: str) -> list[str]:
30
+ """Return the direct subdirectories of the given directory."""
31
  if not directory:
32
+ return []
33
 
34
  files = [path for path in _list_repo_files(repo_id) if path.startswith(f"{directory}/")]
35
  relative_paths = [path[len(directory) + 1 :] for path in files]
36
+
37
+ # Get immediate subdirectories (first level only)
38
  child_dirs = sorted({rel.split("/", 1)[0] for rel in relative_paths if "/" in rel})
39
+
40
+ return child_dirs
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  def refresh_repo(repo_id: str):
 
47
  print(f"[refresh_repo] Hub HTTP error for {repo_id}: {error}", flush=True)
48
  print(traceback.format_exc(), flush=True)
49
  return (
50
+ gr.update(choices=[], value=None, interactive=False),
51
  gr.update(choices=[], value=None, interactive=False),
52
  gr.update(value=f"❌ Unable to load repo `{repo_id}`: {error}"),
 
53
  )
54
  except Exception as error: # pragma: no cover - network and auth edge cases
55
  print(f"[refresh_repo] Unexpected error for {repo_id}: {error}", flush=True)
56
  print(traceback.format_exc(), flush=True)
57
  return (
58
+ gr.update(choices=[], value=None, interactive=False),
59
  gr.update(choices=[], value=None, interactive=False),
60
  gr.update(value=f"❌ Unexpected error loading `{repo_id}`: {error}"),
 
61
  )
62
 
63
  status_lines = [
 
71
  status_lines.append("• No sub-folders found at root.")
72
 
73
  dropdown_value = top_dirs[0] if top_dirs else None
74
+
75
+ # Get subdirectories for the first top-level folder
76
+ subdirs = _get_subdirectories(repo_id, dropdown_value) if dropdown_value else []
77
+ subdir_value = subdirs[0] if subdirs else None
78
+
79
+ first_dropdown_update = gr.update(
80
  choices=top_dirs,
81
  value=dropdown_value,
82
  interactive=bool(top_dirs),
83
  label="Top-level folders",
84
  info="Choose a folder to explore"
85
  )
86
+
87
+ second_dropdown_update = gr.update(
88
+ choices=subdirs,
89
+ value=subdir_value,
90
+ interactive=bool(subdirs),
91
+ label="Subdirectories",
92
+ info="Choose a subdirectory"
93
+ )
94
 
95
+ return first_dropdown_update, second_dropdown_update, gr.update(value="\n".join(status_lines))
 
 
96
 
97
 
98
+ def update_second_dropdown(repo_id: str, top_level_dir: str):
99
+ """Update the second dropdown when the first dropdown selection changes."""
100
  try:
101
+ if not top_level_dir:
102
+ return gr.update(choices=[], value=None, interactive=False)
103
+
104
+ subdirs = _get_subdirectories(repo_id, top_level_dir)
105
+ subdir_value = subdirs[0] if subdirs else None
106
+
107
+ return gr.update(
108
+ choices=subdirs,
109
+ value=subdir_value,
110
+ interactive=bool(subdirs),
111
+ label="Subdirectories",
112
+ info="Choose a subdirectory"
113
+ )
114
  except Exception as error:
115
+ print(f"[update_second_dropdown] Error for {repo_id}/{top_level_dir}: {error}", flush=True)
116
  print(traceback.format_exc(), flush=True)
117
+ return gr.update(choices=[], value=None, interactive=False)
118
 
119
 
120
  with gr.Blocks(title="HF Dataset Explorer") as demo:
 
135
 
136
  status_display = gr.Markdown()
137
  folder_dropdown = gr.Dropdown(label="Top-level folders", interactive=False)
138
+ subfolder_dropdown = gr.Dropdown(label="Subdirectories", interactive=False)
139
 
140
  reload_button.click(
141
  refresh_repo,
142
  inputs=repo_id_input,
143
+ outputs=[folder_dropdown, subfolder_dropdown, status_display],
144
  )
145
 
146
  folder_dropdown.change(
147
+ update_second_dropdown,
148
  inputs=[repo_id_input, folder_dropdown],
149
+ outputs=subfolder_dropdown,
150
  )
151
 
152
  demo.load(
153
  refresh_repo,
154
  inputs=repo_id_input,
155
+ outputs=[folder_dropdown, subfolder_dropdown, status_display],
156
  )
157
 
158
  if __name__ == "__main__":