mertunsall committed on
Commit
a51c831
·
1 Parent(s): 36ea8e3

Initial commit

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+
3
+ import gradio as gr
4
+ from huggingface_hub import HfApi, HfHubHTTPError
5
+
6
+
7
# Dataset repository pre-filled in the repo ID textbox when the app starts.
DEFAULT_REPO_ID = "mlfoundations-cua-dev/human_eval"


# One Hub client, created at import time and reused for every file listing.
api = HfApi()
11
+
12
+
13
@lru_cache(maxsize=16)
def _list_repo_files(repo_id: str) -> list[str]:
    """List every file path in the dataset repository ``repo_id``.

    The lookup goes through the module-level ``api`` client with
    ``repo_type="dataset"``. Results are memoized per ``repo_id`` (at most 16
    entries), so repeated calls for the same repository hand back the same
    list object; callers should treat it as read-only.
    """
    repo_files = api.list_repo_files(repo_type="dataset", repo_id=repo_id)
    return repo_files
17
+
18
+
19
def _extract_top_level(repo_id: str) -> tuple[list[str], list[str]]:
    """Partition a repository listing into root folders and root files.

    Returns:
        A ``(folders, files)`` pair, both sorted. ``folders`` holds the unique
        first path segment of every path that contains a ``/``; ``files``
        holds the paths that contain no ``/`` at all.
    """
    folder_names: set[str] = set()
    root_files: list[str] = []

    for entry in _list_repo_files(repo_id):
        head, slash, _ = entry.partition("/")
        if slash:
            # The path lives under a folder; only its first segment matters.
            folder_names.add(head)
        else:
            root_files.append(entry)

    root_files.sort()
    return sorted(folder_names), root_files
25
+
26
+
27
+ def _summarize_directory(repo_id: str, directory: str) -> dict:
28
+ """Return a lightweight summary of the contents of a top-level directory."""
29
+ if not directory:
30
+ return {}
31
+
32
+ files = [path for path in _list_repo_files(repo_id) if path.startswith(f"{directory}/")]
33
+ relative_paths = [path[len(directory) + 1 :] for path in files]
34
+
35
+ child_dirs = sorted({rel.split("/", 1)[0] for rel in relative_paths if "/" in rel})
36
+ child_files = sorted(rel for rel in relative_paths if rel and "/" not in rel)
37
+
38
+ sample_files = child_files[:10]
39
+ has_more_files = len(child_files) > len(sample_files)
40
+
41
+ return {
42
+ "folder": directory,
43
+ "total_files": len(files),
44
+ "direct_subdirectories": child_dirs,
45
+ "sample_files": sample_files + (["..."] if has_more_files else []),
46
+ }
47
+
48
+
49
def refresh_repo(repo_id: str):
    """Load a repository listing and build updates for the three output widgets.

    Uses ``gr.update(...)`` rather than the per-component
    ``gr.Dropdown.update`` / ``gr.Markdown.update`` class methods: those were
    removed in Gradio 4, while ``gr.update`` is available in both Gradio 3
    and later releases.

    Args:
        repo_id: Dataset repository ID taken from the textbox.

    Returns:
        A 3-tuple ``(dropdown_update, status_update, folder_summary)`` in the
        order the event handlers list their outputs. On failure the dropdown
        is emptied and disabled, the status carries the error message, and
        the summary is ``{}``.
    """

    def _failure(message: str):
        # Shared shape for both error branches: disabled dropdown, message, no summary.
        return (
            gr.update(choices=[], value=None, interactive=False),
            gr.update(value=message),
            {},
        )

    try:
        top_dirs, top_files = _extract_top_level(repo_id)
    except HfHubHTTPError as error:
        return _failure(f"❌ Unable to load repo `{repo_id}`: {error}")
    except Exception as error:  # pragma: no cover - network and auth edge cases
        return _failure(f"❌ Unexpected error loading `{repo_id}`: {error}")

    status_lines = [
        f"✅ Loaded `{repo_id}`",
        f"• Top-level folders: {len(top_dirs)}",
    ]

    if top_files:
        status_lines.append(f"• Loose files at root: {len(top_files)}")
    if not top_dirs:
        status_lines.append("• No sub-folders found at root.")

    # Preselect the first folder so the summary panel is populated right away.
    dropdown_value = top_dirs[0] if top_dirs else None
    dropdown_update = gr.update(
        choices=top_dirs,
        value=dropdown_value,
        interactive=bool(top_dirs),
        label="Top-level folders",
        info="Choose a folder to explore",
    )

    folder_summary = _summarize_directory(repo_id, dropdown_value) if dropdown_value else {}

    return dropdown_update, gr.update(value="\n".join(status_lines)), folder_summary
87
+
88
+
89
def update_directory(repo_id: str, directory: str):
    """Handle a dropdown change by summarizing the newly selected folder.

    Delegates to ``_summarize_directory`` and returns its result unchanged.
    """
    summary = _summarize_directory(repo_id, directory)
    return summary
91
+
92
+
93
with gr.Blocks(title="HF Dataset Explorer") as demo:
    # --- Layout -----------------------------------------------------------
    gr.Markdown(
        """# Hugging Face Dataset Explorer

Provide a dataset repository ID (e.g. `org/dataset`) to list its top-level folders."""
    )

    with gr.Row():
        repo_id_input = gr.Textbox(
            value=DEFAULT_REPO_ID,
            label="Dataset repo ID",
            placeholder="owner/dataset",
            info="Any public dataset on the Hugging Face Hub",
        )
        reload_button = gr.Button("Load repo", variant="primary")

    status_display = gr.Markdown()
    folder_dropdown = gr.Dropdown(label="Top-level folders", interactive=False)
    folder_details = gr.JSON(label="Folder summary")

    # --- Event wiring -----------------------------------------------------
    # refresh_repo returns (dropdown, status, summary); the same output list
    # serves both the button click and the initial page load.
    refresh_outputs = [folder_dropdown, status_display, folder_details]

    reload_button.click(
        refresh_repo,
        inputs=repo_id_input,
        outputs=refresh_outputs,
    )

    # Picking a different folder only refreshes the summary panel.
    folder_dropdown.change(
        update_directory,
        inputs=[repo_id_input, folder_dropdown],
        outputs=folder_details,
    )

    # Populate the UI for the default repository as soon as the page opens.
    demo.load(
        refresh_repo,
        inputs=repo_id_input,
        outputs=refresh_outputs,
    )

if __name__ == "__main__":
    demo.launch()