cgeorgiaw (HF Staff) committed
Commit a2b2adc · 1 Parent(s): 0e40351

first commit, quick app
Files changed (2)
  1. README.md +1 -0
  2. app.py +207 -0
README.md CHANGED
@@ -8,6 +8,7 @@ sdk_version: 5.46.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+hf_oauth: true
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
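Setting `hf_oauth: true` registers the Space as an OAuth app on the Hub; this is what lets `gr.LoginButton` and the `gr.OAuthProfile` argument in app.py below resolve the signed-in user.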
app.py ADDED
@@ -0,0 +1,207 @@
from __future__ import annotations

import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from huggingface_hub import CommitScheduler

# ------------------------------
# Config
# ------------------------------
DATASET_REPO_ID = "hugging-science/dataset-quest-index"
COMMIT_EVERY_MIN = 2

# Local folder where submissions are accumulated before CommitScheduler pushes them
LOCAL_SUBMISSIONS_DIR = Path("submissions")
LOCAL_SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
# Unique file per app instance, so restarts and replicas never clobber each other
LOCAL_FILE = LOCAL_SUBMISSIONS_DIR / f"records_{uuid.uuid4().hex}.jsonl"

scheduler = CommitScheduler(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=LOCAL_SUBMISSIONS_DIR,
    path_in_repo="data",
    every=COMMIT_EVERY_MIN,
)

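# For context: CommitScheduler uploads changed files under folder_path to the
# dataset repo from a background thread every COMMIT_EVERY_MIN minutes, so
# records survive Space restarts. A hedged sketch of reading them back
# (assumes the `datasets` library, which this app does not itself use):
#
#     from datasets import load_dataset
#     ds = load_dataset(
#         "json",
#         data_files="hf://datasets/hugging-science/dataset-quest-index/data/*.jsonl",
#         split="train",
#     )
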
# ------------------------------
# Utilities
# ------------------------------
def _now_iso() -> str:
    # Timezone-aware replacement for the deprecated datetime.utcnow()
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def read_all_records() -> List[Dict[str, Any]]:
    """Read all jsonl records from LOCAL_SUBMISSIONS_DIR into a list."""
    records: List[Dict[str, Any]] = []
    for p in sorted(LOCAL_SUBMISSIONS_DIR.glob("*.jsonl")):
        try:
            with p.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except Exception:
                        # Skip malformed lines
                        pass
        except FileNotFoundError:
            pass
    return records


def append_record(record: Dict[str, Any]) -> None:
    LOCAL_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Hold the scheduler lock so a background commit never uploads a half-written line
    with scheduler.lock:
        with LOCAL_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def filter_records(records: List[Dict[str, Any]], field: str | None, search: str | None) -> List[Dict[str, Any]]:
    def match(rec: Dict[str, Any]) -> bool:
        ok = True
        if field and field != "All":
            ok = ok and (rec.get("field") == field)
        if search:
            s = search.lower()
            hay = " ".join(
                str(rec.get(k, "")) for k in ["dataset_name", "dataset_url", "description", "user", "field"]
            ).lower()
            ok = ok and (s in hay)
        return ok

    return [r for r in records if match(r)]


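# A hedged illustration of the matching semantics (these records are made up):
#
#     records = [
#         {"dataset_name": "The Pile", "field": "NLP", "description": "800GB of text"},
#         {"dataset_name": "LAION-5B", "field": "Multimodal", "description": "image-text pairs"},
#     ]
#     filter_records(records, "NLP", "pile")  # case-insensitive substring hit -> The Pile only
#     filter_records(records, "All", None)    # "All" disables the field check -> both records
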
# ------------------------------
# App logic
# ------------------------------
FIELDS = [
    "NLP",
    "Computer Vision",
    "Audio",
    "Multimodal",
    "Reinforcement Learning",
    "Time Series",
    "Tabular",
    "Other",
]

SIZE_UNITS = ["KB", "MB", "GB", "TB"]


def submit_entry(
    dataset_name: str,
    dataset_url: str,
    description: str,
    size_value: float,
    size_unit: str,
    field: str,
    profile: gr.OAuthProfile | None,
):
    # Basic validation
    errors = []
    if not dataset_name.strip():
        errors.append("Dataset name is required.")
    if not dataset_url.strip() or not dataset_url.startswith(("http://", "https://")):
        errors.append("Dataset URL must be an http(s) link.")
    if size_value is None or size_value < 0:
        errors.append("Approximate size must be a non-negative number.")
    if field not in FIELDS:
        errors.append("Please choose a field.")

    if errors:
        # Show the error notice and leave the table unchanged
        return gr.update(value="Submission failed:\n- " + "\n- ".join(errors), visible=True), gr.update()

    user_display = profile.name if profile else "anonymous"
    # gr.OAuthProfile exposes the handle as `username` (the OIDC preferred_username claim)
    user_handle = profile.username if profile else None

    record = {
        "id": uuid.uuid4().hex,
        "created_at": _now_iso(),
        "dataset_name": dataset_name.strip(),
        "dataset_url": dataset_url.strip(),
        "description": description.strip(),
        "approx_size": float(size_value),
        "size_unit": size_unit,
        "field": field,
        "user": user_handle or user_display,
    }

    append_record(record)
    # Return success notice and refresh table
    ok = f"Thanks, {user_display}. Your entry has been saved locally and will sync to the Hub within ~{COMMIT_EVERY_MIN} minutes."
    updated = read_all_records()
    # Project to a neat table
    rows = [
        [r["dataset_name"], r["dataset_url"], r["description"], f"{r['approx_size']} {r['size_unit']}", r["field"], r["user"], r["created_at"]]
        for r in updated
    ]
    return gr.update(value=ok, visible=True), rows


def refresh_table(field: str, search: str):
    data = read_all_records()
    data = filter_records(data, field, search)
    rows = [
        [r["dataset_name"], r["dataset_url"], r["description"], f"{r['approx_size']} {r['size_unit']}", r["field"], r["user"], r["created_at"]]
        for r in data
    ]
    return rows


# ------------------------------
# UI
# ------------------------------
with gr.Blocks(title="Community Dataset Index", css=".wrap {max-width: 1200px; margin: 0 auto}") as demo:
    gr.Markdown("# Community Dataset Index\nContribute datasets with a short description. Sign in to record your HF username.")
    gr.LoginButton()

    with gr.Row(elem_classes=["wrap"]):
        with gr.Column(scale=1):
            gr.Markdown("### Submit a dataset")
            name = gr.Textbox(label="Dataset name", placeholder="e.g. The Pile")
            url = gr.Textbox(label="Dataset URL (HF, website or paper)", placeholder="https://huggingface.co/datasets/... or https://...")
            desc = gr.Textbox(label="Short description", lines=4)
            with gr.Row():
                size_val = gr.Number(label="Approx. size", minimum=0, value=0)
                size_unit = gr.Dropdown(SIZE_UNITS, value="GB", label="Unit")
            field = gr.Dropdown(FIELDS, label="Field", value="NLP")
            submit = gr.Button("Submit", variant="primary")
            notice = gr.Markdown(visible=False)
        with gr.Column(scale=2):
            gr.Markdown("### Browse & filter")
            with gr.Row():
                field_filter = gr.Dropdown(["All"] + FIELDS, value="All", label="Field filter")
                search = gr.Textbox(label="Search", placeholder="Search name, URL, description, user…")
            refresh = gr.Button("Refresh")
            table = gr.Dataframe(
                headers=["Name", "URL", "Description", "Size", "Field", "User", "Created"],
                datatype=["str", "str", "str", "str", "str", "str", "str"],
                interactive=False,
                wrap=True,
            )

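    # OAuth note: because submit_entry type-hints `profile: gr.OAuthProfile | None`,
    # Gradio fills that argument from the login session (enabled by hf_oauth: true
    # in the README); the profile is never listed in `inputs`. The same pattern in
    # miniature, with a hypothetical handler:
    #
    #     def greet(profile: gr.OAuthProfile | None) -> str:
    #         return f"Hello {profile.username}!" if profile else "Not signed in."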
    # Wire events
    submit.click(
        submit_entry,
        inputs=[name, url, desc, size_val, size_unit, field],
        outputs=[notice, table],
        show_progress="minimal",
    )

    refresh.click(refresh_table, inputs=[field_filter, search], outputs=table)
    field_filter.change(refresh_table, inputs=[field_filter, search], outputs=table)
    search.submit(refresh_table, inputs=[field_filter, search], outputs=table)

    # Populate on launch
    demo.load(lambda: refresh_table("All", ""), inputs=None, outputs=table)


if __name__ == "__main__":
    demo.launch()
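
# Rough sketch of a local run, assuming a cached token with write access to
# DATASET_REPO_ID (CommitScheduler needs it to push commits):
#
#     pip install gradio huggingface_hub
#     huggingface-cli login
#     python app.py   # serves on http://127.0.0.1:7860 by default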