from __future__ import annotations

import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from huggingface_hub import CommitScheduler, snapshot_download

# Hub dataset that collects all submissions, and how often local changes are pushed.
DATASET_REPO_ID = "hugging-science/dataset-quest-index"
COMMIT_EVERY_MIN = 2

# Each app instance writes to its own uniquely named JSONL file so that
# concurrent instances never overwrite each other's submissions.
LOCAL_SUBMISSIONS_DIR = Path("submissions")
LOCAL_SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_FILE = LOCAL_SUBMISSIONS_DIR / f"records_{uuid.uuid4().hex}.jsonl"
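
# CommitScheduler pushes everything under `folder_path` to the dataset repo in
# a background thread every `every` minutes. This assumes write credentials are
# available (e.g. a cached `huggingface-cli login` or an HF_TOKEN secret).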
scheduler = CommitScheduler(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=LOCAL_SUBMISSIONS_DIR,
    path_in_repo="data",
    every=COMMIT_EVERY_MIN,
)


def _now_iso() -> str:
    """Current UTC time as an ISO-8601 string with a trailing 'Z'."""
    # datetime.utcnow() is deprecated; use an aware datetime instead.
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def read_all_records() -> List[Dict[str, Any]]:
    """Return all submissions, reading local JSONL files first and falling
    back to a snapshot of the Hub dataset when none exist locally yet."""
    records: List[Dict[str, Any]] = []

    local_files = sorted(LOCAL_SUBMISSIONS_DIR.glob("*.jsonl"))
    sources = list(local_files)

    if not sources:
        try:
            snap_dir = Path(snapshot_download(
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                allow_patterns="data/*.jsonl",
            ))
            hub_data_dir = snap_dir / "data"
            sources = sorted(hub_data_dir.glob("*.jsonl"))
        except Exception:
            # The repo may not exist yet or the network may be unavailable.
            sources = []

    for p in sources:
        try:
            with p.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except Exception:
                        # Skip malformed lines rather than failing the whole read.
                        pass
        except FileNotFoundError:
            pass
    return records


def append_record(record: Dict[str, Any]) -> None:
    LOCAL_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Hold the scheduler lock while writing so a background commit never
    # uploads a half-written line.
    with scheduler.lock:
        with LOCAL_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
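
# One stored line looks like this (values are illustrative only):
# {"id": "9f1c...", "created_at": "2024-01-01T12:00:00Z",
#  "dataset_name": "The Pile", "dataset_url": "https://huggingface.co/datasets/...",
#  "description": "...", "approx_size": 825.0, "size_unit": "GB",
#  "field": "NLP", "user": "someuser"}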


def filter_records(records: List[Dict[str, Any]], field: str | None, search: str | None) -> List[Dict[str, Any]]:
    """Keep records whose field equals `field` (blank or "All" matches every
    field) and whose text contains `search` (case-insensitive)."""

    def match(rec: Dict[str, Any]) -> bool:
        ok = True
        if field and field != "All":
            ok = ok and (rec.get("field") == field)
        if search:
            s = search.lower()
            hay = " ".join(
                str(rec.get(k, "")) for k in ["dataset_name", "dataset_url", "description", "user", "field"]
            ).lower()
            ok = ok and (s in hay)
        return ok

    return [r for r in records if match(r)]
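
# e.g. filter_records(records, "PDEs", "burgers") keeps records whose field is
# exactly "PDEs" and whose name/URL/description/user/field mention "burgers"
# (both example values are hypothetical).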


SIZE_UNITS = ["KB", "MB", "GB", "TB"]


def record_to_row(r: Dict[str, Any]) -> List[str]:
    """Flatten a record into a display row; the URL becomes an HTML link."""
    return [
        r["dataset_name"],
        f'<a href="{r["dataset_url"]}" target="_blank">{r["dataset_url"]}</a>',
        r["description"],
        f"{r['approx_size']} {r['size_unit']}",
        r["field"],
        r["user"],
        r["created_at"],
    ]


def submit_entry(
    dataset_name: str,
    dataset_url: str,
    description: str,
    size_value: float,
    size_unit: str,
    field: str,
    profile: gr.OAuthProfile | None,
):
    errors = []
    if not dataset_name.strip():
        errors.append("Dataset name is required.")
    if not dataset_url.strip() or not dataset_url.strip().startswith(("http://", "https://")):
        errors.append("Dataset URL must be an http(s) link.")
    if size_value is None or size_value < 0:
        errors.append("Approximate size must be a non-negative number.")
    if not field.strip():
        errors.append("Please provide a field.")

    # Reject duplicate URLs or names (case-insensitive).
    existing_records = read_all_records()
    for record in existing_records:
        if record.get("dataset_url", "").strip().lower() == dataset_url.strip().lower():
            errors.append(f"Dataset URL already exists: {record.get('dataset_url')}")
        if record.get("dataset_name", "").strip().lower() == dataset_name.strip().lower():
            errors.append(f"Dataset name already exists: {record.get('dataset_name')}")

    if errors:
        # Show the errors and leave the table unchanged.
        return gr.update(value="Submission failed:\n- " + "\n- ".join(errors), visible=True), gr.update()

    user_display = profile.name if profile else "anonymous"
    user_handle = profile.username if profile else None

    record = {
        "id": uuid.uuid4().hex,
        "created_at": _now_iso(),
        "dataset_name": dataset_name.strip(),
        "dataset_url": dataset_url.strip(),
        "description": description.strip(),
        "approx_size": float(size_value),
        "size_unit": size_unit,
        "field": field.strip(),
        "user": user_handle or user_display,
    }

    append_record(record)
    ok = f"Thanks, {user_display}. Your entry has been saved locally and will sync to the Hub within ~{COMMIT_EVERY_MIN} minutes."
    updated = read_all_records()
    return gr.update(value=ok, visible=True), [record_to_row(r) for r in updated]


def refresh_table(field: str, search: str):
    data = filter_records(read_all_records(), field, search)
    return [record_to_row(r) for r in data]


with gr.Blocks(title="Community Dataset Index", css=".wrap {margin: 0 auto}", fill_width=True) as demo:
    gr.Markdown("# Community Dataset Index\nContribute datasets with a short description. Sign in to record your HF username.")
    gr.LoginButton()

    with gr.Row(elem_classes=["wrap"]):
        with gr.Column(scale=1):
            gr.Markdown("### Submit a dataset")
            name = gr.Textbox(label="Dataset name", placeholder="e.g. The Pile")
            url = gr.Textbox(label="Dataset URL (HF, website or paper)", placeholder="https://huggingface.co/datasets/... or https://...")
            desc = gr.Textbox(label="Short description", lines=4)
            with gr.Row():
                size_val = gr.Number(label="Approx. size", minimum=0, value=0)
                size_unit = gr.Dropdown(SIZE_UNITS, value="GB", label="Unit")
            field = gr.Textbox(label="Field (e.g. PDEs, multi-omics, single-cell, catalysts, etc.)")
            submit = gr.Button("Submit", variant="primary")
            notice = gr.Markdown(visible=False)
        with gr.Column(scale=2):
            gr.Markdown("### Browse & filter")
            with gr.Row():
                field_filter = gr.Textbox(label="Field filter (leave blank for all)")
                search = gr.Textbox(label="Search", placeholder="Search name, URL, description, user…")
                refresh = gr.Button("Refresh")
            table = gr.Dataframe(
                headers=["Name", "URL", "Description", "Size", "Field", "User", "Created"],
                datatype=["str", "html", "str", "str", "str", "str", "str"],
                interactive=False,
                wrap=True,
                show_fullscreen_button=True,
            )
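
    # Gradio injects the `gr.OAuthProfile` argument into `submit_entry`
    # automatically when a LoginButton is present, so `profile` is not
    # listed in `inputs` below.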
    submit.click(
        submit_entry,
        inputs=[name, url, desc, size_val, size_unit, field],
        outputs=[notice, table],
        show_progress="minimal",
    )

    refresh.click(refresh_table, inputs=[field_filter, search], outputs=table)
    field_filter.change(refresh_table, inputs=[field_filter, search], outputs=table)
    search.submit(refresh_table, inputs=[field_filter, search], outputs=table)

    # Populate the table on first page load.
    demo.load(lambda: refresh_table("", ""), inputs=None, outputs=table)


if __name__ == "__main__":
    demo.launch(ssr_mode=False)
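
# Run locally with `python app.py` (filename is an assumption). Passing
# ssr_mode=False disables Gradio's server-side rendering.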