from __future__ import annotations

import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from huggingface_hub import CommitScheduler

# ------------------------------
# Config
# ------------------------------
DATASET_REPO_ID = "hugging-science/dataset-quest-index"
COMMIT_EVERY_MIN = 2
LOCAL_SUBMISSIONS_DIR = Path("submissions")
LOCAL_SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

# Each app instance writes to its own file, so concurrent replicas never
# clobber each other's submissions.
LOCAL_FILE = LOCAL_SUBMISSIONS_DIR / f"records_{uuid.uuid4().hex}.jsonl"

# Background scheduler that pushes the local submissions folder to the
# dataset repo every COMMIT_EVERY_MIN minutes.
scheduler = CommitScheduler(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=LOCAL_SUBMISSIONS_DIR,
    path_in_repo="data",
    every=COMMIT_EVERY_MIN,
)

# ------------------------------
# Utilities
# ------------------------------
def _now_iso() -> str:
    # datetime.utcnow() is deprecated; use an aware UTC timestamp instead.
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def read_all_records() -> List[Dict[str, Any]]:
    """Read every JSONL record written so far, skipping malformed lines."""
    records: List[Dict[str, Any]] = []
    for p in sorted(LOCAL_SUBMISSIONS_DIR.glob("*.jsonl")):
        try:
            with p.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        pass  # skip corrupt lines rather than failing the whole read
        except FileNotFoundError:
            pass  # file may have been removed between glob() and open()
    return records


def append_record(record: Dict[str, Any]) -> None:
    # Hold the scheduler's lock so an append never races with an in-flight commit.
    with scheduler.lock:
        with LOCAL_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def filter_records(
    records: List[Dict[str, Any]], field: str | None, search: str | None
) -> List[Dict[str, Any]]:
    def match(rec: Dict[str, Any]) -> bool:
        ok = True
        # The field filter is free text, so compare case-insensitively;
        # blank (or "All") means no field filtering.
        if field and field.strip() and field.strip().lower() != "all":
            ok = ok and (rec.get("field", "").strip().lower() == field.strip().lower())
        if search:
            s = search.lower()
            hay = " ".join(
                str(rec.get(k, ""))
                for k in ["dataset_name", "dataset_url", "description", "user", "field"]
            ).lower()
            ok = ok and (s in hay)
        return ok

    return [r for r in records if match(r)]


def _to_rows(records: List[Dict[str, Any]]) -> List[List[str]]:
    """Shared row builder; renders the URL as a link for the table's html column."""
    return [
        [
            r["dataset_name"],
            f'<a href="{r["dataset_url"]}" target="_blank">{r["dataset_url"]}</a>',
            r["description"],
            f"{r['approx_size']} {r['size_unit']}",
            r["field"],
            r["user"],
            r["created_at"],
        ]
        for r in records
    ]


# ------------------------------
# App logic
# ------------------------------
SIZE_UNITS = ["KB", "MB", "GB", "TB"]


def submit_entry(
    dataset_name: str,
    dataset_url: str,
    description: str,
    size_value: float,
    size_unit: str,
    field: str,
    profile: gr.OAuthProfile | None,
):
    errors = []
    if not dataset_name.strip():
        errors.append("Dataset name is required.")
    if not dataset_url.strip() or not dataset_url.strip().startswith(("http://", "https://")):
        errors.append("Dataset URL must be an http(s) link.")
    if size_value is None or size_value < 0:
        errors.append("Approximate size must be a non-negative number.")
    if not field.strip():
        errors.append("Please provide a field.")

    # Reject duplicates by URL or name (case-insensitive).
    existing_records = read_all_records()
    for record in existing_records:
        if record.get("dataset_url", "").strip().lower() == dataset_url.strip().lower():
            errors.append(f"Dataset URL already exists: {record.get('dataset_url')}")
        if record.get("dataset_name", "").strip().lower() == dataset_name.strip().lower():
            errors.append(f"Dataset name already exists: {record.get('dataset_name')}")

    if errors:
        # Show the errors and leave the table untouched on a failed submission.
        return (
            gr.update(value="Submission failed:\n\n- " + "\n- ".join(errors), visible=True),
            gr.update(),
        )

    user_display = profile.name if profile else "anonymous"
    # gr.OAuthProfile exposes the HF handle as `username`.
    user_handle = getattr(profile, "username", None) if profile else None

    record = {
        "id": uuid.uuid4().hex,
        "created_at": _now_iso(),
        "dataset_name": dataset_name.strip(),
        "dataset_url": dataset_url.strip(),
        "description": description.strip(),
        "approx_size": float(size_value),
        "size_unit": size_unit,
        "field": field.strip(),
        "user": user_handle or user_display,
    }
    append_record(record)
    ok = (
        f"Thanks, {user_display}. Your entry has been saved locally and will sync "
        f"to the Hub within ~{COMMIT_EVERY_MIN} minutes."
    )
    return gr.update(value=ok, visible=True), _to_rows(read_all_records())


def refresh_table(field: str, search: str):
    data = filter_records(read_all_records(), field, search)
    return _to_rows(data)


# ------------------------------
# UI
# ------------------------------
with gr.Blocks(title="Community Dataset Index", css=".wrap {max-width: 1200px; margin: 0 auto}") as demo:
    gr.Markdown(
        "# Community Dataset Index\n"
        "Contribute datasets with a short description. Sign in to record your HF username."
    )
    gr.LoginButton()

    with gr.Row(elem_classes=["wrap"]):
        with gr.Column(scale=1):
            gr.Markdown("### Submit a dataset")
            name = gr.Textbox(label="Dataset name", placeholder="e.g. The Pile")
            url = gr.Textbox(
                label="Dataset URL (HF, website or paper)",
                placeholder="https://huggingface.co/datasets/... or https://...",
            )
            desc = gr.Textbox(label="Short description", lines=4)
            with gr.Row():
                size_val = gr.Number(label="Approx. size", minimum=0, value=0)
                size_unit = gr.Dropdown(SIZE_UNITS, value="GB", label="Unit")
            field = gr.Textbox(label="Field (e.g. PDEs, multi-omics, single-cell, catalysts, etc.)")
            submit = gr.Button("Submit", variant="primary")
            notice = gr.Markdown(visible=False)

        with gr.Column(scale=2):
            gr.Markdown("### Browse & filter")
            with gr.Row():
                field_filter = gr.Textbox(label="Field filter (leave blank for all)")
                search = gr.Textbox(label="Search", placeholder="Search name, URL, description, user…")
                refresh = gr.Button("Refresh")
            table = gr.Dataframe(
                headers=["Name", "URL", "Description", "Size", "Field", "User", "Created"],
                datatype=["str", "html", "str", "str", "str", "str", "str"],
                interactive=False,
                wrap=True,
            )

    # `profile` is injected automatically by Gradio because submit_entry is
    # type-hinted with gr.OAuthProfile, so it is not listed in `inputs`.
    submit.click(
        submit_entry,
        inputs=[name, url, desc, size_val, size_unit, field],
        outputs=[notice, table],
        show_progress="minimal",
    )
    refresh.click(refresh_table, inputs=[field_filter, search], outputs=table)
    field_filter.change(refresh_table, inputs=[field_filter, search], outputs=table)
    search.submit(refresh_table, inputs=[field_filter, search], outputs=table)
    demo.load(lambda: refresh_table("", ""), inputs=None, outputs=table)

if __name__ == "__main__":
    demo.launch()
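

# Consumer-side sketch (not wired into the UI): once the scheduler has pushed
# at least one commit, the JSONL files land flat under "data/" in the dataset
# repo (per path_in_repo above), so they can be loaded back with the `datasets`
# library. The glob pattern below is an assumption based on that layout.
def load_synced_submissions():
    """Load the synced submissions back from the Hub as a single split."""
    from datasets import load_dataset  # local import: optional dependency here

    # The json builder is inferred from the .jsonl extension.
    return load_dataset(DATASET_REPO_ID, data_files="data/*.jsonl", split="train")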