# Hugging Face Space: hugging-science/dataset-insight-portal (status: Running)
# File size: 10,176 bytes
# Revision: 2ccb279

# from huggingface_hub import list_datasets, DatasetCard
# import re
# import pandas as pd
# import os
# import time
# import random
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from requests.exceptions import HTTPError


# # ---------- Retry helper ----------
# def retry_load_card(dataset_id, retries=5, base_wait=60):
#     """
#     Try to load a dataset card with retries if 429 (rate limit) occurs.
#     Uses Retry-After header if available, otherwise exponential backoff.
#     """
#     for attempt in range(retries):
#         try:
#             return DatasetCard.load(dataset_id)
#         except HTTPError as e:
#             if e.response is not None and e.response.status_code == 429:
#                 wait_time = e.response.headers.get("Retry-After")
#                 if wait_time is not None:
#                     wait_time = int(wait_time)
#                 else:
#                     wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
#                 print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
#                 time.sleep(wait_time)
#                 continue
#             else:
#                 raise  # don't retry for other HTTP errors
#         except Exception as e:
#             print(f"[ERROR] {dataset_id}: {e}")
#             raise
#     raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")


# # ---------- Heuristic functions with reasons ----------
# def check_card_quality(card_text, metadata, dataset_url):
#     reasons = []
#     length = len(card_text)
#     word_count = len(card_text.split())

#     if metadata is None or len(metadata) == 0:
#         print(length, word_count, dataset_url)
#         if length < 200:
#             reasons.append("No metadata and no description")
#             return "minimal", reasons, word_count
#         else:
#             reasons.append("No metadata but has description")
#             return "minimal", reasons, word_count
#     else:
#         if length < 200:
#             reasons.append(f"Short description (char count={length}, words={word_count})")
#             return "minimal", reasons, word_count
#         else:
#             return "rich", reasons, word_count
    
# # ---------- Worker function for one dataset ----------
# def process_dataset(ds, save_dir):
#     try:
#         card = retry_load_card(ds.id)
#         card_text = card.text or ""
#         metadata = card.data.to_dict() if card.data else {}
#         dataset_url = f"https://huggingface.co/datasets/{ds.id}"

#         # Save README locally
#         readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
#         with open(readme_path, "w", encoding="utf-8") as f:
#             f.write(card_text)

#         category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
#         row = {
#             "dataset_id": ds.id,
#             "dataset_url": dataset_url,
#             "downloads": getattr(ds, "downloads", None),
#             "reason": "; ".join(reasons),
#             "readme_path": readme_path,
#             "word_count": word_count,
#             "category": category,
#         }
#         return row
#     except Exception as e:
#         return {
#             "dataset_id": ds.id,
#             "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
#             "downloads": getattr(ds, "downloads", None),
#             "reason": f"Failed to load card",
#             "readme_path": None,
#             "word_count": 0,
#             "category": "minimal",
#         }


# # ---------- Main ----------
# def collect_dataset_ids(limit=1000, save_dir="dataset_readmes", max_workers=16):
#     minimal_results = []
#     rich_results = []

#     os.makedirs(save_dir, exist_ok=True)

#     print(f"Fetching up to {limit} datasets (sorted by downloads)...")
#     datasets = list_datasets()

#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
#         for i, f in enumerate(as_completed(futures), 1):
#             row = f.result()
#             if row["category"] == "minimal":
#                 minimal_results.append(row)
#             else:
#                 rich_results.append(row)
#     return minimal_results, rich_results


# if __name__ == "__main__":
#     minimal, rich = collect_dataset_ids(limit=1000, max_workers=16)

#     # Save separate CSV files
#     if minimal:
#         pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
#     if rich:
#         pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)

#     print("\nSaved results to:")
#     if minimal:
#         print(" - minimal_dataset_cards.csv")
#     if rich:
#         print(" - rich_dataset_cards.csv")
#     print(" - README files in ./dataset_readmes/")

#     print("\nSummary:")
#     print(f"Minimal: {len(minimal)}")
#     print(f"Rich: {len(rich)}")


from huggingface_hub import list_datasets, DatasetCard
import re
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError


# # ---------- Retry helper ----------
# def retry_load_card(dataset_id, retries=5, base_wait=60):
#     for attempt in range(retries):
#         try:
#             return DatasetCard.load(dataset_id)
#         except HTTPError as e:
#             if e.response is not None and e.response.status_code == 429:
#                 wait_time = e.response.headers.get("Retry-After")
#                 if wait_time is not None:
#                     wait_time = int(wait_time)
#                 else:
#                     wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
#                 print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
#                 time.sleep(wait_time)
#                 continue
#             else:
#                 raise
#         except Exception as e:
#             print(f"[ERROR] {dataset_id}: {e}")
#             raise
#     raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")


# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
    """Classify a dataset card as "minimal" or "rich".

    A card is "rich" only when it has non-empty metadata and its text is at
    least 200 characters long; every other combination is "minimal".

    Args:
        card_text: Body text of the dataset card.
        metadata: Card metadata mapping; ``None`` or empty counts as missing.
        dataset_url: URL of the dataset, used only in the diagnostic print
            emitted for cards without metadata.

    Returns:
        A ``(category, reasons, word_count)`` tuple, where ``reasons`` is a
        list of human-readable explanations (empty for "rich" cards) and
        ``word_count`` is the number of whitespace-separated tokens.
    """
    char_total = len(card_text)
    word_count = len(card_text.split())
    too_short = char_total < 200
    has_metadata = metadata is not None and len(metadata) > 0

    if has_metadata:
        if not too_short:
            return "rich", [], word_count
        note = f"Short description (char count={char_total}, words={word_count})"
        return "minimal", [note], word_count

    # Cards without metadata are always "minimal"; log their size for inspection.
    print(char_total, word_count, dataset_url)
    if too_short:
        note = "No metadata and no description"
    else:
        note = "No metadata but has description"
    return "minimal", [note], word_count
    

# ---------- Worker function for one dataset ----------
def _join_values(value):
    """Render a card metadata field as a comma-separated string (None if empty)."""
    if not value:
        return None
    if isinstance(value, str):
        # A bare string must not be joined character by character.
        return value
    if isinstance(value, (list, tuple, set)):
        # str() guards against non-string entries (e.g. numbers in YAML).
        return ", ".join(str(item) for item in value)
    return str(value)


def _last_modified(ds):
    """Return the dataset's last-modified value under either attribute name."""
    # Newer huggingface_hub releases are expected to expose `last_modified`;
    # older ones used `lastModified`. Try the new name first, then the legacy one.
    value = getattr(ds, "last_modified", None)
    if value is None:
        value = getattr(ds, "lastModified", None)
    return value


def process_dataset(ds, save_dir):
    """Load one dataset card, save its README, and build a result row.

    This is a best-effort worker: any failure (loading the card, writing the
    README, classifying it) is reported on stdout and turned into a
    "minimal" row instead of propagating, so one bad dataset cannot abort
    the whole crawl.

    Args:
        ds: Dataset listing entry; must expose ``id`` and may expose
            ``downloads``, ``author`` and ``last_modified``/``lastModified``.
        save_dir: Existing directory where ``<owner>__<name>_README.md`` is
            written.

    Returns:
        A dict with the keys ``dataset_id``, ``dataset_url``, ``downloads``,
        ``author``, ``license``, ``tags``, ``task_categories``,
        ``last_modified``, ``reason``, ``readme_path``, ``word_count`` and
        ``category``.
    """
    dataset_url = f"https://huggingface.co/datasets/{ds.id}"
    try:
        card = DatasetCard.load(ds.id)
        card_text = card.text or ""
        metadata = card.data.to_dict() if card.data else {}

        # Save README locally; '/' in the id is not valid inside a file name.
        readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(card_text)

        category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)

        return {
            "dataset_id": ds.id,
            "dataset_url": dataset_url,
            "downloads": getattr(ds, "downloads", None),
            # Card YAML rarely carries an author; fall back to the listing entry.
            "author": metadata.get("author") or getattr(ds, "author", None),
            "license": metadata.get("license", None),
            "tags": _join_values(metadata.get("tags")),
            "task_categories": _join_values(metadata.get("task_categories")),
            "last_modified": _last_modified(ds),
            "reason": "; ".join(reasons),
            "readme_path": readme_path,
            "word_count": word_count,
            "category": category,
        }
    except Exception as e:
        # Deliberately broad: report the cause, then degrade to a "minimal" row.
        print(f"[ERROR] {ds.id}: {e}")
        return {
            "dataset_id": ds.id,
            "dataset_url": dataset_url,
            "downloads": getattr(ds, "downloads", None),
            "author": getattr(ds, "author", None),
            "license": None,
            "tags": None,
            "task_categories": None,
            "last_modified": _last_modified(ds),
            "reason": "Failed to load card",
            "readme_path": None,
            "word_count": 0,
            "category": "minimal",
        }


# ---------- Main ----------
def collect_dataset_ids(save_dir="dataset_readmes", max_workers=16, limit=None):
    """Process Hub datasets concurrently and split the rows by card quality.

    Args:
        save_dir: Directory for the downloaded README files; created if it
            does not exist.
        max_workers: Size of the thread pool used to process datasets.
        limit: Maximum number of datasets to request from the Hub listing;
            ``None`` requests the full listing.

    Returns:
        A ``(minimal_results, rich_results)`` tuple of row dicts as produced
        by ``process_dataset``, in completion order.
    """
    minimal_results = []
    rich_results = []

    os.makedirs(save_dir, exist_ok=True)

    datasets = list_datasets(limit=limit)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
        for future in as_completed(futures):
            row = future.result()
            if row["category"] == "minimal":
                minimal_results.append(row)
            else:
                rich_results.append(row)
    return minimal_results, rich_results


if __name__ == "__main__":
    # Single source of truth for the output names, so the summary printed
    # below always matches the files actually written.
    minimal_csv = "all_minimal_dataset_cards.csv"
    rich_csv = "all_rich_dataset_cards.csv"

    minimal, rich = collect_dataset_ids(limit=100, max_workers=16)

    # Save separate CSV files (an empty category produces no file)
    if minimal:
        pd.DataFrame(minimal).to_csv(minimal_csv, index=False)
    if rich:
        pd.DataFrame(rich).to_csv(rich_csv, index=False)

    print("\nSaved results to:")
    if minimal:
        print(f" - {minimal_csv}")
    if rich:
        print(f" - {rich_csv}")
    print(" - README files in ./dataset_readmes/")

    print("\nSummary:")
    print(f"Minimal: {len(minimal)}")
    print(f"Rich: {len(rich)}")