from huggingface_hub import list_datasets, DatasetCard
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError
# ---------- Retry helper ----------
def retry_load_card(dataset_id, retries=5, base_wait=60):
    """
    Try to load a dataset card, retrying if a 429 (rate limit) occurs.
    Uses the Retry-After header if available, otherwise exponential backoff.
    """
    for attempt in range(retries):
        try:
            return DatasetCard.load(dataset_id)
        except HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                wait_time = e.response.headers.get("Retry-After")
                if wait_time is not None:
                    wait_time = int(wait_time)
                else:
                    wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
                print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
                time.sleep(wait_time)
                continue
            else:
                raise  # don't retry other HTTP errors
        except Exception as e:
            print(f"[ERROR] {dataset_id}: {e}")
            raise
    raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
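    """
    Classify a dataset card as "minimal" or "rich".

    A card is "minimal" if it has no YAML metadata, or if its body text is
    shorter than 200 characters; otherwise it is "rich". Returns the category,
    a list of human-readable reasons, and the body word count.
    """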
    reasons = []
    length = len(card_text)
    word_count = len(card_text.split())
    if metadata is None or len(metadata) == 0:
        print(length, word_count, dataset_url)
        if length < 200:
            reasons.append("No metadata and no description")
            return "minimal", reasons, word_count
        else:
            reasons.append("No metadata but has description")
            return "minimal", reasons, word_count
    else:
        if length < 200:
            reasons.append(f"Short description (char count={length}, words={word_count})")
            return "minimal", reasons, word_count
        else:
            return "rich", reasons, word_count
# ---------- Worker function for one dataset ----------
def process_dataset(ds, save_dir):
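    """
    Load one dataset's card, save its README to save_dir, classify it, and
    return a summary row; on any failure, return a placeholder "minimal" row.
    """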
    try:
        card = retry_load_card(ds.id)  # retry on 429 instead of calling DatasetCard.load directly
        card_text = card.text or ""
        metadata = card.data.to_dict() if card.data else {}
        dataset_url = f"https://huggingface.co/datasets/{ds.id}"
        # Save README locally
        readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(card_text)
        category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
        row = {
            "dataset_id": ds.id,
            "dataset_url": dataset_url,
            "downloads": getattr(ds, "downloads", None),
            "author": metadata.get("author", None),
            "license": metadata.get("license", None),
            "tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
            "task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
            # newer huggingface_hub versions expose last_modified; fall back to the older lastModified attribute
            "last_modified": getattr(ds, "last_modified", getattr(ds, "lastModified", None)),
            "reason": "; ".join(reasons),
            "readme_path": readme_path,
            "word_count": word_count,
            "category": category,
        }
        return row
    except Exception as e:
        return {
            "dataset_id": ds.id,
            "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
            "downloads": getattr(ds, "downloads", None),
            "author": None,
            "license": None,
            "tags": None,
            "task_categories": None,
            "last_modified": None,
            "reason": f"Failed to load card: {e}",
            "readme_path": None,
            "word_count": 0,
            "category": "minimal",
        }
# ---------- Main ----------
def collect_dataset_ids(limit=1000, save_dir="dataset_readmes", max_workers=16):
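    """
    Fetch up to `limit` datasets from the Hub (most downloaded first), classify
    each card in a thread pool, and return (minimal_rows, rich_rows).
    """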
    minimal_results = []
    rich_results = []
    os.makedirs(save_dir, exist_ok=True)
    print(f"Fetching up to {limit} datasets (sorted by downloads)...")
    datasets = list_datasets(limit=limit, sort="downloads", direction=-1)
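    # Load and classify cards concurrently; each worker returns one summary row.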
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
        for f in as_completed(futures):
            row = f.result()
            if row["category"] == "minimal":
                minimal_results.append(row)
            else:
                rich_results.append(row)
    return minimal_results, rich_results
if __name__ == "__main__":
    minimal, rich = collect_dataset_ids(limit=100, max_workers=16)
    # Save separate CSV files
    if minimal:
        pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
    if rich:
        pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
print("\nSaved results to:") | |
if minimal: | |
print(" - minimal_dataset_cards.csv") | |
if rich: | |
print(" - rich_dataset_cards.csv") | |
print(" - README files in ./dataset_readmes/") | |
print("\nSummary:") | |
print(f"Minimal: {len(minimal)}") | |
print(f"Rich: {len(rich)}") | |