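"""
Collect Hugging Face dataset cards, save each README locally, and classify every
card as "minimal" or "rich" using simple length and metadata heuristics.
Results are written to two CSV files (one for minimal cards, one for rich cards)
plus a local directory of README files.
"""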
from huggingface_hub import list_datasets, DatasetCard
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError


# ---------- Retry helper ----------
def retry_load_card(dataset_id, retries=5, base_wait=60):
    """
    Try to load a dataset card, retrying when a 429 (rate limit) occurs.
    Uses the Retry-After header if available, otherwise exponential backoff.
    """
    for attempt in range(retries):
        try:
            return DatasetCard.load(dataset_id)
        except HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                wait_time = e.response.headers.get("Retry-After")
                if wait_time is not None:
                    wait_time = int(wait_time)
                else:
                    wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
                print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
                time.sleep(wait_time)
                continue
            else:
                raise  # don't retry for other HTTP errors
        except Exception as e:
            print(f"[ERROR] {dataset_id}: {e}")
            raise
    raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")


# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
    """Classify a dataset card as 'minimal' or 'rich' and record the reasons."""
    reasons = []
    length = len(card_text)
    word_count = len(card_text.split())
    if metadata is None or len(metadata) == 0:
        # Log cards that have no YAML metadata at all.
        print(length, word_count, dataset_url)
        if length < 200:
            reasons.append("No metadata and no description")
        else:
            reasons.append("No metadata but has description")
        return "minimal", reasons, word_count
    else:
        if length < 200:
            reasons.append(f"Short description (char count={length}, words={word_count})")
            return "minimal", reasons, word_count
        else:
            return "rich", reasons, word_count


# ---------- Worker function for one dataset ----------
def process_dataset(ds, save_dir):
    """Load one dataset card, save its README locally, and return a summary row."""
    try:
        card = retry_load_card(ds.id)
        card_text = card.text or ""
        metadata = card.data.to_dict() if card.data else {}
        dataset_url = f"https://huggingface.co/datasets/{ds.id}"

        # Save README locally
        readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(card_text)

        category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
        row = {
            "dataset_id": ds.id,
            "dataset_url": dataset_url,
            "downloads": getattr(ds, "downloads", None),
            "author": metadata.get("author", None),
            "license": metadata.get("license", None),
            "tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
            "task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
            "last_modified": getattr(ds, "lastModified", None),
            "reason": "; ".join(reasons),
            "readme_path": readme_path,
            "word_count": word_count,
            "category": category,
        }
        return row
    except Exception as e:
        return {
            "dataset_id": ds.id,
            "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
            "downloads": getattr(ds, "downloads", None),
            "author": None,
            "license": None,
            "tags": None,
            "task_categories": None,
            "last_modified": None,
            "reason": f"Failed to load card: {e}",
            "readme_path": None,
            "word_count": 0,
            "category": "minimal",
        }
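# A minimal, hypothetical smoke test for process_dataset before launching the
# full threaded run. The dataset id "squad" and the SimpleNamespace stand-in
# for a DatasetInfo object are illustrative assumptions, not part of the
# pipeline above. Uncomment to try a single card:
#
# from types import SimpleNamespace
# os.makedirs("dataset_readmes", exist_ok=True)
# sample = SimpleNamespace(id="squad", downloads=None, lastModified=None)
# print(process_dataset(sample, "dataset_readmes"))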
"Failed to load card", "readme_path": None, "word_count": 0, "category": "minimal", } # ---------- Main ---------- def collect_dataset_ids(save_dir="dataset_readmes", max_workers=16): minimal_results = [] rich_results = [] os.makedirs(save_dir, exist_ok=True) datasets = list_datasets() with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets] for i, f in enumerate(as_completed(futures), 1): row = f.result() if row["category"] == "minimal": minimal_results.append(row) else: rich_results.append(row) return minimal_results, rich_results if __name__ == "__main__": minimal, rich = collect_dataset_ids(limit=100, max_workers=16) # Save separate CSV files if minimal: pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False) if rich: pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False) print("\nSaved results to:") if minimal: print(" - minimal_dataset_cards.csv") if rich: print(" - rich_dataset_cards.csv") print(" - README files in ./dataset_readmes/") print("\nSummary:") print(f"Minimal: {len(minimal)}") print(f"Rich: {len(rich)}")