# dataset-insight-portal / heuristic_approach.py
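"""
Scan Hugging Face Hub dataset cards and classify each one as "minimal" or
"rich" using simple length/metadata heuristics. Every card's README text is
saved under ./dataset_readmes/, and the classification results are written to
all_minimal_dataset_cards.csv and all_rich_dataset_cards.csv.
"""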
from huggingface_hub import list_datasets, DatasetCard
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError
# ---------- Retry helper ----------
def retry_load_card(dataset_id, retries=5, base_wait=60):
    """
    Load a dataset card, retrying when the Hub returns HTTP 429 (rate limit).
    Honors the Retry-After header when present, otherwise falls back to
    exponential backoff with a small random jitter.
    """
    for attempt in range(retries):
        try:
            return DatasetCard.load(dataset_id)
        except HTTPError as e:
            # huggingface_hub raises HfHubHTTPError, a subclass of
            # requests.exceptions.HTTPError, so this catch applies.
            if e.response is not None and e.response.status_code == 429:
                wait_time = e.response.headers.get("Retry-After")
                if wait_time is not None:
                    wait_time = int(wait_time)
                else:
                    wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
                print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
                time.sleep(wait_time)
                continue
            else:
                raise  # don't retry other HTTP errors
        except Exception as e:
            print(f"[ERROR] {dataset_id}: {e}")
            raise
    raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
    """Classify a dataset card as 'minimal' or 'rich' and record the reasons."""
    reasons = []
    length = len(card_text)
    word_count = len(card_text.split())
    if metadata is None or len(metadata) == 0:
        print(f"[no metadata] chars={length}, words={word_count}, url={dataset_url}")
        if length < 200:
            reasons.append("No metadata and no description")
        else:
            reasons.append("No metadata but has description")
        return "minimal", reasons, word_count
    else:
        if length < 200:
            reasons.append(f"Short description (char count={length}, words={word_count})")
            return "minimal", reasons, word_count
        else:
            return "rich", reasons, word_count
# ---------- Worker function for one dataset ----------
def process_dataset(ds, save_dir):
try:
        card = retry_load_card(ds.id)  # use the retry helper to survive 429 rate limits
card_text = card.text or ""
metadata = card.data.to_dict() if card.data else {}
dataset_url = f"https://huggingface.co/datasets/{ds.id}"
# Save README locally
readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
with open(readme_path, "w", encoding="utf-8") as f:
f.write(card_text)
category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
row = {
"dataset_id": ds.id,
"dataset_url": dataset_url,
"downloads": getattr(ds, "downloads", None),
"author": metadata.get("author", None),
"license": metadata.get("license", None),
"tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
"task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
"last_modified": getattr(ds, "lastModified", None),
"reason": "; ".join(reasons),
"readme_path": readme_path,
"word_count": word_count,
"category": category,
}
return row
except Exception as e:
return {
"dataset_id": ds.id,
"dataset_url": f"https://huggingface.co/datasets/{ds.id}",
"downloads": getattr(ds, "downloads", None),
"author": None,
"license": None,
"tags": None,
"task_categories": None,
"last_modified": None,
"reason": "Failed to load card",
"readme_path": None,
"word_count": 0,
"category": "minimal",
}
# ---------- Main ----------
def collect_dataset_ids(limit=1000, save_dir="dataset_readmes", max_workers=16):
minimal_results = []
rich_results = []
os.makedirs(save_dir, exist_ok=True)
    print(f"Fetching up to {limit} datasets (sorted by downloads)...")
    datasets = list_datasets(sort="downloads", direction=-1, limit=limit)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
for i, f in enumerate(as_completed(futures), 1):
row = f.result()
if row["category"] == "minimal":
minimal_results.append(row)
else:
rich_results.append(row)
return minimal_results, rich_results
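# Example (programmatic use) -- a minimal sketch using the parameters defined
# above; adjust `limit` and `max_workers` as needed:
#   minimal, rich = collect_dataset_ids(limit=50, max_workers=8)
#   print(len(minimal), len(rich))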
if __name__ == "__main__":
minimal, rich = collect_dataset_ids(limit=100, max_workers=16)
# Save separate CSV files
if minimal:
pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
if rich:
pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
print("\nSaved results to:")
if minimal:
print(" - minimal_dataset_cards.csv")
if rich:
print(" - rich_dataset_cards.csv")
print(" - README files in ./dataset_readmes/")
print("\nSummary:")
print(f"Minimal: {len(minimal)}")
print(f"Rich: {len(rich)}")