# dataset-insight-portal / heuristic_approach.py
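"""
Scan Hugging Face Hub dataset cards and classify each one as "minimal" or
"rich" using simple length/metadata heuristics. Every card's README text is
saved under ./dataset_readmes/, and the classification results are written to
all_minimal_dataset_cards.csv and all_rich_dataset_cards.csv.
"""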
from huggingface_hub import list_datasets, DatasetCard
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError
# ---------- Retry helper ----------
def retry_load_card(dataset_id, retries=5, base_wait=60):
    """
    Load a dataset card, retrying when the Hub returns HTTP 429 (rate limit).
    Honors the Retry-After header when present, otherwise falls back to
    exponential backoff with a small random jitter.
    """
    for attempt in range(retries):
        try:
            return DatasetCard.load(dataset_id)
        except HTTPError as e:
            # huggingface_hub raises HfHubHTTPError, a subclass of
            # requests.exceptions.HTTPError, so this catch applies.
            if e.response is not None and e.response.status_code == 429:
                wait_time = e.response.headers.get("Retry-After")
                if wait_time is not None:
                    wait_time = int(wait_time)
                else:
                    wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
                print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
                time.sleep(wait_time)
                continue
            else:
                raise  # don't retry other HTTP errors
        except Exception as e:
            print(f"[ERROR] {dataset_id}: {e}")
            raise
    raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
    """Classify a dataset card as 'minimal' or 'rich' and record the reasons."""
    reasons = []
    length = len(card_text)
    word_count = len(card_text.split())
    if metadata is None or len(metadata) == 0:
        print(f"[no metadata] chars={length}, words={word_count}, url={dataset_url}")
        if length < 200:
            reasons.append("No metadata and no description")
        else:
            reasons.append("No metadata but has description")
        return "minimal", reasons, word_count
    else:
        if length < 200:
            reasons.append(f"Short description (char count={length}, words={word_count})")
            return "minimal", reasons, word_count
        else:
            return "rich", reasons, word_count
# ---------- Worker function for one dataset ----------
def process_dataset(ds, save_dir):
try:
        card = retry_load_card(ds.id)  # use the retry helper to survive 429 rate limits
card_text = card.text or ""
metadata = card.data.to_dict() if card.data else {}
dataset_url = f"https://huggingface.co/datasets/{ds.id}"
# Save README locally
readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
with open(readme_path, "w", encoding="utf-8") as f:
f.write(card_text)
category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
row = {
"dataset_id": ds.id,
"dataset_url": dataset_url,
"downloads": getattr(ds, "downloads", None),
"author": metadata.get("author", None),
"license": metadata.get("license", None),
"tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
"task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
"last_modified": getattr(ds, "lastModified", None),
"reason": "; ".join(reasons),
"readme_path": readme_path,
"word_count": word_count,
"category": category,
}
return row
except Exception as e:
return {
"dataset_id": ds.id,
"dataset_url": f"https://huggingface.co/datasets/{ds.id}",
"downloads": getattr(ds, "downloads", None),
"author": None,
"license": None,
"tags": None,
"task_categories": None,
"last_modified": None,
"reason": "Failed to load card",
"readme_path": None,
"word_count": 0,
"category": "minimal",
}
# ---------- Main ----------
def collect_dataset_ids(limit=1000, save_dir="dataset_readmes", max_workers=16):
minimal_results = []
rich_results = []
os.makedirs(save_dir, exist_ok=True)
    print(f"Fetching up to {limit} datasets (sorted by downloads)...")
    datasets = list_datasets(sort="downloads", direction=-1, limit=limit)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
for i, f in enumerate(as_completed(futures), 1):
row = f.result()
if row["category"] == "minimal":
minimal_results.append(row)
else:
rich_results.append(row)
return minimal_results, rich_results
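# Example (programmatic use) -- a minimal sketch using the parameters defined
# above; adjust `limit` and `max_workers` as needed:
#   minimal, rich = collect_dataset_ids(limit=50, max_workers=8)
#   print(len(minimal), len(rich))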
if __name__ == "__main__":
minimal, rich = collect_dataset_ids(limit=100, max_workers=16)
# Save separate CSV files
if minimal:
pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
if rich:
pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
print("\nSaved results to:")
if minimal:
print(" - minimal_dataset_cards.csv")
if rich:
print(" - rich_dataset_cards.csv")
print(" - README files in ./dataset_readmes/")
print("\nSummary:")
print(f"Minimal: {len(minimal)}")
print(f"Rich: {len(rich)}")