"""Scrape README content and metadata for popular Hugging Face models into per-model JSON files."""

import os
import time
import re
import json
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from tqdm import tqdm
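
# Directory where one JSON file per model is written.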
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)
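
# Number of worker threads fetching READMEs concurrently.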
NUM_WORKERS = 4
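
# Pause before each README download, to stay polite to the Hub.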
DOWNLOAD_DELAY_SECONDS = 0.2


def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, images, and links."""
    if not text:
        return ""

    # Remove fenced code blocks first so their contents don't leak into later passes.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code spans.
    text = re.sub(r'`[^`]+`', '', text)
    # Remove images before converting links: the link pattern below would
    # otherwise match the [alt](url) part of an image and leave a stray "!alt".
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Replace markdown links with their anchor text.
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)
    # Drop bare URLs.
    text = re.sub(r'https?://\S+', '', text)
    # Collapse all whitespace runs into single spaces.
    text = ' '.join(text.split())
    return text
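
# Example: clean_readme_content("See [docs](https://x.y) and ![logo](a.png)")
# returns "See docs and".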


# Cache file for the fetched model list so repeated runs skip the API call.
MODELS_CACHE_FILE = "models_list_cache.pkl"


def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None

    # Try the local cache first; a failed load falls through to the API.
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None

    if models_list is None:
        print(f"Fetching models with at least {min_downloads} downloads from the API...")
        try:
            api = HfApi()
            # Sorting by downloads (descending) lets the filter below stop at
            # the first model under the threshold instead of scanning everything.
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            models_list = list(models_iterator)
            print(f"Fetched {len(models_list)} models.")

            try:
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print(f"Saved model list to cache file {MODELS_CACHE_FILE}.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")

        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return []

    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []

    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list:
        # Skip entries without a download count rather than treating them as zero.
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue
        if model.downloads < min_downloads:
            # The list is sorted by downloads, so every remaining model is
            # below the threshold as well.
            break
        qualifying_models.append(model)

    print(f"Found {len(qualifying_models)} models with at least {min_downloads} downloads")
    return qualifying_models
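
# Example: get_all_models_with_downloads(min_downloads=50000) returns a list of
# ModelInfo objects, sorted by downloads with the most-downloaded model first.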


def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]

    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False,
                )
                with open(readme_path, 'r', encoding='utf-8') as f:
                    return f.read()
            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None
            except EntryNotFoundError:
                # This branch/filename combination doesn't exist; try the next one.
                continue
            except HFValidationError as e:
                print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                continue
            except Exception as e:
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Gated or private repos surface as HTTP 401/403; other branches
                # won't fare any better, so give up on this model.
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in (e.g. `huggingface-cli login`) and have accepted the model's terms.")
                    return None
                continue

    print(f"Could not fetch README for {model_id} from any standard location.")
    return None
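
# Example: get_model_readme("bert-base-uncased") returns the model card
# markdown as a string, or None if no README could be fetched.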


def get_filename_for_model(model_id):
    """Generate the JSON filename for a model."""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")
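
# Example: get_filename_for_model("google/flan-t5-base") returns
# "model_data_json/google_flan-t5-base.json" (separator per OS).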


def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None


def file_exists_for_model(model_id):
    """Check whether a JSON file already exists for this model."""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)


def process_model(model):
    """Process a single model: fetch its README, clean it, and save it as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', [])

    # Skip models already processed in a previous run, making the script resumable.
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")

    # Rate-limit each worker before hitting the Hub.
    time.sleep(DOWNLOAD_DELAY_SECONDS)

    readme_content = get_model_readme(model_id)
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")

    cleaned_readme = clean_readme_content(readme_content)

    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }

    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")


def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return

    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}

        # as_completed yields futures as they finish, so the progress bar
        # advances in completion order rather than submission order.
        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else:
                    failed += 1
            except Exception as e:
                # Count unexpected worker errors without killing the whole run.
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1

    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")


if __name__ == "__main__":
    main()