"""Scrape README content and metadata for popular Hugging Face models into per-model JSON files."""

import os
import time
import re
import json
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from tqdm import tqdm
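
# Directory where one JSON file per model is written.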
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)
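
# Number of worker threads fetching READMEs concurrently.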
NUM_WORKERS = 4
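
# Pause before each README download, to stay polite to the Hub.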
DOWNLOAD_DELAY_SECONDS = 0.2


def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, images, and links."""
    if not text:
        return ""

    # Remove fenced code blocks first so their contents don't leak into later passes.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code spans.
    text = re.sub(r'`[^`]+`', '', text)
    # Remove images before converting links: the link pattern below would
    # otherwise match the [alt](url) part of an image and leave a stray "!alt".
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Replace markdown links with their anchor text.
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)
    # Drop bare URLs.
    text = re.sub(r'https?://\S+', '', text)
    # Collapse all whitespace runs into single spaces.
    text = ' '.join(text.split())
    return text
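
# Example: clean_readme_content("See [docs](https://x.y) and ![logo](a.png)")
# returns "See docs and".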


# Cache file for the fetched model list so repeated runs skip the API call.
MODELS_CACHE_FILE = "models_list_cache.pkl"


def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None

    # Try the local cache first; a failed load falls through to the API.
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None

    if models_list is None:
        print(f"Fetching models with at least {min_downloads} downloads from the API...")
        try:
            api = HfApi()
            # Sorting by downloads (descending) lets the filter below stop at
            # the first model under the threshold instead of scanning everything.
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            models_list = list(models_iterator)
            print(f"Fetched {len(models_list)} models.")

            try:
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print(f"Saved model list to cache file {MODELS_CACHE_FILE}.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")

        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return []

    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []

    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list:
        # Skip entries without a download count rather than treating them as zero.
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue
        if model.downloads < min_downloads:
            # The list is sorted by downloads, so every remaining model is
            # below the threshold as well.
            break
        qualifying_models.append(model)

    print(f"Found {len(qualifying_models)} models with at least {min_downloads} downloads")
    return qualifying_models
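
# Example: get_all_models_with_downloads(min_downloads=50000) returns a list of
# ModelInfo objects, sorted by downloads with the most-downloaded model first.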


def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]

    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False,
                )
                with open(readme_path, 'r', encoding='utf-8') as f:
                    return f.read()
            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None
            except EntryNotFoundError:
                # This branch/filename combination doesn't exist; try the next one.
                continue
            except HFValidationError as e:
                print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                continue
            except Exception as e:
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Gated or private repos surface as HTTP 401/403; other branches
                # won't fare any better, so give up on this model.
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in (e.g. `huggingface-cli login`) and have accepted the model's terms.")
                    return None
                continue

    print(f"Could not fetch README for {model_id} from any standard location.")
    return None
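
# Example: get_model_readme("bert-base-uncased") returns the model card
# markdown as a string, or None if no README could be fetched.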


def get_filename_for_model(model_id):
    """Generate the JSON filename for a model."""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")
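
# Example: get_filename_for_model("google/flan-t5-base") returns
# "model_data_json/google_flan-t5-base.json" (separator per OS).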


def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None


def file_exists_for_model(model_id):
    """Check whether a JSON file already exists for this model."""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)


def process_model(model):
    """Process a single model: fetch its README, clean it, and save it as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', [])

    # Skip models already processed in a previous run, making the script resumable.
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")

    # Rate-limit each worker before hitting the Hub.
    time.sleep(DOWNLOAD_DELAY_SECONDS)

    readme_content = get_model_readme(model_id)
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")

    cleaned_readme = clean_readme_content(readme_content)

    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }

    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")


def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return

    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}

        # as_completed yields futures as they finish, so the progress bar
        # advances in completion order rather than submission order.
        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else:
                    failed += 1
            except Exception as e:
                # Count unexpected worker errors without killing the whole run.
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1

    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")


if __name__ == "__main__":
    main()