import argparse
import yaml
import os
import requests
from huggingface_hub import snapshot_download, hf_hub_download
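
# Expected YAML layout: a list of model entries. A minimal, illustrative
# example (the field names are exactly the keys this script reads; the
# values are placeholders, not real repositories):
#
# - model_id: org/example-model
#   platform: HuggingFace
#   local_dir: checkpoints/example-model
#   filename: pytorch_model.bin
#   for_inference: true
#   base_model: false
# - model_id: example-asset
#   platform: GitHub
#   local_dir: checkpoints/example-asset
#   url: https://github.com/org/example-repo/releases/download/v1.0/weights.pth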

def load_config(config_path):
    """Load the list of model entries from the YAML configuration file."""
    with open(config_path, 'r') as file:
        return yaml.safe_load(file)

def download_model(model_config, full_ckpts=False):
    """Download one model described by a YAML config entry into its local_dir."""
    model_id = model_config['model_id']
    local_dir = model_config['local_dir']
    platform = model_config['platform']
    url = model_config.get('url')  # Optional; required only for GitHub downloads
    filename = model_config.get('filename')  # Optional; required for single-file HuggingFace downloads

    # Ensure the local directory exists
    os.makedirs(local_dir, exist_ok=True)
    
    if platform == "HuggingFace":
        if full_ckpts:
            print(f"Downloading full model {model_id} from HuggingFace to {local_dir}")
            snapshot_download(
                repo_id=model_id,
                local_dir=local_dir,
                allow_patterns=["*.pth", "*.bin", "*.json"],  # Common model file extensions
                ignore_patterns=["*.md", "*.txt"],  # Ignore non-model files
            )
            print(f"Successfully downloaded {model_id} to {local_dir}")
        else:
            if not filename:
                raise ValueError(f"No filename provided for model: {model_id}")
            print(f"Downloading file {filename} for model {model_id} from HuggingFace to {local_dir}")
            hf_hub_download(
                repo_id=model_id,
                filename=filename,
                local_dir=local_dir,
            )
            print(f"Successfully downloaded {filename} to {local_dir}")
    elif platform == "GitHub":
        if not url:
            raise ValueError(f"No URL provided for GitHub model: {model_id}")
        if not filename:
            filename = os.path.basename(url)
        full_path = os.path.join(local_dir, filename)
        print(f"Downloading model {model_id} from GitHub URL {url} to {full_path}")
        response = requests.get(url, stream=True, timeout=60)
        if response.status_code == 200:
            with open(full_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Skip keep-alive chunks
                        f.write(chunk)
            print(f"Successfully downloaded {model_id} to {full_path}")
        else:
            raise ValueError(f"Failed to download {model_id} from {url}: HTTP {response.status_code}")
    else:
        raise ValueError(f"Unsupported platform: {platform}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download model checkpoints from HuggingFace or GitHub.")
    parser.add_argument('--config', type=str, default="configs/model_ckpts.yaml",
                        help="Path to the YAML configuration file")
    parser.add_argument('--full_ckpts', action='store_true',
                        help="Download every model's full repository via snapshot_download, "
                             "instead of only the single files for models marked for_inference in the YAML")
    parser.add_argument('--include_base_model', action='store_true',
                        help="Also download models marked base_model: true (skipped by default)")
    parser.add_argument('--base_model_only', action='store_true',
                        help="Download only models marked base_model: true, ignoring the for_inference filter")
    args = parser.parse_args()

    # Load the YAML configuration
    config = load_config(args.config)

    # Select models based on the flags: --base_model_only keeps only base
    # models; otherwise the for_inference and base_model filters apply unless
    # overridden by --full_ckpts / --include_base_model.
    for model_config in config:
        if args.base_model_only:
            if not model_config.get('base_model', False):
                continue
        else:
            if not args.full_ckpts and not model_config.get('for_inference', False):
                continue
            if not args.include_base_model and model_config.get('base_model', False):
                continue
        download_model(model_config, full_ckpts=args.full_ckpts)
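
# Example invocations (illustrative; substitute this script's actual filename):
#
#   python download_ckpts.py                    # single files for inference models
#   python download_ckpts.py --full_ckpts       # full repos for every non-base model
#   python download_ckpts.py --base_model_only  # base models only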