""" Utils to train DistilBERT |
|
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) |
|
""" |
|
import json |
|
import logging |
|
import os |
|
import socket |
|
|
|
import git |
|
import numpy as np |
|
import torch |
|
|
|
|
|
logging.basicConfig( |
|
format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", |
|
datefmt="%m/%d/%Y %H:%M:%S", |
|
level=logging.INFO, |
|
) |
|
logger = logging.getLogger(__name__) |


def git_log(folder_path: str):
    """
    Log commit info (repo id, sha and active branch) to `git_log.json` in `folder_path`.
    """
    repo = git.Repo(search_parent_directories=True)
    repo_infos = {
        "repo_id": str(repo),
        "repo_sha": str(repo.head.object.hexsha),
        "repo_branch": str(repo.active_branch),
    }

    with open(os.path.join(folder_path, "git_log.json"), "w") as f:
        json.dump(repo_infos, f, indent=4)


def init_gpu_params(params):
    """
    Handle single and multi-GPU / multi-node.

    In distributed mode, ranks and topology are read from the environment variables set by
    the launcher (WORLD_SIZE, N_GPU_NODE, RANK, N_NODES, NODE_RANK) and the NCCL process
    group is initialized.
    """
    if params.n_gpu <= 0:
        params.local_rank = 0
        params.master_port = -1
        params.is_master = True
        params.multi_gpu = False
        return

    assert torch.cuda.is_available()

    logger.info("Initializing GPUs")
    if params.n_gpu > 1:
        assert params.local_rank != -1

        params.world_size = int(os.environ["WORLD_SIZE"])
        params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
        params.global_rank = int(os.environ["RANK"])
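
        # number of nodes / node ID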
        params.n_nodes = params.world_size // params.n_gpu_per_node
        params.node_id = params.global_rank // params.n_gpu_per_node
        params.multi_gpu = True

        assert params.n_nodes == int(os.environ["N_NODES"])
        assert params.node_id == int(os.environ["NODE_RANK"])
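
    # local job (single GPU)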
    else:
        assert params.local_rank == -1

        params.n_nodes = 1
        params.node_id = 0
        params.local_rank = 0
        params.global_rank = 0
        params.world_size = 1
        params.n_gpu_per_node = 1
        params.multi_gpu = False
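
    # sanity checks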
    assert params.n_nodes >= 1
    assert 0 <= params.node_id < params.n_nodes
    assert 0 <= params.local_rank <= params.global_rank < params.world_size
    assert params.world_size == params.n_nodes * params.n_gpu_per_node
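
    # define whether this is the master process / if we are in multi-node distributed mode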
    params.is_master = params.node_id == 0 and params.local_rank == 0
    params.multi_node = params.n_nodes > 1
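
    # summary of the distributed setup seen by this process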
    PREFIX = f"--- Global rank: {params.global_rank} - "
    logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes)
    logger.info(PREFIX + "Node ID        : %i" % params.node_id)
    logger.info(PREFIX + "Local rank     : %i" % params.local_rank)
    logger.info(PREFIX + "World size     : %i" % params.world_size)
    logger.info(PREFIX + "GPUs per node  : %i" % params.n_gpu_per_node)
    logger.info(PREFIX + "Master         : %s" % str(params.is_master))
    logger.info(PREFIX + "Multi-node     : %s" % str(params.multi_node))
    logger.info(PREFIX + "Multi-GPU      : %s" % str(params.multi_gpu))
    logger.info(PREFIX + "Hostname       : %s" % socket.gethostname())
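
    # set GPU device for this process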
    torch.cuda.set_device(params.local_rank)
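
    # initialize multi-GPU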
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
            init_method="env://",
            backend="nccl",
        )


def set_seed(args):
    """
    Set the random seed.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
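

# Example usage (illustrative sketch): these helpers expect a namespace carrying at least
# `seed`, `n_gpu` and `local_rank`, e.g. the argparse Namespace built by the training
# script; the values below are placeholder assumptions, and `git_log` assumes the process
# runs inside a git checkout and that the target folder exists.
#
#   import argparse
#
#   args = argparse.Namespace(seed=42, n_gpu=0, local_rank=-1)
#   init_gpu_params(args)
#   set_seed(args)
#   git_log("./output_dir")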