# -*- coding : utf-8 -*- # Description : gpu, model, Parallelism, data, train and inference config definition import math, json from constants import * from dataclasses import dataclass from enum import Enum from functools import total_ordering class ActivationRecomputation(Enum): NONE = 0 """No activation recomputation; requires the most amount of memory.""" SELECTIVE = 1 """Selectively checkpoints and recomputes only parts of each transformer layer that take up a considerable amount of memory but are not computationally expensive to recompute, i.e. Q K V matrix multiplies, QK^T matrix multiply, softmax, softmax dropout, and attention over V.""" FULL = 2 """Full activation recomputation stores the input to EVERY transformer layer, which is sharded across the tensor parallel group, thus requiring an extra all-gather (ignored for now) per layer and add communication overhead; requires the lease amount of memory; requires an extra forward pass.""" @total_ordering class DSZeRO(Enum): NONE = 0 """No DeepSPeed ZeRO; requires the most amount of memory.""" STAGE_1 = 1 """ZeRO stage 1 shards the optimizer states across the data parallel group.""" STAGE_2 = 2 """ZeRO stage 2 shards the optimizer states and gradients across the data parallel group.""" STAGE_3 = 3 """ZeRO stage 3 shards the optimizer states, gradients, and model weights across the data parallel group.""" def __lt__(self, other): # 炫技写法 if other.__class__ is self.__class__: return self.value < other.value # Enum 枚举类自动赋值 return NotImplemented def __eq__(self, other): if isinstance(other, DSZeRO): return self.value == other.value return NotImplemented @dataclass class GPUEfficiencyConfig: flops_efficiency: float = 1.0 hbm_memory_efficiency: float = 1.0 intra_node_memory_efficiency: float = 1.0 inter_node_memory_efficiency: float = 1.0 @dataclass class InferenceConfig: """Inference configuration dataclass.""" batch_size_per_gpu: int = None # batch size seq_len: int = 522 # input sequence length generate_len: int = 1526 # number of tokens to generate context_len: int = None # context length use_kv_cache: bool = True # whether to use key/value cache bytes_per_param: int = BYTES_FP16 # model weight bytes layernorm_dtype_bytes: int = BYTES_FP16 # layernorm data type bytes kv_cache_dtype_bytes: int = BYTES_FP16 # key/value cache data type bytes def __post_init__(self): if self.context_len is None: self.context_len = self.seq_len + self.generate_len @dataclass class ParallelismConfig: """dataclass module provides a decorator and functions for automatically adding generated special methods such as __init__() and __repr__() to user-defined classes """ tp_size: int = 1 # tensor parallelism size, Megatron-LM tensor parallelism implementation pp_size: int = 1 # pipeline parallelism size, Megatron-LM pipeline parallelism implementation dp_size: int = 1 # data parallelism size, DeepSpeed Zero parallelism implementation sp_size: int = 1 # sequence parallelism size, Megatron-LM sequence parallelism implementation @dataclass class ModelConfig: num_layers: int # number of transformer layers (blocks) n_head: int # number of attention heads hidden_dim: int # hidden dimension vocab_size: int # vocabulary size num_key_value_heads: int = None max_seq_len: int = None # max sequence length ffn_embed_dim: int = None # hidden dimension of FFN, default to 4 * hidden_dim model_type: str = None # model type as tagged on Hugging Face (e.g., gpt2, opt, llama.) model_name: str = None # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b.) def __post_init__(self): if self.num_key_value_heads is None: # 如果不存在,设置默认值 self.num_key_value_heads = self.n_head if self.ffn_embed_dim is None: self.ffn_embed_dim = self.hidden_dim * 4 @dataclass class GPUConfig: # 1, gpu 型号和显存大小 name: str # GPU config name memory_GPU_in_GB: float # memory per GPU in GB # 2, gpu 显存带宽、节点内带宽、节点间带宽 hbm_bandwidth_in_GB_per_sec: float # GPU HBM bandwidth in GB/s intra_node_bandwidth_in_GB_per_sec: float # intra node GPU bandwidth in GB/s.(PCIE/NVLINK) intra_node_min_message_latency: float # minimum intra node message latency in seconds inter_node_bandwidth_in_GB_per_sec: float = 200 # inter node bandwidth in GB/s, assuming Mellanox 200Gbps HDR Infiniband # 3, 不同精度的 Tensor core 的计算性能 peak_fp32_TFLOPS: float = None # peak Tensor TFLOPS for FP32 peak_fp16_TFLOPS: float = None # peak Tensor TFLOPS for FP16 peak_int8_TFLOPS: float = None # peak Tensor TFLOPS for INT8 peak_int4_TFLOPS: float = None # peak Tensor TFLOPS for INT4 FLOPS_EFFICIENCY = 0.7 HBM_MEMORY_EFFICIENCY = 0.9 def __post_init__(self): """object creation of DataClass starts with __init__() (constructor-calling) and ends with __post__init__() (post-init processing). """ if self.peak_fp32_TFLOPS is None: self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2) if self.peak_int8_TFLOPS is None: self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS if self.peak_int4_TFLOPS is None: self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS if self.FLOPS_EFFICIENCY: self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY if self.HBM_MEMORY_EFFICIENCY: self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY class LLMConfigs(object): def __init__(self, gpu_config: GPUConfig, model_config: ModelConfig, parallelism_config: ParallelismConfig = ParallelismConfig(), inference_config: InferenceConfig = InferenceConfig(), gpu_efficiency_config: GPUEfficiencyConfig = GPUEfficiencyConfig() ) -> None: self.model_config = model_config self.gpu_config = gpu_config self.parallelism_config = parallelism_config self.inference_config = inference_config # 用户自行指定配置 self.gpu_efficiency_config = gpu_efficiency_config # 用户自行指定配置 def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> dict: """Read model and gpu configs from a json file.""" config_files = ["configs/model_configs.json", "configs/gpu_configs.json"] model_config, gpu_config = {}, {} for config_filename in config_files: with open(config_filename, "r") as f: config_json = json.load(f) if "model" in config_filename: assert model_name in config_json, f"model name {model_name} not found in {config_filename}" config_dict = config_json[model_name] model_config = ModelConfig(**config_dict) elif "gpu" in config_filename: assert gpu_name in config_json, f"gpu name {gpu_name} not found in {config_filename}" config_dict = config_json[gpu_name] gpu_config = GPUConfig(**config_dict) else: assert False, f"unknown config type when reading: {type}" return model_config, gpu_config def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float: """Get the expected TFLOPS per GPU for the specified data type configuration/GPU (adjusted by flops_efficiency) Returns: float: TFLOPS per GPU and unit is T. """ if data_type == "int8": gemm_TFOPS = gpu_config.peak_int8_TFLOPS elif data_type == "fp16": gemm_TFOPS = gpu_config.peak_fp16_TFLOPS else: print("weight_bits and activation_bits must be 8, or 16!") return gemm_TFOPS * flops_efficiency def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float: return ( gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency ) def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float: return ( gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency ) def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float: return ( gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency )