# -*- coding: utf-8 -*-
# Author      : honggao.zhang
# Created     : 2023-7-19
# Version     : 0.1.0
# Description : Transformer model (LLM) profiling tool that estimates a model's FLOPs, memory usage, and latency.
# Reference   : https://github.com/cli99/llm-analysis
import logging
import pprint
import pandas as pd
from math import floor
from config import *
from utils import *
logger = logging.getLogger()
class CountCausalLMParams(object): | |
def __init__(self, model_config: ModelConfig) -> None: | |
self.h = model_config.hidden_dim | |
self.l = model_config.num_layers | |
self.V = model_config.vocab_size | |
self.model_config = model_config | |
def count_params_embedding(self, shared_embedding: bool = True) -> int: | |
"""Get the number of parameters in the embedding layer. params_te = vocab_size * d_model | |
Args: | |
shared_embedding (bool, optional): whether the output embedding \ | |
shares weights with the input embedding. Defaults to True. | |
Returns: | |
int: the number of parameters in the embedding layer | |
""" | |
num_params_input_embedding = self.V * self.h | |
num_params_output_embedding = self.V * self.h if not shared_embedding else 0 | |
return num_params_input_embedding + num_params_output_embedding | |
def count_params_per_layer_attn(self) -> int: | |
"""Get the number of parameters per layer in the attention module | |
which include 4 linear layer: query/key/value projection and output matrices. | |
params_attn(mha) = params_q + params_k + params_v + params_o = 4 * d_model**2 | |
Returns: | |
int: the number of parameters per layer in the attention module(mha) | |
""" | |
return 4 * self.h ** 2 | |
def count_params_per_layer_mlp(self) -> int: | |
"""Get the number of parameters in the MLP linear layers, including the | |
intermediate and output matrices. | |
params_mlp = prams_fc1 + params_fc2 = d_model * 4_d_model + 4_d_model * d_model = 8 * d_model**2 | |
Returns: | |
int: the number of parameters in the two MLP linear layers | |
""" | |
return 8 * self.h ** 2 | |
def count_params_per_layer_ln(self) -> int: | |
"""Get the number of parameters per layer in the two layer normalization module. | |
params_ln = 4 * d_model | |
Returns: | |
int: the number of parameters per layer in the two layer normalization module | |
""" | |
return 4 * self.h | |
def count_params_per_layer(self, ln_ignore=True) -> tuple: | |
"""Get the number of params per layer in the transformer decoder blocks, | |
mainly including the attention and MLP layers | |
params_per_layer = params_attn + params_mlp + params_ln | |
= 4d_model^2 + 8d_model^2 + 2*4d_model = 12d_model^2 + 8d_model | |
Return: | |
int: the number of params per layer in the transformer decoder blocks | |
""" | |
params_per_layer_attn = self.count_params_per_layer_attn() | |
params_per_layer_mlp = self.count_params_per_layer_mlp() | |
params_per_layer_ln = 0 if ln_ignore else 2 * self.count_params_per_layer_ln() | |
params_per_layer = ( | |
params_per_layer_attn | |
+ params_per_layer_mlp | |
+ params_per_layer_ln | |
) | |
dict_params_per_layer = { | |
"params_per_layer": params_per_layer, | |
"params_attn": params_per_layer_attn, | |
"params_mlp": params_per_layer_mlp, | |
"params_layernorm": params_per_layer_ln, | |
} | |
return params_per_layer, dict_params_per_layer | |
def count_params_model(self) -> int: | |
"""Get the total number of parameters in the model including all layers and token embedding layer. | |
params_model = params_embedding + params_per_layer * num_layers | |
= V * d_model + 12 * d_model**2 * num_layers | |
Returns: | |
int: the total number of parameters in the model | |
""" | |
params_per_layer, dict_params_per_layer = self.count_params_per_layer() | |
return (params_per_layer * self.l | |
+ self.count_params_embedding() | |
) | |
def __call__(self, hidden_dim, num_layers, vocab_size) -> int: | |
return (vocab_size * hidden_dim | |
+ 12 * hidden_dim ** 2 * num_layers | |
) | |
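# Worked example (a rough sanity check, assuming a LLaMA-13B-like config with
# hidden_dim = 5120, num_layers = 40, vocab_size = 32000):
#   params ≈ 12 * 5120**2 * 40 + 32000 * 5120 ≈ 12.58e9 + 0.16e9 ≈ 12.7B,
# which is close to the published 13B figure; the remaining gap comes from the
# layernorm, bias, and positional parameters this quick estimate ignores.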
class CountCausalLMFlops(object): | |
"""The count is model-specific and does not depend on the parallelism strategy. | |
And ignore layer normalization and other element-wise operations.""" | |
def __init__(self, model_config: ModelConfig, batch_size: int, seq_len: int, simp_count=False) -> None: | |
self.h = model_config.hidden_dim | |
self.l = model_config.num_layers | |
self.V = model_config.vocab_size | |
self.b = batch_size | |
self.s = seq_len | |
if not simp_count: | |
llm_params = CountCausalLMParams(model_config) | |
self.model_flops = llm_params(self.h, self.l, self.V) * 2 | |
def count_flops_fwd_per_layer_attn(self, batch_size: int, seq_len: int) -> int: | |
"""Get the number of floating point operations (flops) for the forward | |
pass of the attention module in a transformer layer, given the batch | |
size and sequence length. | |
        This mainly includes the four linear projections (query/key/value and output)
        and the self-attention matrix multiplications; element-wise operations are ignored.
        flops_attn = flops_q + flops_k + flops_v + flops_output + flops_self_attention
                   = 4 * (2bsh^2) + 2 * (2bs^2h) = 8bsh^2 + 4bs^2h
Args: | |
batch_size (int): batch size | |
seq_len (int): sequence length | |
Returns: | |
int: flops for the forward pass of the attention module in a transformer layer | |
""" | |
return ( | |
8 * batch_size * seq_len * self.h ** 2 | |
+ 4 * batch_size * seq_len ** 2 * self.h | |
) | |
def count_flops_fwd_per_layer_mlp(self, batch_size: int, seq_len: int) -> int: | |
"""Count two flops of matrices multiplication(two linear layers in the MLP module.) | |
flops_mlp = flops_fc1 + flops_fc2 = 2bs(4h^2) + 2bs(4h^2) = 16bsh^2 | |
""" | |
return 16 * batch_size * seq_len * self.h ** 2 | |
def count_flops_fwd_per_layer(self, batch_size: int, seq_len: int, ln_ignore=True) -> tuple: | |
flops_fwd_per_layer_attn = self.count_flops_fwd_per_layer_attn(batch_size, seq_len) | |
flops_fwd_per_layer_mlp = self.count_flops_fwd_per_layer_mlp(batch_size, seq_len) | |
flops_fwd_per_layer_ln = 0 | |
flops_fwd_per_layer = ( | |
flops_fwd_per_layer_attn | |
+ flops_fwd_per_layer_mlp | |
+ flops_fwd_per_layer_ln | |
) | |
dict_flops_fwd_per_layer = { | |
"flops_fwd_per_layer": flops_fwd_per_layer, | |
"flops_attn": flops_fwd_per_layer_attn, | |
"flops_mlp": flops_fwd_per_layer_mlp, | |
"flops_layernorm": flops_fwd_per_layer_ln, | |
} | |
return flops_fwd_per_layer, dict_flops_fwd_per_layer | |
    def count_flops_logits_layer(self) -> int:
"""flops of output token logits layer""" | |
return 2 * self.b * self.s * self.h * self.V | |
def count_flops_fwd_model(self, batch_size: int, seq_len: int) -> int: | |
"""Count flops of the forward pass of the transformer model, given the batch size and sequence length.""" | |
num_flops_fwd_model = ( | |
self.count_flops_fwd_per_layer(batch_size, seq_len)[0] * self.l | |
+ self.count_flops_logits_layer() | |
) | |
# validate | |
assert within_range( | |
num_flops_fwd_model, | |
( | |
24 * self.b * self.s * self.l * self.h**2 | |
* (1 + self.s / (6 * self.h) + self.V / (12 * self.l * self.h)) | |
), | |
TOLERANCE, | |
) | |
return num_flops_fwd_model | |
def count_flops_bwd_model(self, batch_size: int, seq_len: int) -> int: | |
"""Get the number of floating point operations (flops) for the backward | |
pass of the entire transformer model, given the batch size and sequence""" | |
return 2 * self.count_flops_fwd_model(batch_size, seq_len) | |
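# Worked example (a rough check, assuming a 7B-like config with hidden_dim = 4096,
# num_layers = 32, vocab_size = 32000, batch_size = 1, seq_len = 1024):
#   per layer: attn ≈ 8*1024*4096^2 + 4*1024^2*4096 ≈ 154.6 GFLOPs, mlp ≈ 16*1024*4096^2 ≈ 274.9 GFLOPs
#   model fwd ≈ 32 * 429.5 GFLOPs + logits 2*1024*4096*32000 ≈ 13.7 TFLOPs + 0.27 TFLOPs ≈ 14 TFLOPs,
# consistent with the common "2 * params * tokens" rule of thumb (2 * 6.6e9 * 1024 ≈ 13.5 TFLOPs).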
class CountCausalLMMemory(object): | |
"""Count memory of the model and layers.""" | |
def __init__(self, llm_configs: LLMConfigs) -> None: | |
self.model_config = llm_configs.model_config | |
self.h = self.model_config.hidden_dim | |
self.l = self.model_config.num_layers | |
self.V = self.model_config.vocab_size | |
self.b = llm_configs.inference_config.batch_size_per_gpu | |
self.s = llm_configs.inference_config.seq_len | |
self.o = llm_configs.inference_config.generate_len | |
self.bytes_per_param = llm_configs.inference_config.bytes_per_param | |
self.tp_size = llm_configs.parallelism_config.tp_size | |
self.pp_size = llm_configs.parallelism_config.pp_size | |
self.num_layers_per_gpu = int(self.l / self.pp_size) | |
        self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9  # memory_GPU_in_GB is given in GB; converted to bytes here
self.llm_params = CountCausalLMParams(self.model_config) | |
def count_memory_weights(self, embedding_dtype_bytes: int = BYTES_FP16): | |
"""Get the memory of the model weights""" | |
params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer() | |
params_embedding = self.llm_params.count_params_embedding() | |
memory_weight_per_layer = ( | |
(params_per_layer / self.tp_size) * self.bytes_per_param | |
) | |
memory_weight_per_gpu = memory_weight_per_layer * self.num_layers_per_gpu | |
memory_embedding = (params_embedding / self.tp_size) * embedding_dtype_bytes | |
memory_weight_per_gpu = memory_weight_per_gpu + memory_embedding | |
return memory_weight_per_gpu | |
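    # Worked example (a rough check, assuming a 13B-parameter model in fp16 with
    # tp_size = 2 and pp_size = 1): the weights take roughly 13e9 * 2 bytes = 26 GB in
    # total, i.e. about 13 GB per GPU after tensor-parallel sharding, before any
    # activation or kv cache memory is counted.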
def count_memory_activation_per_layer_attn( | |
self, | |
batch_size: int, | |
seq_len: int, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL | |
) -> float: | |
"""Count the memory (in bytes) required to store the activations of the | |
attention in a transformer layer, given the batch size, sequence length, | |
whether it is inference or training, the activation recomputation strategy, | |
and the activation data type. | |
""" | |
if activation_recomputation == ActivationRecomputation.FULL: | |
return (batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param | |
def count_memory_activation_per_layer_mlp( | |
self, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
) -> float: | |
""" The `mlp` activations include the input to the two linear layers.""" | |
if activation_recomputation == ActivationRecomputation.FULL: | |
return 0 | |
return 0 | |
def count_memory_activation_per_layer_layernorm( | |
self, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP16 | |
) -> float: | |
if activation_recomputation == ActivationRecomputation.FULL: | |
return 0 | |
return 0 | |
def count_memory_activation_per_layer( | |
self, | |
batch_size: int, | |
seq_len: int, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP16 | |
) -> float: | |
if activation_recomputation == ActivationRecomputation.FULL: | |
return ( | |
(batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param | |
) | |
return 0 | |
def count_memory_kv_cache_per_layer( | |
self, | |
batch_size: int, | |
seq_len: int, | |
generate_len: int, | |
kv_cache_dtype_bytes: int = BYTES_FP16, | |
) -> float: | |
"""Get the memory (in bytes) required to store the key and value cache | |
for a transformer layer in inference, given the batch size, sequence | |
length, activation data type, and tensor parallelism size. | |
        memory_kv_cache = 2 * b * (s + o) * h * kv_cache_dtype_bytes per layer, i.e. 4blh(s+o) bytes for the whole model with a 2-byte kv cache dtype
        Args:
            batch_size (int): batch size
            seq_len (int): sequence length
            generate_len (int): number of tokens to generate; the cached context length is seq_len + generate_len
Returns: | |
float: the memory (in bytes) required to store the key and value cache for a transformer layer in inference | |
""" | |
return ( | |
(2 * batch_size * (seq_len + generate_len) * self.h) / self.tp_size | |
) * kv_cache_dtype_bytes | |
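    # Worked example (a rough check, assuming hidden_dim = 5120, num_layers = 40,
    # batch_size = 1, seq_len + generate_len = 2048, fp16 kv cache, tp_size = 1):
    #   per layer: 2 * 1 * 2048 * 5120 * 2 bytes ≈ 42 MB
    #   whole model: 40 layers * 42 MB ≈ 1.7 GB of kv cache for a single sequence.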
def count_memory_per_gpu( | |
self, | |
batch_size: int, | |
seq_len: int, | |
generate_len: int, | |
is_inference: bool = True, | |
use_kv_cache: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP16, | |
kv_cache_dtype_bytes: int = BYTES_FP16 | |
) -> tuple: | |
        # 1. prefill stage: count memory usage and the max batch size
weight_memory_per_gpu = self.count_memory_weights() # count model weights memory | |
memory_left = self.gpu_memory_in_GB - weight_memory_per_gpu | |
        prefill_activation_memory_batch_size_1 = (  # activation memory of the prefill stage with batch_size = 1
self.count_memory_activation_per_layer( | |
1, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
prefill_max_batch_size_per_gpu = int( | |
memory_left / prefill_activation_memory_batch_size_1 | |
) | |
prefill_activation_memory_per_gpu = ( | |
self.count_memory_activation_per_layer( | |
batch_size, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
        assert memory_left > prefill_activation_memory_per_gpu, (
            f"weight_memory_per_gpu {num_to_string(weight_memory_per_gpu)} and prefill activation memory {num_to_string(prefill_activation_memory_per_gpu)} are too large to fit in GPU memory! memory_left is {num_to_string(memory_left)}!"
        )
        # 2. decode stage: count memory usage and the max batch size
        if use_kv_cache:
            kv_cache_memory_batch_size_1 = (
                self.count_memory_kv_cache_per_layer(
                    1,
                    seq_len,
                    generate_len,
                    kv_cache_dtype_bytes
                )
                * self.num_layers_per_gpu
            )
            kv_cache_memory_per_gpu = (
                self.count_memory_kv_cache_per_layer(
                    batch_size,
                    seq_len,
                    generate_len,
                    kv_cache_dtype_bytes
                )
                * self.num_layers_per_gpu
            )
decode_activation_memory_batch_size_1 = ( | |
# seq_len 1 is used for decoding | |
self.count_memory_activation_per_layer( | |
1, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
decode_activation_memory_per_gpu = ( | |
# seq_len 1 is used for decoding | |
self.count_memory_activation_per_layer( | |
batch_size, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
decode_max_batch_size_per_gpu = int( | |
memory_left / (decode_activation_memory_batch_size_1 + kv_cache_memory_batch_size_1) | |
) | |
max_batch_total_tokens = decode_max_batch_size_per_gpu * (seq_len + generate_len) | |
            # The llama2-70b model uses GQA with only 8 kv heads, so its kv cache is 8x smaller than the MHA estimate above and max_batch_total_tokens can be scaled up by 8 (e.g. 16384 * 8).
if self.model_config.model_name == "llama2-70b": | |
max_batch_total_tokens *= 8 | |
assert batch_size <= decode_max_batch_size_per_gpu, ( | |
f"batch_size_per_gpu {batch_size} is too large to fit" | |
" in GPU memory, decode_max_batch_size_per_gpu:" | |
f" {decode_max_batch_size_per_gpu}" | |
) | |
assert memory_left > ( | |
kv_cache_memory_per_gpu + decode_activation_memory_per_gpu | |
), ("kv_cache and activation memory with batch_size_per_gpu =" | |
f" {batch_size} is too large to fit in GPU memory" | |
) | |
else: | |
            # Without a kv cache, each decode step attends over the full context (seq_len + generate_len), not just the newly generated token.
decode_activation_memory_batch_size_1 = ( | |
self.count_memory_activation_per_layer( | |
1, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
decode_max_batch_size_per_gpu = int( | |
memory_left / decode_activation_memory_batch_size_1 | |
) | |
assert batch_size <= decode_max_batch_size_per_gpu, ( | |
f"batch_size {batch_size} is too large to fit" | |
" in GPU memory, decode_max_batch_size_per_gpu:" | |
f" {decode_max_batch_size_per_gpu}" | |
) | |
decode_activation_memory_per_gpu = ( | |
self.count_memory_activation_per_layer( | |
batch_size, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes | |
) | |
* self.num_layers_per_gpu | |
) | |
            kv_cache_memory_per_gpu = 0
            max_batch_total_tokens = decode_max_batch_size_per_gpu * (seq_len + generate_len)
decode_memory_total = (weight_memory_per_gpu + decode_activation_memory_per_gpu + kv_cache_memory_per_gpu) | |
# memory summary | |
memory_prefill_summary_dict = { | |
"weight_memory_per_gpu": weight_memory_per_gpu, | |
"prefill_activation_memory_batch_size_1": prefill_activation_memory_batch_size_1, | |
"prefill_max_batch_size_per_gpu": prefill_max_batch_size_per_gpu, | |
"prefill_activation_memory_per_gpu": prefill_activation_memory_per_gpu, | |
} | |
memory_decode_summary_dict = { | |
"weight_memory_per_gpu": weight_memory_per_gpu, | |
"decode_activation_memory_per_gpu": decode_activation_memory_per_gpu, | |
"kv_cache_memory_per_gpu": kv_cache_memory_per_gpu, | |
"decode_memory_total": decode_memory_total, | |
"decode_max_batch_size_per_gpu": decode_max_batch_size_per_gpu, | |
"max_batch_total_tokens": max_batch_total_tokens * 0.97, | |
} | |
return memory_prefill_summary_dict, memory_decode_summary_dict | |
class CountCausalLMLatency(object): | |
"""Count latency by roof-line performance model.""" | |
def __init__(self, llm_configs: LLMConfigs, data_type="fp16") -> None: | |
self.model_config = llm_configs.model_config | |
self.gpu_config = llm_configs.gpu_config | |
self.inference_config = llm_configs.inference_config | |
self.parallelism_config = llm_configs.parallelism_config | |
self.h = self.model_config.hidden_dim | |
self.l = self.model_config.num_layers | |
self.V = self.model_config.vocab_size | |
self.b = llm_configs.inference_config.batch_size_per_gpu | |
self.s = llm_configs.inference_config.seq_len | |
self.o = llm_configs.inference_config.generate_len | |
self.bytes_per_param = llm_configs.inference_config.bytes_per_param | |
self.tp_size = self.parallelism_config.tp_size | |
self.pp_size = self.parallelism_config.pp_size | |
self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size) | |
        self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9  # HBM bandwidth, converted from GB/s to bytes/s
        self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9  # intra-node interconnect bandwidth, converted from GB/s to bytes/s
        self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12  # converted from TFLOPS to FLOPS
        self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9  # memory_GPU_in_GB is given in GB; converted to bytes here
self.llm_params = CountCausalLMParams(self.model_config) | |
self.llm_memory = CountCausalLMMemory(llm_configs) | |
self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.o) | |
def common_count_latency_for_ops( | |
self, | |
batch_size: int, | |
seq_len: int, | |
is_inference=True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
ops_type: str="attn", | |
stage="decode_" | |
) -> float: | |
"""Count the latency for the forward layer or model, assuming the compute and memory operations are perfectly overlapped. | |
Args: | |
flops (float): flops of the forward layer or model | |
memory (float): r/w memory(bytes) of the forward layer or model | |
tp_size (float): tensor parallelism size | |
gpu_TFLOPS (float): GPU TFLOPS in T(10^12)FLOPS | |
gpu_hbm_bandwidth (float): GPU HBM bandwidth in GB/s(10^9) | |
Returns: | |
float: the latency in seconds for the forward pass | |
""" | |
if ops_type=="attn": | |
flops = self.llm_flops.count_flops_fwd_per_layer_attn(batch_size, seq_len) | |
weight_memory = self.llm_params.count_params_per_layer_attn() * self.bytes_per_param | |
activation_memory = self.llm_memory.count_memory_activation_per_layer_attn( | |
batch_size, seq_len, is_inference, activation_recomputation | |
) | |
elif ops_type=="mlp": | |
flops = self.llm_flops.count_flops_fwd_per_layer_mlp(batch_size, seq_len) | |
weight_memory = self.llm_params.count_params_per_layer_mlp() * self.bytes_per_param | |
activation_memory = self.llm_memory.count_memory_activation_per_layer_mlp(is_inference, activation_recomputation) | |
elif ops_type=="layernorm": | |
activation_memory = self.llm_memory.count_memory_activation_per_layer_layernorm( | |
is_inference, activation_recomputation) # activation_memory | |
weight_memory = 0 # layernorm has no matrix weight, only vector weight, is ignored | |
flops = 0 # layernorm is not compute bound, flops is very small | |
        else:
            raise ValueError(f"unsupported ops_type: {ops_type}")
memory = weight_memory + activation_memory | |
        compute_latency = flops / (self.tp_size * self.gpu_TFLOPS)  # in seconds
memory_latency = memory / (self.tp_size * self.gpu_hbm_bandwidth) | |
if memory_latency > compute_latency: | |
print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} > compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is memory bound!") | |
else: | |
print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} <= compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is compute bound!") | |
return max(compute_latency, memory_latency) | |
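    # Worked example of the roofline comparison above (a rough check, assuming
    # hidden_dim = 4096, batch_size = 1, seq_len = 1 in the decode stage, fp16 weights,
    # tp_size = 1, and an A100-like GPU with ~312 TFLOPS and ~2 TB/s HBM bandwidth):
    #   attn flops   ≈ 8 * 1 * 1 * 4096^2 ≈ 134e6 FLOPs  -> compute latency ≈ 0.4 us
    #   attn weights ≈ 4 * 4096^2 * 2 bytes ≈ 134e6 B    -> memory latency  ≈ 67 us
    # so single-token decoding is heavily memory bound, which is why weight and kv cache
    # traffic dominate decode latency.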
def count_latency_fwd_per_layer_tp_comm(self, batch_size: int, seq_len: int) -> float: | |
"""Count the latency of a single allreduce communication across the | |
tensor parallel group in the forward pass of a transformer layer. | |
The latency is the max of the latency for the allreduce and the minimum | |
message latency through intra-node connect. | |
""" | |
is_ring_allreduce = False | |
if self.tp_size == 1: | |
return 0 | |
# \phi is communication data, if tp_size is large enough num_data_per_all_reduce can be 2bsh | |
if is_ring_allreduce: | |
num_data_per_all_reduce = ( | |
2 * batch_size * seq_len * self.h * | |
(self.tp_size - 1) / (self.tp_size) | |
) | |
else: | |
bsh = batch_size * seq_len * self.h | |
num_data_per_all_reduce = ( | |
6 * bsh * (self.tp_size - 1) / (self.tp_size) + | |
3 * bsh | |
) | |
latency_per_all_reduce = ( | |
num_data_per_all_reduce * self.bytes_per_param | |
/ (self.gpu_intra_node_bandwidth) | |
) | |
        # intra_node_min_message_latency: the minimum message latency of the intra-node interconnect
return max( | |
latency_per_all_reduce, | |
self.gpu_config.intra_node_min_message_latency, | |
) | |
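    # Note on the data volumes above (a sketch of the assumptions, not a derivation):
    # a ring allreduce over tp_size ranks moves about 2 * (tp_size - 1) / tp_size bytes
    # per byte of the bsh activation being reduced, which is where the is_ring_allreduce
    # branch comes from; the default branch appears to model a more expensive centralized
    # allreduce (cf. the "centralize_allreduce" tag used in profile_pp).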
def count_latency_fwd_per_layer( | |
self, | |
batch_size: int, | |
seq_len: int, | |
is_inference: bool=True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP16, | |
stage="decode_" | |
) -> tuple: | |
latency_fwd_per_layer_attn = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="attn", stage=stage) | |
latency_fwd_per_layer_mlp = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="mlp", stage=stage) | |
latency_fwd_per_layer_layernorm = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, "layernorm", stage=stage) | |
latency_fwd_per_layer_tp_comm = self.count_latency_fwd_per_layer_tp_comm(batch_size, seq_len) | |
latency_per_layer = ( | |
latency_fwd_per_layer_attn | |
+ latency_fwd_per_layer_mlp | |
            + 2 * latency_fwd_per_layer_layernorm  # two layernorm layers per transformer block
            + 2 * latency_fwd_per_layer_tp_comm  # two allreduce ops per block, each moving about 2bsh of data
) | |
dict_latency_per_layer = { | |
"latency_per_layer": (latency_per_layer), | |
"latency_attn": (latency_fwd_per_layer_attn), | |
"latency_mlp": (latency_fwd_per_layer_mlp), | |
"latency_layernorm": (2 * latency_fwd_per_layer_layernorm), | |
"latency_tp_comm": (2 * latency_fwd_per_layer_tp_comm), | |
} | |
return latency_per_layer, dict_latency_per_layer | |
def count_latency_fwd_input_embedding( | |
self, batch_size: int, seq_len: int | |
) -> float: | |
"""Get the latency for the forward pass of the input embedding layer, | |
given the batch size, sequence length, and data type of the embedding | |
weight. | |
        Args:
            batch_size (int): batch size
            seq_len (int): sequence length
Returns: | |
float: the latency in seconds for the forward pass of the input embedding layer | |
""" | |
memory_latency = ( | |
self.model_config.vocab_size | |
* self.model_config.hidden_dim | |
* self.bytes_per_param | |
/ (self.gpu_hbm_bandwidth) | |
) | |
comm_latency = self.count_latency_fwd_per_layer_tp_comm( | |
batch_size, seq_len | |
) | |
return memory_latency + comm_latency | |
def count_latency_fwd_output_embedding_loss( | |
self, batch_size: int, seq_len: int | |
) -> float: | |
"""Get the latency for the forward pass of the output embedding layer (computing the logits). The operation is compute bound. With tensor parallelism size > 1, an allgather communicates `batch_size * seq_len` elements, which is ignored here. Refer to https://arxiv.org/abs/1909.08053 for more details. | |
Args: | |
batch_size (int): batch size | |
seq_len (int): sequence length | |
Returns: | |
float: the latency in seconds for the forward pass of the output embedding layer | |
""" | |
compute_latency = ( | |
2 * batch_size * seq_len * self.h * self.V | |
/ self.tp_size | |
/ self.gpu_TFLOPS | |
) | |
return compute_latency | |
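    # Note: the logits GEMM performs roughly batch_size * seq_len FLOPs per byte of
    # weight traffic (with 2-byte weights), so in the prefill stage (large seq_len) it is
    # compute bound and only the compute latency is modeled here; the allgather of
    # batch_size * seq_len elements across the tensor-parallel group is ignored, as
    # stated in the docstring above.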
def count_latency_kv_cache( | |
self, | |
batch_size: int, | |
seq_len: int, | |
generate_len: int, | |
use_kv_cache: bool = True, | |
kv_cache_dtype_bytes: int = BYTES_FP16 | |
) -> tuple: | |
"""Get the latency for the forward pass of the key and value cache in a transformer layer, given the batch size, sequence length, and whether the key and value cache is used. | |
Args: | |
batch_size (int): batch size | |
seq_len (int): sequence length | |
generate_len (int): number of tokens to generate | |
use_kv_cache (bool, optional): whether the key and value cache is used. Defaults to True. | |
Returns: | |
float: the latency in seconds for the forward pass of the key and value cache in a transformer layer | |
""" | |
if not use_kv_cache: | |
return 0 | |
kv_cache_memory_list_per_gpu, kv_cache_latency_list = [], [] | |
for context_len in range(seq_len, seq_len + generate_len + 1): | |
            kv_cache_memory_per_gpu = (
                self.llm_memory.count_memory_kv_cache_per_layer(
                    batch_size,
                    context_len,
                    0,  # context_len already includes the generated tokens, so generate_len is 0 here
                    kv_cache_dtype_bytes
                ) * self.num_layers_per_gpu
            )
kv_cache_latency = ( | |
kv_cache_memory_per_gpu / self.gpu_hbm_bandwidth | |
) | |
kv_cache_memory_list_per_gpu.append(kv_cache_memory_per_gpu) | |
kv_cache_latency_list.append(kv_cache_latency) | |
kv_cache_avg_latency = average(kv_cache_latency_list) | |
kv_cache_peak_latency = max(kv_cache_latency_list) | |
return kv_cache_avg_latency, kv_cache_peak_latency | |
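    # Note: the loop above averages the kv cache read latency over context lengths from
    # seq_len to seq_len + generate_len, so kv_cache_avg_latency approximates the
    # per-token kv cache cost of an "average" decode step, while kv_cache_peak_latency
    # corresponds to the last step with the full seq_len + generate_len context.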
def count_latency_fwd_model( | |
self, | |
batch_size: int, | |
seq_len: int, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP32, | |
breakdown_prefix: str = "", | |
) -> tuple: | |
latency_fwd_per_layer, breakdown_per_layer = self.count_latency_fwd_per_layer( | |
batch_size, | |
seq_len, | |
is_inference, | |
activation_recomputation, | |
layernorm_dtype_bytes, | |
stage=breakdown_prefix | |
) | |
num_layers_per_gpu = self.num_layers_per_gpu | |
latency_fwd_all_layers = latency_fwd_per_layer * self.num_layers_per_gpu | |
latency_fwd_input_embedding = self.count_latency_fwd_input_embedding(batch_size, seq_len) | |
latency_fwd_output_embedding_loss = self.count_latency_fwd_output_embedding_loss(batch_size, seq_len) | |
model_latency = ( | |
latency_fwd_all_layers | |
+ latency_fwd_input_embedding | |
+ latency_fwd_output_embedding_loss | |
) | |
model_latency_breakdown = { | |
breakdown_prefix + "latency_fwd_per_layer": breakdown_per_layer, | |
breakdown_prefix + "latency_fwd_attn": (breakdown_per_layer["latency_attn"] * num_layers_per_gpu), | |
breakdown_prefix + "latency_fwd_mlp": (breakdown_per_layer["latency_mlp"] * num_layers_per_gpu), | |
breakdown_prefix + "latency_fwd_layernorm": (breakdown_per_layer["latency_layernorm"] * num_layers_per_gpu), | |
breakdown_prefix + "latency_fwd_tp_comm": (breakdown_per_layer["latency_tp_comm"] * num_layers_per_gpu), | |
breakdown_prefix + "latency_fwd_input_embedding": (latency_fwd_input_embedding), | |
breakdown_prefix + "latency_fwd_output_embedding_loss": (latency_fwd_output_embedding_loss), | |
} | |
return model_latency, model_latency_breakdown | |
def count_latency_fwd( | |
self, | |
batch_size: int, | |
seq_len: int, | |
generate_len: int, | |
use_kv_cache: bool = True, | |
kv_cache_dtype_bytes: int = BYTES_FP16, | |
is_inference: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = BYTES_FP32, | |
) -> tuple: | |
        # 1. prefill stage
prefill_latency, prefill_latency_breakdown = self.count_latency_fwd_model( | |
batch_size, | |
seq_len, | |
is_inference=is_inference, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
breakdown_prefix="prefill_", | |
) | |
prefill_latency_breakdown.update( | |
{ | |
"prefill_latency": prefill_latency, | |
} | |
) | |
        # 2. decode stage
kv_cache_avg_latency, kv_cache_peak_latency = self.count_latency_kv_cache( | |
batch_size, | |
seq_len, | |
generate_len, | |
use_kv_cache, | |
kv_cache_dtype_bytes | |
) | |
decode_model_latency, decode_latency_breakdown = self.count_latency_fwd_model( | |
batch_size, | |
            1 if use_kv_cache else int((seq_len + generate_len) * (2 / 3)),  # without a kv cache, attention is recomputed over the context; 2/3 of (seq_len + generate_len) is used as the effective average length
is_inference=is_inference, | |
activation_recomputation=activation_recomputation, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
breakdown_prefix="decode_", | |
) | |
decode_avg_latency = decode_model_latency + kv_cache_avg_latency | |
decode_peak_latency = decode_model_latency + kv_cache_peak_latency | |
decode_latency_breakdown.update( | |
{ | |
"kv_cache_avg_latency": (kv_cache_avg_latency), | |
"kv_cache_peak_latency": (kv_cache_peak_latency), | |
"decode_avg_latency": (decode_avg_latency), | |
"decode_peak_latency": (decode_peak_latency) | |
} | |
) | |
return prefill_latency_breakdown, decode_latency_breakdown | |
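# A minimal sketch of how the two breakdowns above are combined downstream (see
# LLMProfiler.infer_profile): the end-to-end request latency is estimated as
#   total_infer_latency ≈ prefill_latency + decode_avg_latency * generate_len,
# i.e. one prefill pass over the prompt plus one (kv cache assisted) decode pass per
# generated token.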
class LLMProfiler(object): | |
"""Measures the latency, memory, number of estimated floating-point operations and parameters of each module in a PyTorch model.""" | |
def __init__(self, llm_configs: LLMConfigs) -> None: | |
self.model_config = llm_configs.model_config | |
self.gpu_config = llm_configs.gpu_config | |
self.inference_config = llm_configs.inference_config | |
self.parallelism_config = llm_configs.parallelism_config | |
self.gpu_efficiency_config = llm_configs.gpu_efficiency_config | |
self.h = self.model_config.hidden_dim | |
self.l = self.model_config.num_layers | |
self.V = self.model_config.vocab_size | |
self.b = llm_configs.inference_config.batch_size_per_gpu | |
self.s = llm_configs.inference_config.seq_len | |
self.o = llm_configs.inference_config.generate_len | |
self.bytes_per_param = llm_configs.inference_config.bytes_per_param | |
self.tp_size = self.parallelism_config.tp_size | |
self.pp_size = self.parallelism_config.pp_size | |
self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size) | |
        self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9  # HBM bandwidth, converted from GB/s to bytes/s
        self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9  # intra-node interconnect bandwidth, converted from GB/s to bytes/s
        self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12  # converted from TFLOPS to FLOPS
        self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9  # memory_GPU_in_GB is given in GB; converted to bytes here
self.llm_params = CountCausalLMParams(self.model_config) | |
self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.s) | |
self.llm_memory = CountCausalLMMemory(llm_configs) | |
self.llm_latency = CountCausalLMLatency(llm_configs) | |
self.inference_results = [] | |
def infer_profile( | |
self, | |
batch_size_per_gpu: int = 1, | |
seq_len: int = 522, | |
generate_len: int = 1526, | |
use_kv_cache: bool = True, | |
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL, | |
layernorm_dtype_bytes: int = 2, | |
kv_cache_dtype_bytes: int = 2, | |
flops_efficiency: float = None, | |
hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY, | |
intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY, | |
inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY, | |
print_flag=True | |
    ) -> float:
        """LLM inference analysis given the llm configs and inputs.
        Args:
            generate_len (int, optional): number of tokens to generate for generative models. Defaults to 1526.
            use_kv_cache (bool, optional): whether to use kv_cache. Defaults to True.
            layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations. Defaults to 2 (FP16).
                Often has to be at least FP16 in inference to maintain model accuracy.
        Returns:
            float: the estimated max_batch_total_tokens for the given configuration; the full per-run summary is appended to self.inference_results
        """
        if self.model_config.max_seq_len is not None:
            assert (
                seq_len + generate_len <= self.model_config.max_seq_len
            ), f"seq_len + generate_len ({seq_len} + {generate_len}) exceeds the model's max_seq_len {self.model_config.max_seq_len}"
if self.l % self.pp_size != 0: | |
            logger.warning(
                "the number of layers is not divisible by pp_size; num_layers_per_gpu is floored."
            )
pp_instance_factor = self.pp_size | |
infer_config_dict = { | |
"inference_config":{ | |
"model_name": self.model_config.model_name, | |
"batch_size_per_gpu": batch_size_per_gpu, | |
"seq_len": seq_len, | |
"tp_size": self.tp_size, | |
"pp_size": self.pp_size, | |
"generate_len": generate_len, | |
"use_kv_cache": use_kv_cache, | |
}, | |
"gpu_config": { | |
"name": self.gpu_config.name, | |
"memory_GPU_in_GB": f"{self.gpu_config.memory_GPU_in_GB} GB", | |
"gpu_hbm_bandwidth": f"{get_gpu_hbm_bandwidth(self.gpu_config)} GB/s", | |
"gpu_intra_node_bandwidth": f"{get_intra_node_bandwidth(self.gpu_config)} GB/s", | |
"gpu_TFLOPS": f"{get_TFLOPS_per_gpu(self.gpu_config)} TFLOPS", | |
} | |
} | |
params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer() | |
num_params_model = self.llm_params.count_params_model() | |
flops_fwd_per_layer, dict_flops_fwd_per_layer = self.llm_flops.count_flops_fwd_per_layer(self.b, self.s) | |
num_flops_fwd_model = self.llm_flops.count_flops_fwd_model(self.b, self.s) | |
memory_prefill_summary_dict, memory_decode_summary_dict = self.llm_memory.count_memory_per_gpu( | |
batch_size_per_gpu, | |
seq_len, | |
generate_len, | |
is_inference=True, | |
use_kv_cache=use_kv_cache, | |
activation_recomputation=activation_recomputation, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
kv_cache_dtype_bytes=kv_cache_dtype_bytes | |
) | |
prefill_latency_breakdown, decode_latency_breakdown = self.llm_latency.count_latency_fwd( | |
batch_size_per_gpu, | |
seq_len, | |
generate_len, | |
use_kv_cache=use_kv_cache, | |
activation_recomputation=activation_recomputation, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
kv_cache_dtype_bytes=kv_cache_dtype_bytes | |
) | |
infer_result_dict = { | |
"model_params": num_params_model, | |
"model_flops": num_flops_fwd_model, | |
"prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"], | |
"decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"], | |
"kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"], | |
"total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len, | |
} | |
gb_factor = 1024 ** 3 | |
inference_result_dict = { | |
"model_params": num_params_model, | |
"prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"], | |
"decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"], | |
"kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"], | |
"total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len, | |
"weight_memory_per_gpu": memory_decode_summary_dict["weight_memory_per_gpu"] / gb_factor, | |
"decode_activation_memory_per_gpu": memory_decode_summary_dict["decode_activation_memory_per_gpu"] / gb_factor, | |
"kv_cache_memory_per_gpu": memory_decode_summary_dict["kv_cache_memory_per_gpu"] / gb_factor, | |
"decode_max_batch_size_per_gpu": memory_decode_summary_dict["decode_max_batch_size_per_gpu"], | |
"max_batch_total_tokens": memory_decode_summary_dict["max_batch_total_tokens"], | |
} | |
pp_specific_dict = { | |
"pp_decode_latency": inference_result_dict["decode_per_token_latency"] / pp_instance_factor, | |
"pp_prefill_latency": inference_result_dict["prefill_first_token_latency"] / pp_instance_factor, | |
"pp_kv_cache_latency": inference_result_dict["kv_cache_latency"] / pp_instance_factor, | |
"pp_e2e_latency": inference_result_dict["total_infer_latency"] / pp_instance_factor, | |
"pp_max_batch_total_tokens": inference_result_dict["decode_per_token_latency"] / pp_instance_factor, | |
"pp_max_batch_size": inference_result_dict["decode_max_batch_size_per_gpu"] / pp_instance_factor, | |
"pp_kv_cache_memory_per_gpu": inference_result_dict["kv_cache_memory_per_gpu"] * pp_instance_factor, | |
} | |
inference_result_dict.update(pp_specific_dict) | |
inference_result_dict.update(infer_config_dict["inference_config"].copy()) | |
inference_result_dict.update(infer_config_dict["gpu_config"].copy()) | |
self.inference_results.append(inference_result_dict) | |
if print_flag: | |
print("\n-------------------------- LLM main infer config --------------------------") | |
pprint.pprint(infer_config_dict, indent=4, sort_dicts=False) | |
print("\n---------------------------- LLM Params analysis ----------------------------") | |
self.print_format_summary_dict(dict_params_per_layer, get_dict_depth(dict_params_per_layer)) | |
pprint.pprint({"params_model": num_to_string(num_params_model)}, indent=4, sort_dicts=False) | |
print("\n---------------------------- LLM Flops analysis -----------------------------") | |
self.print_format_summary_dict(dict_flops_fwd_per_layer, get_dict_depth(dict_flops_fwd_per_layer)) | |
pprint.pprint({"prefill flops_model": num_to_string(num_flops_fwd_model)}, indent=4, sort_dicts=False) | |
print("\n---------------------------- LLM Memory analysis -----------------------------") | |
self.print_format_summary_dict(memory_prefill_summary_dict, get_dict_depth(memory_prefill_summary_dict)) | |
self.print_format_summary_dict(memory_decode_summary_dict, get_dict_depth(memory_decode_summary_dict)) | |
print("\n-------------------------- LLM infer performance analysis --------------------------") | |
self.print_format_summary_dict(infer_result_dict, get_dict_depth(infer_result_dict)) | |
print("\n-------------------------- LLM detailed's latency analysis --------------------------") | |
pprint.pprint([prefill_latency_breakdown, decode_latency_breakdown], indent=4, sort_dicts=False) | |
print("prefill_latency_breakdown depth is ", get_dict_depth(prefill_latency_breakdown), prefill_latency_breakdown) | |
self.print_format_summary_dict(prefill_latency_breakdown, get_dict_depth(prefill_latency_breakdown)) | |
self.print_format_summary_dict(decode_latency_breakdown, get_dict_depth(decode_latency_breakdown)) | |
return memory_decode_summary_dict["max_batch_total_tokens"] | |
def get_inference_results(self): | |
return self.inference_results | |
    def print_format_summary_dict(self, summary_dict: dict, depth: int) -> None:
for key, value in summary_dict.items(): | |
if "params" in key or "flops" in key: | |
if not isinstance(value, dict): | |
summary_dict.update({key: num_to_string(value)}) | |
else: | |
                    self.print_format_summary_dict(value, get_dict_depth(value) - 1)  # recurse into the nested dict
if "latency" in key: | |
if not isinstance(value, dict): | |
summary_dict.update({key: latency_to_string(value)}) | |
else: | |
self.print_format_summary_dict(value, get_dict_depth(value)-1) | |
if "memory" in key: | |
if not isinstance(value, dict): | |
summary_dict.update({key: f"{num_to_string(value)}B"}) | |
else: | |
self.print_format_summary_dict(value, get_dict_depth(value)-1) | |
if depth >= 1: | |
pprint.pprint(summary_dict, indent=4, sort_dicts=False) | |
def llm_profile(model_name="llama2-70b", | |
gpu_name: str = "t4-pcie-15gb", | |
bytes_per_param: int = BYTES_FP16, | |
batch_size_per_gpu: int = 2, | |
seq_len: int = 300, | |
generate_len=40, | |
ds_zero: int = 0, | |
dp_size: int = 1, | |
tp_size: int = 4, | |
pp_size: int = 1, | |
sp_size: int = 1, | |
use_kv_cache: bool = True, | |
layernorm_dtype_bytes: int = BYTES_FP16, | |
kv_cache_dtype_bytes: int = BYTES_FP16, | |
flops_efficiency: float = FLOPS_EFFICIENCY, | |
hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY, | |
intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY, | |
inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY, | |
mode: str = "inference", | |
print_flag: bool = True, | |
) -> dict: | |
"""Returns dict of the total floating-point operations, MACs, parameters and latency of a llm. | |
Args: | |
model_name (str, optional): model name to query the pre-defined `model_configs.json`. Defaults to "llama-13b". | |
gpu_name (str, optional): gpu name to query the pre-defined `model_configs.json`. Defaults to "v100-sxm2-32gb". | |
batch_size_per_gpu (int, optional): _description_. Defaults to 1. | |
seq_len (int, optional): batch size per GPU.. Defaults to 522. | |
generate_len (int, optional): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. Defaults to 1526. | |
ds_zero (int, optional): which DeepSpeed ZeRO stage to use.. Defaults to 0. | |
dp_size (int, optional): data parallelism size. Defaults to 1. | |
tp_size (int, optional): tensor parallelism size. Defaults to 1. | |
pp_size (int, optional): pipeline parallelism size. Defaults to 1. | |
sp_size (int, optional): sequence parallelism size. Defaults to 1. | |
use_kv_cache (bool, optional): Whether or not the model should use the past last key/values attentions (if applicable to the model) to | |
speed up decoding. Defaults to True. | |
layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations.. Defaults to BYTES_FP16. | |
kv_cache_dtype_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to None. | |
flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to None. | |
hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. Defaults to HBM_MEMORY_EFFICIENCY. | |
intra_node_memory_efficiency (_type_, optional): intra-node memory efficiency, ranging from 0 to 1.. Defaults to INTRA_NODE_MEMORY_EFFICIENCY. | |
inter_node_memory_efficiency (_type_, optional): inter-node memory efficiency, ranging from 0 to 1.. Defaults to INTER_NODE_MEMORY_EFFICIENCY. | |
mode (str, optional): model training or inference. Defaults to "inference". | |
Returns: | |
dict: a summary dictionary of the inference analysis | |
""" | |
model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name) | |
parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size, | |
dp_size=dp_size, sp_size=sp_size | |
) | |
inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len, | |
generate_len=generate_len, use_kv_cache=use_kv_cache, | |
bytes_per_param=bytes_per_param, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
kv_cache_dtype_bytes=kv_cache_dtype_bytes | |
) | |
gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency, | |
hbm_memory_efficiency=hbm_memory_efficiency, | |
intra_node_memory_efficiency=intra_node_memory_efficiency, | |
inter_node_memory_efficiency=inter_node_memory_efficiency | |
) | |
llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config, | |
parallelism_config=parallelism_config, inference_config=inference_config, | |
gpu_efficiency_config=gpu_efficiency_config | |
) | |
profiler = LLMProfiler(llm_configs) | |
max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len, | |
generate_len=generate_len, use_kv_cache=use_kv_cache, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
flops_efficiency=flops_efficiency, | |
hbm_memory_efficiency=hbm_memory_efficiency, | |
print_flag=print_flag) | |
return max_batch_total_tokens | |
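# Example usage (a minimal sketch; the model and GPU names must exist in the bundled
# config files resolved by get_model_and_gpu_config_by_name for this to run):
#
#   max_tokens = llm_profile(model_name="llama2-70b", gpu_name="a100-sxm-80gb",
#                            batch_size_per_gpu=16, seq_len=1024, generate_len=1024,
#                            tp_size=8, pp_size=1, print_flag=True)
#   print(f"max_batch_total_tokens: {max_tokens}")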
def llm_profile_infer(model_name="llama2-70b", | |
gpu_name: str = "t4-pcie-15gb", | |
bytes_per_param: int = BYTES_FP16, | |
batch_size_per_gpu: int = 2, | |
seq_len: int = 300, | |
generate_len=40, | |
ds_zero: int = 0, | |
dp_size: int = 1, | |
tp_size: int = 4, | |
pp_size: int = 1, | |
sp_size: int = 1, | |
use_kv_cache: bool = True, | |
layernorm_dtype_bytes: int = BYTES_FP16, | |
kv_cache_dtype_bytes: int = BYTES_FP16, | |
flops_efficiency: float = FLOPS_EFFICIENCY, | |
hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY, | |
intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY, | |
inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY, | |
mode: str = "inference", | |
print_flag: bool = True, | |
) -> list: | |
model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name) | |
parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size, | |
dp_size=dp_size, sp_size=sp_size | |
) | |
inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len, | |
generate_len=generate_len, use_kv_cache=use_kv_cache, | |
bytes_per_param=bytes_per_param, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
kv_cache_dtype_bytes=kv_cache_dtype_bytes | |
) | |
gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency, | |
hbm_memory_efficiency=hbm_memory_efficiency, | |
intra_node_memory_efficiency=intra_node_memory_efficiency, | |
inter_node_memory_efficiency=inter_node_memory_efficiency | |
) | |
llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config, | |
parallelism_config=parallelism_config, inference_config=inference_config, | |
gpu_efficiency_config=gpu_efficiency_config | |
) | |
profiler = LLMProfiler(llm_configs) | |
max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len, | |
generate_len=generate_len, use_kv_cache=use_kv_cache, | |
layernorm_dtype_bytes=layernorm_dtype_bytes, | |
flops_efficiency=flops_efficiency, | |
hbm_memory_efficiency=hbm_memory_efficiency, | |
print_flag=print_flag) | |
return max_batch_total_tokens, profiler.get_inference_results() | |
def to_csv(inference_results: list, name: str = "infer_results"): | |
df = pd.DataFrame(inference_results) | |
csv_path = name + ".csv" | |
pprint.pprint(f"Saving inference results to: {csv_path}") | |
df.to_csv(csv_path, index=False) | |
def profile_pp(): | |
# model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"] | |
model_name_list = ["llama2-70b"] | |
# gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"] | |
gpu_name_list = ["a100-sxm-80gb"] | |
batch_size_per_gpu = 32 | |
tp_pp_nums = [ | |
[1, 1], # tp | |
[2, 1], | |
[4, 1], | |
[8, 1], | |
# tp / pp | |
[2, 4], | |
[4, 2], | |
# pp | |
[1, 2], | |
[1, 4], | |
[1, 8], | |
] | |
tgi_service_dict_list = [] | |
seq_len, generate_len = 1024, 1024 | |
inference_results = [] | |
for model_name in model_name_list: | |
if model_name in ["llama2-70b", "internlm-20b"]: | |
seq_len, generate_len = 1024, 1024 | |
for gpu_name in gpu_name_list: | |
for tp_size, pp_size in tp_pp_nums: | |
try: | |
max_batch_total_tokens, infer_result = llm_profile_infer( | |
model_name=model_name, | |
gpu_name=gpu_name, | |
batch_size_per_gpu=batch_size_per_gpu, | |
tp_size=tp_size, | |
pp_size=pp_size, | |
seq_len=seq_len, | |
generate_len=generate_len, | |
print_flag=False, | |
) | |
inference_results += infer_result | |
except Exception as e: | |
print( | |
f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}" | |
) | |
continue | |
tgi_service_dict = { | |
"model_name": model_name, | |
"gpu_name": gpu_name, | |
"pp_size": pp_size, | |
"tp_size": tp_size, | |
"max_batch_total_tokens": max_batch_total_tokens, | |
"max_batch_size": floor( | |
max_batch_total_tokens / (seq_len + generate_len) | |
), | |
} | |
tgi_service_dict_list.append(tgi_service_dict) | |
print( | |
"================================== TGI+LightLLM service max_batch_total_tokens params list =============================" | |
) | |
print_list(tgi_service_dict_list) | |
to_csv(inference_results, f"bs{batch_size_per_gpu}_in{seq_len}_out{generate_len}_centralize_allreduce") | |
def demo(): | |
# llm_profile(print_flag=True) | |
# model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"] | |
model_name_list = ["llama2-70b"] | |
# gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"] | |
gpu_name_list = ["a100-sxm-80gb", "910b-64gb"] | |
batch_size_per_gpu = 32 | |
tp_nums_list = [8] | |
pp_nums_list = [1] | |
tp_pp_nums = [ | |
[8, 1], | |
[1, 8], | |
[4, 2] | |
] | |
tgi_service_dict_list = [] | |
seq_len, generate_len = 1024, 1024 | |
for model_name in model_name_list: | |
if model_name in ["llama2-70b", "internlm-20b"]: | |
seq_len, generate_len = 1024, 1024 | |
# pp_size = 0 | |
# tp_size = 0 | |
for gpu_name in gpu_name_list: | |
# for tp_size in tp_nums_list: | |
# for pp_size in pp_nums_list: | |
for (tp_size, pp_size) in tp_pp_nums: | |
try: | |
max_batch_total_tokens = int(llm_profile(model_name=model_name, gpu_name=gpu_name, batch_size_per_gpu=batch_size_per_gpu, tp_size=tp_size, pp_size=pp_size, | |
seq_len=seq_len, generate_len=generate_len, print_flag=True)) | |
except Exception as e: | |
print(f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}") | |
continue | |
tgi_service_dict = {"model_name": model_name, "gpu_name": gpu_name, "pp_size": pp_size, "tp_size": tp_size, "max_batch_total_tokens": max_batch_total_tokens, "max_batch_size": floor(max_batch_total_tokens / (seq_len + generate_len))} | |
tgi_service_dict_list.append(tgi_service_dict) | |
print("================================== TGI+LightLLM service max_batch_total_tokens params list =============================") | |
print_list(tgi_service_dict_list) | |
if __name__ == "__main__": | |
profile_pp() |