zenghaolun02 committed · Commit 0c4803b · Parent(s): e23ddae

add demo
- __init__.py +13 -0
- app.py +175 -0
- config.py +213 -0
- configs/gpu_configs.json +163 -0
- configs/gpu_perf.ini +25 -0
- configs/model_configs.json +204 -0
- constants.py +28 -0
- interface.py +175 -0
- llm_profiler.py +1274 -0
- utils.py +82 -0
__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright 2023 Cheng Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
app.py
ADDED
@@ -0,0 +1,175 @@
import gradio as gr
import io
import logging

from llm_profiler import *
import sys
from contextlib import redirect_stdout

# List of supported models
model_names = [
    "opt-1.3b",
    "opt-6.7b",
    "opt-13b",
    "opt-66b",
    "opt-175b",
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "bloom-560m",
    "bloom-7b",
    "bloom-175b",
    "llama-7b",
    "llama-13b",
    "llama-30b",
    "llama-65b",
    "llama2-13b",
    "llama2-70b",
    "internlm-20b",
    "baichuan2-13b",
]
# List of supported GPUs
gpu_names = [
    "t4-pcie-15gb",
    "v100-pcie-32gb",
    "v100-sxm-32gb",
    "br104p",
    "a100-pcie-40gb",
    "a100-sxm-40gb",
    "a100-pcie-80gb",
    "a100-sxm-80gb",
    "910b-64gb",
    "h100-sxm-80gb",
    "h100-pcie-80gb",
    "a30-pcie-24gb",
    "a30-sxm-24gb",
    "a40-pcie-48gb",
]


# A log handler that writes log messages into a StringIO object
class StringHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.stream = io.StringIO()
        self.setFormatter(logging.Formatter("%(message)s"))

    def emit(self, record):
        self.stream.write(self.format(record) + "\n")

    def get_value(self):
        return self.stream.getvalue()


# Create a logger and attach the StringHandler
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
string_handler = StringHandler()
logger.addHandler(string_handler)


def gradio_interface(
    model_name="llama2-70b",
    gpu_name: str = "t4-pcie-15gb",
    bytes_per_param: int = BYTES_FP16,
    batch_size_per_gpu: int = 2,
    seq_len: int = 300,
    generate_len: int = 40,
    ds_zero: int = 0,
    dp_size: int = 1,
    tp_size: int = 4,
    pp_size: int = 1,
    sp_size: int = 1,
    use_kv_cache: bool = True,
    layernorm_dtype_bytes: int = BYTES_FP16,
    kv_cache_dtype_bytes: int = BYTES_FP16,
    flops_efficiency: float = FLOPS_EFFICIENCY,
    hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
    intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
    inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
    mode: str = "inference",
    print_flag: bool = True,
) -> tuple:
    # Reset the StringIO buffer
    string_handler.stream.seek(0)
    string_handler.stream.truncate()

    # Redirect sys.stdout into the StringHandler's buffer
    original_stdout = sys.stdout
    sys.stdout = string_handler.stream

    # Call the inference profiling function
    results = llm_profile_infer(
        model_name,
        gpu_name,
        bytes_per_param,
        batch_size_per_gpu,
        seq_len,
        generate_len,
        ds_zero,
        dp_size,
        tp_size,
        pp_size,
        sp_size,
        use_kv_cache,
        layernorm_dtype_bytes,
        kv_cache_dtype_bytes,
        flops_efficiency,
        hbm_memory_efficiency,
        intra_node_memory_efficiency,
        inter_node_memory_efficiency,
        mode,
        print_flag,
    )

    # Restore sys.stdout
    sys.stdout = original_stdout

    # Fetch the captured log messages
    log_output = string_handler.get_value()

    # Return the profiling results and the log output
    return results, log_output


# Build the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
        gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
        gr.Number(label="Bytes per Param", value=BYTES_FP16),
        gr.Number(label="Batch Size per GPU", value=2),
        gr.Number(label="Sequence Length", value=300),
        gr.Number(label="Generate Length", value=40),
        gr.Number(label="DS Zero", value=0),
        gr.Number(label="DP Size", value=1),
        gr.Number(label="TP Size", value=4),
        gr.Number(label="PP Size", value=1),
        gr.Number(label="SP Size", value=1),
        gr.Checkbox(label="Use KV Cache", value=True),
        gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
        gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
        gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
        gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
        gr.Number(
            label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
        ),
        gr.Number(
            label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
        ),
        gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
        gr.Checkbox(label="Print Flag", value=True),
    ],
    outputs=[
        gr.Textbox(label="Inference Results"),  # profiling results, labeled
        gr.Textbox(label="Detailed Analysis"),  # captured log output, labeled
    ],
    title="LLM Profiler",
    description="Input parameters to profile your LLM.",
)

# Launch the Gradio app
iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)
# iface.launch()
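
For a quick sanity check without launching the web UI, the wrapper above can also be called directly. A minimal sketch (the tp_size value below is only an illustrative choice; llm_profile_infer is the same star-imported entry point used above):

# Sketch: exercise the profiler without the Gradio front end.
results, log_output = gradio_interface(
    model_name="llama2-70b",
    gpu_name="a100-sxm-80gb",
    tp_size=8,  # illustrative: ~140 GB of fp16 weights needs several 80 GB GPUs
)
print(results)     # summary returned by llm_profile_infer
print(log_output)  # stdout captured via the StringHandler buffer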
config.py
ADDED
@@ -0,0 +1,213 @@
# -*- coding : utf-8 -*-
# Description : GPU, model, parallelism, data, train and inference config definitions

import math, json
from constants import *
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering


class ActivationRecomputation(Enum):
    NONE = 0
    """No activation recomputation; requires the most amount of memory."""

    SELECTIVE = 1
    """Selectively checkpoints and recomputes only parts of each transformer
    layer that take up a considerable amount of memory but are not
    computationally expensive to recompute, i.e. Q K V matrix multiplies,
    QK^T matrix multiply, softmax, softmax dropout, and attention over V."""

    FULL = 2
    """Full activation recomputation stores the input to EVERY transformer
    layer, which is sharded across the tensor parallel group, thus requiring an
    extra all-gather (ignored for now) per layer and adding communication
    overhead; requires the least amount of memory; requires an extra forward
    pass."""

@total_ordering
class DSZeRO(Enum):
    NONE = 0
    """No DeepSpeed ZeRO; requires the most amount of memory."""

    STAGE_1 = 1
    """ZeRO stage 1 shards the optimizer states across the data parallel
    group."""

    STAGE_2 = 2
    """ZeRO stage 2 shards the optimizer states and gradients across the data
    parallel group."""

    STAGE_3 = 3
    """ZeRO stage 3 shards the optimizer states, gradients, and model weights
    across the data parallel group."""

    def __lt__(self, other):
        # a slightly show-off implementation
        if other.__class__ is self.__class__:
            return self.value < other.value  # compare the auto-assigned Enum values
        return NotImplemented

    def __eq__(self, other):
        if isinstance(other, DSZeRO):
            return self.value == other.value
        return NotImplemented

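# Illustration (added note): @total_ordering derives <=, > and >= from the
# __lt__/__eq__ pair above, so DSZeRO.STAGE_1 < DSZeRO.STAGE_3 and
# DSZeRO.STAGE_2 >= DSZeRO.STAGE_1 both hold even though only __lt__ and
# __eq__ are defined explicitly.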
@dataclass
class GPUEfficiencyConfig:
    flops_efficiency: float = 1.0
    hbm_memory_efficiency: float = 1.0
    intra_node_memory_efficiency: float = 1.0
    inter_node_memory_efficiency: float = 1.0

@dataclass
class InferenceConfig:
    """Inference configuration dataclass."""
    batch_size_per_gpu: int = None  # batch size
    seq_len: int = 522  # input sequence length
    generate_len: int = 1526  # number of tokens to generate
    context_len: int = None  # context length
    use_kv_cache: bool = True  # whether to use key/value cache
    bytes_per_param: int = BYTES_FP16  # model weight bytes
    layernorm_dtype_bytes: int = BYTES_FP16  # layernorm data type bytes
    kv_cache_dtype_bytes: int = BYTES_FP16  # key/value cache data type bytes
    def __post_init__(self):
        if self.context_len is None:
            self.context_len = self.seq_len + self.generate_len

@dataclass
class ParallelismConfig:
    """The dataclass module provides a decorator and functions for automatically
    adding generated special methods such as __init__() and __repr__() to
    user-defined classes.
    """
    tp_size: int = 1  # tensor parallelism size, Megatron-LM tensor parallelism implementation
    pp_size: int = 1  # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
    dp_size: int = 1  # data parallelism size, DeepSpeed ZeRO parallelism implementation
    sp_size: int = 1  # sequence parallelism size, Megatron-LM sequence parallelism implementation

@dataclass
class ModelConfig:
    num_layers: int  # number of transformer layers (blocks)
    n_head: int  # number of attention heads
    hidden_dim: int  # hidden dimension
    vocab_size: int  # vocabulary size
    num_key_value_heads: int = None
    max_seq_len: int = None  # max sequence length
    ffn_embed_dim: int = None  # hidden dimension of FFN, defaults to 4 * hidden_dim
    model_type: str = None  # model type as tagged on Hugging Face (e.g., gpt2, opt, llama)
    model_name: str = None  # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b)

    def __post_init__(self):
        if self.num_key_value_heads is None:  # default to n_head when not provided
            self.num_key_value_heads = self.n_head

        if self.ffn_embed_dim is None:
            self.ffn_embed_dim = self.hidden_dim * 4

@dataclass
class GPUConfig:
    # 1. GPU model and HBM capacity
    name: str  # GPU config name
    memory_GPU_in_GB: float  # memory per GPU in GB

    # 2. HBM bandwidth, intra-node bandwidth, inter-node bandwidth
    hbm_bandwidth_in_GB_per_sec: float  # GPU HBM bandwidth in GB/s
    intra_node_bandwidth_in_GB_per_sec: float  # intra-node GPU bandwidth in GB/s (PCIe/NVLink)
    intra_node_min_message_latency: float  # minimum intra-node message latency in seconds

    inter_node_bandwidth_in_GB_per_sec: float = 200  # inter-node bandwidth in GB/s, assuming Mellanox 200Gbps HDR InfiniBand

    # 3. Peak Tensor Core throughput at different precisions
    peak_fp32_TFLOPS: float = None  # peak Tensor TFLOPS for FP32
    peak_fp16_TFLOPS: float = None  # peak Tensor TFLOPS for FP16
    peak_int8_TFLOPS: float = None  # peak Tensor TFLOPS for INT8
    peak_int4_TFLOPS: float = None  # peak Tensor TFLOPS for INT4

    FLOPS_EFFICIENCY = 0.7
    HBM_MEMORY_EFFICIENCY = 0.9

    def __post_init__(self):
        """Object creation of a dataclass starts with __init__() (the constructor)
        and ends with __post_init__() (post-init processing).
        """
        if self.peak_fp32_TFLOPS is None:
            self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2)
        if self.peak_int8_TFLOPS is None:
            self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
        if self.peak_int4_TFLOPS is None:
            self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS

        if self.FLOPS_EFFICIENCY:
            self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY
        if self.HBM_MEMORY_EFFICIENCY:
            self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
            self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
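# Illustration (added note): for an entry that only sets peak_fp16_TFLOPS = 312,
# __post_init__ fills in peak_fp32_TFLOPS = ceil(312 / 2) = 156,
# peak_int8_TFLOPS = 624 and peak_int4_TFLOPS = 1248, then scales all four by
# FLOPS_EFFICIENCY (0.7) and both bandwidths by HBM_MEMORY_EFFICIENCY (0.9).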
class LLMConfigs(object):
    def __init__(self, gpu_config: GPUConfig,
                 model_config: ModelConfig,
                 parallelism_config: ParallelismConfig = ParallelismConfig(),
                 inference_config: InferenceConfig = InferenceConfig(),
                 gpu_efficiency_config: GPUEfficiencyConfig = GPUEfficiencyConfig()
                 ) -> None:
        self.model_config = model_config
        self.gpu_config = gpu_config
        self.parallelism_config = parallelism_config
        self.inference_config = inference_config  # user-specified configuration
        self.gpu_efficiency_config = gpu_efficiency_config  # user-specified configuration

def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> tuple:
    """Read model and gpu configs from a json file."""
    config_files = ["configs/model_configs.json", "configs/gpu_configs.json"]
    model_config, gpu_config = {}, {}

    for config_filename in config_files:
        with open(config_filename, "r") as f:
            config_json = json.load(f)

        if "model" in config_filename:
            assert model_name in config_json, f"model name {model_name} not found in {config_filename}"
            config_dict = config_json[model_name]
            model_config = ModelConfig(**config_dict)

        elif "gpu" in config_filename:
            assert gpu_name in config_json, f"gpu name {gpu_name} not found in {config_filename}"
            config_dict = config_json[gpu_name]
            gpu_config = GPUConfig(**config_dict)
        else:
            assert False, f"unknown config file when reading: {config_filename}"

    return model_config, gpu_config

def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float:
    """Get the expected TFLOPS per GPU for the specified data type
    configuration/GPU (adjusted by flops_efficiency).

    Returns:
        float: TFLOPS per GPU, in units of TFLOPS.
    """
    if data_type == "int8":
        gemm_TFLOPS = gpu_config.peak_int8_TFLOPS
    elif data_type == "fp16":
        gemm_TFLOPS = gpu_config.peak_fp16_TFLOPS
    else:
        raise ValueError("data_type must be 'int8' or 'fp16'!")

    return gemm_TFLOPS * flops_efficiency

def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float:
    return (
        gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency
    )

def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float:
    return (
        gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency
    )

def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float:
    return (
        gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency
    )
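
The loaders above resolve names against the JSON files under configs/; a minimal usage sketch (the printed shape values come straight from the shipped model config):

from config import get_model_and_gpu_config_by_name, get_TFLOPS_per_gpu

model_config, gpu_config = get_model_and_gpu_config_by_name(
    "llama2-70b", "a100-sxm-80gb"
)
print(model_config.num_layers, model_config.hidden_dim)  # 80 8192
print(get_TFLOPS_per_gpu(gpu_config, data_type="fp16"))  # peak fp16 TFLOPS after efficiency scaling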
configs/gpu_configs.json
ADDED
@@ -0,0 +1,163 @@
{
    "t4-pcie-15gb": {
        "name": "t4-pcie-15gb",
        "memory_GPU_in_GB": 15,
        "hbm_bandwidth_in_GB_per_sec": 300,
        "intra_node_bandwidth_in_GB_per_sec": 32,
        "peak_fp16_TFLOPS": 65,
        "peak_int8_TFLOPS": 130,
        "peak_int4_TFLOPS": 260,
        "intra_node_min_message_latency": 8e-06
    },
    "v100-pcie-32gb": {
        "name": "v100-pcie-32gb",
        "memory_GPU_in_GB": 32,
        "hbm_bandwidth_in_GB_per_sec": 900,
        "intra_node_bandwidth_in_GB_per_sec": 32,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp16_TFLOPS": 112,
        "peak_int8_TFLOPS": 224,
        "peak_int4_TFLOPS": 448,
        "intra_node_min_message_latency": 8e-06
    },
    "v100-sxm-32gb": {
        "name": "v100-sxm-32gb",
        "memory_GPU_in_GB": 32,
        "hbm_bandwidth_in_GB_per_sec": 900,
        "intra_node_bandwidth_in_GB_per_sec": 300,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp16_TFLOPS": 112,
        "peak_int8_TFLOPS": 224,
        "peak_int4_TFLOPS": 448,
        "intra_node_min_message_latency": 8e-06
    },
    "br104p": {
        "name": "br104p",
        "memory_GPU_in_GB": 32,
        "hbm_bandwidth_in_GB_per_sec": 819,
        "intra_node_bandwidth_in_GB_per_sec": 192,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 256,
        "peak_fp16_TFLOPS": 512,
        "peak_int8_TFLOPS": 1024,
        "intra_node_min_message_latency": 8e-06
    },
    "a100-pcie-40gb": {
        "name": "a100-pcie-40gb",
        "memory_GPU_in_GB": 40,
        "hbm_bandwidth_in_GB_per_sec": 1555,
        "intra_node_bandwidth_in_GB_per_sec": 64,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 156,
        "peak_fp16_TFLOPS": 312,
        "peak_int8_TFLOPS": 624,
        "peak_int4_TFLOPS": 1248,
        "intra_node_min_message_latency": 8e-06
    },
    "a100-sxm-40gb": {
        "name": "a100-sxm-40gb",
        "memory_GPU_in_GB": 40,
        "hbm_bandwidth_in_GB_per_sec": 1555,
        "intra_node_bandwidth_in_GB_per_sec": 600,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 156,
        "peak_fp16_TFLOPS": 312,
        "peak_int8_TFLOPS": 624,
        "peak_int4_TFLOPS": 1248,
        "intra_node_min_message_latency": 8e-06
    },
    "a100-pcie-80gb": {
        "name": "a100-pcie-80gb",
        "memory_GPU_in_GB": 80,
        "hbm_bandwidth_in_GB_per_sec": 1935,
        "intra_node_bandwidth_in_GB_per_sec": 64,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 156,
        "peak_fp16_TFLOPS": 312,
        "peak_int8_TFLOPS": 624,
        "peak_int4_TFLOPS": 1248,
        "intra_node_min_message_latency": 8e-06
    },
    "a100-sxm-80gb": {
        "name": "a100-sxm-80gb",
        "memory_GPU_in_GB": 80,
        "hbm_bandwidth_in_GB_per_sec": 2039,
        "intra_node_bandwidth_in_GB_per_sec": 600,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 156,
        "peak_fp16_TFLOPS": 312,
        "peak_int8_TFLOPS": 624,
        "peak_int4_TFLOPS": 1248,
        "intra_node_min_message_latency": 8e-06
    },
    "910b-64gb": {
        "name": "910b-64gb",
        "memory_GPU_in_GB": 64,
        "hbm_bandwidth_in_GB_per_sec": 460,
        "intra_node_bandwidth_in_GB_per_sec": 392,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 188,
        "peak_fp16_TFLOPS": 376,
        "peak_int8_TFLOPS": 752,
        "peak_int4_TFLOPS": 1504,
        "intra_node_min_message_latency": 9e-06
    },
    "h100-sxm-80gb": {
        "name": "h100-sxm-80gb",
        "memory_GPU_in_GB": 80,
        "hbm_bandwidth_in_GB_per_sec": 3430,
        "intra_node_bandwidth_in_GB_per_sec": 900,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 989,
        "peak_fp16_TFLOPS": 1979,
        "peak_int8_TFLOPS": 3958,
        "intra_node_min_message_latency": 8e-06
    },
    "h100-pcie-80gb": {
        "name": "h100-pcie-80gb",
        "memory_GPU_in_GB": 80,
        "hbm_bandwidth_in_GB_per_sec": 2048,
        "intra_node_bandwidth_in_GB_per_sec": 128,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 756,
        "peak_fp16_TFLOPS": 1513,
        "peak_int8_TFLOPS": 3026,
        "intra_node_min_message_latency": 8e-06
    },
    "a30-pcie-24gb": {
        "name": "a30-pcie-24gb",
        "memory_GPU_in_GB": 24,
        "hbm_bandwidth_in_GB_per_sec": 933,
        "intra_node_bandwidth_in_GB_per_sec": 64,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 82,
        "peak_fp16_TFLOPS": 165,
        "peak_int8_TFLOPS": 330,
        "peak_int4_TFLOPS": 661,
        "intra_node_min_message_latency": 8e-06
    },
    "a30-sxm-24gb": {
        "name": "a30-sxm-24gb",
        "memory_GPU_in_GB": 24,
        "hbm_bandwidth_in_GB_per_sec": 933,
        "intra_node_bandwidth_in_GB_per_sec": 200,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 82,
        "peak_fp16_TFLOPS": 165,
        "peak_int8_TFLOPS": 330,
        "peak_int4_TFLOPS": 661,
        "intra_node_min_message_latency": 8e-06
    },
    "a40-pcie-48gb": {
        "name": "a40-pcie-48gb",
        "memory_GPU_in_GB": 44.98,
        "hbm_bandwidth_in_GB_per_sec": 696,
        "intra_node_bandwidth_in_GB_per_sec": 64,
        "inter_node_bandwidth_in_GB_per_sec": 200,
        "peak_fp32_TFLOPS": 74.8,
        "peak_fp16_TFLOPS": 149.7,
        "peak_int8_TFLOPS": 299.3,
        "peak_int4_TFLOPS": 598.7,
        "intra_node_min_message_latency": 8e-06
    }
}
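
Each entry's keys mirror the GPUConfig fields, so an entry can be splatted straight into the dataclass; a minimal sketch:

import json
from config import GPUConfig

with open("configs/gpu_configs.json") as f:
    gpus = json.load(f)

a100 = GPUConfig(**gpus["a100-sxm-80gb"])
# __post_init__ derives any missing peak_*_TFLOPS fields and applies the
# in-class FLOPS/HBM efficiency factors to the peaks and bandwidths.
print(a100.peak_fp16_TFLOPS, a100.hbm_bandwidth_in_GB_per_sec)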
configs/gpu_perf.ini
ADDED
@@ -0,0 +1,25 @@
[T4]
gpu_memory=16GB
single_precision=8.1TFLOPS
gpu_memory_bandwidth=300GB/s
interconnect_bandwidth=32GB/s
[L4]
gpu_memory=30GB
single_precision=24TFLOPS
gpu_memory_bandwidth=300GB/s
interconnect_bandwidth=64GB/s
[L40]
gpu_memory=48GB
single_precision=90.5TFLOPS
gpu_memory_bandwidth=864GB/s
interconnect_bandwidth=64GB/s
[V100]
gpu_memory=36GB
single_precision=14TFLOPS
gpu_memory_bandwidth=900GB/s
interconnect_bandwidth=32GB/s
[A100]
gpu_memory=80GB
single_precision=19.5TFLOPS
gpu_memory_bandwidth=1935GB/s
interconnect_bandwidth=64GB/s
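
This .ini file does not appear to be read by the code shown in this commit; if consumed, it would typically go through configparser, with the unit suffixes left for the caller to parse. A minimal sketch:

import configparser

perf = configparser.ConfigParser()
perf.read("configs/gpu_perf.ini")
print(perf["A100"]["gpu_memory_bandwidth"])  # "1935GB/s" (string, unit included)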
configs/model_configs.json
ADDED
@@ -0,0 +1,204 @@
{
    "opt-1.3b":{
        "num_layers": 24,
        "n_head": 32,
        "hidden_dim": 2048,
        "vocab_size": 50272,
        "max_seq_len": 2048,
        "ffn_embed_dim": 8192,
        "model_type": "opt",
        "model_name": "opt-1.3b"
    },
    "opt-6.7b":{
        "num_layers": 32,
        "n_head": 32,
        "hidden_dim": 4096,
        "vocab_size": 50272,
        "max_seq_len": 2048,
        "ffn_embed_dim": 16384,
        "model_type": "opt",
        "model_name": "opt-6.7b"
    },
    "opt-13b":{
        "num_layers": 40,
        "n_head": 40,
        "hidden_dim": 5120,
        "vocab_size": 50272,
        "max_seq_len": 2048,
        "ffn_embed_dim": 20480,
        "model_type": "opt",
        "model_name": "opt-13b"
    },
    "opt-66b":{
        "num_layers": 64,
        "n_head": 72,
        "hidden_dim": 9216,
        "vocab_size": 50272,
        "max_seq_len": 2048,
        "ffn_embed_dim": 36864,
        "model_type": "opt",
        "model_name": "opt-66b"
    },
    "opt-175b":{
        "max_seq_len": 2048,
        "num_layers": 96,
        "n_head": 96,
        "hidden_dim": 12288,
        "vocab_size": 50272,
        "ffn_embed_dim": 49152,
        "model_type": "opt",
        "model_name": "opt-175b"
    },
    "gpt2":{
        "num_layers": 12,
        "n_head": 12,
        "hidden_dim": 768,
        "vocab_size": 50257,
        "max_seq_len": 1024,
        "ffn_embed_dim": 3072,
        "model_type": "gpt2",
        "model_name": "gpt2"
    },
    "gpt2-medium":{
        "num_layers": 24,
        "n_head": 16,
        "hidden_dim": 1024,
        "vocab_size": 50257,
        "max_seq_len": 1024,
        "ffn_embed_dim": 4096,
        "model_type": "gpt2",
        "model_name": "gpt2-medium"
    },
    "gpt2-large":{
        "num_layers": 36,
        "n_head": 20,
        "hidden_dim": 1280,
        "vocab_size": 50257,
        "max_seq_len": 1024,
        "ffn_embed_dim": 5120,
        "model_type": "gpt2",
        "model_name": "gpt2-large"
    },
    "gpt2-xl":{
        "num_layers": 48,
        "n_head": 25,
        "hidden_dim": 1600,
        "vocab_size": 50257,
        "max_seq_len": 1024,
        "ffn_embed_dim": 6400,
        "model_type": "gpt2",
        "model_name": "gpt2-xl"
    },
    "bloom-560m":{
        "num_layers": 24,
        "n_head": 16,
        "hidden_dim": 1024,
        "vocab_size": 250880,
        "max_seq_len": null,
        "ffn_embed_dim": 4096,
        "model_type": "bloom",
        "model_name": "bloom-560m"
    },
    "bloom-7b":{
        "num_layers": 30,
        "n_head": 32,
        "hidden_dim": 4096,
        "vocab_size": 250880,
        "max_seq_len": null,
        "ffn_embed_dim": 16384,
        "model_type": "bloom",
        "model_name": "bloom-7b"
    },
    "bloom-175b":{
        "num_layers": 96,
        "n_head": 96,
        "hidden_dim": 12288,
        "vocab_size": 250880,
        "ffn_embed_dim": 49152,
        "model_type": "bloom",
        "model_name": "bloom-175b"
    },
    "llama-7b":{
        "num_layers": 32,
        "n_head": 32,
        "hidden_dim": 4096,
        "vocab_size": 32000,
        "max_seq_len": 2048,
        "ffn_embed_dim": 16384,
        "model_type": "llama"
    },
    "llama-13b":{
        "num_layers": 40,
        "n_head": 40,
        "hidden_dim": 5120,
        "vocab_size": 32000,
        "max_seq_len": 2048,
        "ffn_embed_dim": 20480,
        "model_type": "llama",
        "model_name": "llama-13b"
    },
    "llama-30b":{
        "num_layers": 60,
        "n_head": 52,
        "hidden_dim": 6656,
        "vocab_size": 32000,
        "max_seq_len": 2048,
        "ffn_embed_dim": 26624,
        "model_type": "llama",
        "model_name": "llama-30b"
    },
    "llama-65b":{
        "num_layers": 80,
        "n_head": 64,
        "hidden_dim": 8192,
        "vocab_size": 32000,
        "max_seq_len": 2048,
        "ffn_embed_dim": 32768,
        "model_type": "llama",
        "model_name": "llama-65b"
    },
    "llama2-13b":{
        "num_layers": 40,
        "n_head": 40,
        "num_key_value_heads": 40,
        "hidden_dim": 5120,
        "ffn_embed_dim": 20480,
        "vocab_size": 32000,
        "max_seq_len": 4096,
        "model_type": "llama",
        "model_name": "llama2-13b"
    },
    "llama2-70b":{
        "num_layers": 80,
        "n_head": 64,
        "num_key_value_heads": 8,
        "hidden_dim": 8192,
        "ffn_embed_dim": 32768,
        "vocab_size": 49960,
        "max_seq_len": 4096,
        "model_type": "llama2",
        "model_name": "llama2-70b"
    },
    "baichuan2-13b": {
        "num_layers": 40,
        "n_head": 40,
        "num_key_value_heads": 40,
        "hidden_dim": 5120,
        "ffn_embed_dim": 13696,
        "vocab_size": 125696,
        "max_seq_len": 4096,
        "model_type": "baichuan",
        "model_name": "baichuan2-13b"
    },
    "internlm-20b": {
        "num_layers": 60,
        "n_head": 40,
        "num_key_value_heads": 40,
        "hidden_dim": 5120,
        "ffn_embed_dim": 20480,
        "vocab_size": 103168,
        "max_seq_len": 16384,
        "model_type": "llama",
        "model_name": "internlm-20b"
    }
}
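
These fields feed the parameter-count approximation used in llm_profiler.py (params ≈ V·h + 12·l·h²); a quick check against the llama-13b entry:

import json

cfg = json.load(open("configs/model_configs.json"))["llama-13b"]
h, l, V = cfg["hidden_dim"], cfg["num_layers"], cfg["vocab_size"]
print((V * h + 12 * l * h**2) / 1e9)  # ~12.75, i.e. about 12.7B parameters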
constants.py
ADDED
@@ -0,0 +1,28 @@
#########################################
#######       llm profiler       #######
#########################################

FLOPS_EFFICIENCY = 1.0  # FLOPS efficiency achieved by Megatron-LM is ~0.5 for LLM training
HBM_MEMORY_EFFICIENCY = 1.0  # GPU HBM memory efficiency
INTRA_NODE_MEMORY_EFFICIENCY = 1.0  # intra-node (NVLink) memory efficiency
INTER_NODE_MEMORY_EFFICIENCY = 1.0  # inter-node memory efficiency

NUM_GPUS_PER_NODE = 8  # number of GPUs per node

TOLERANCE = 0.01  # tolerance for floating point comparisons

BITS_PER_BYTE = 8  # number of bits in a byte

BITS_FP32 = 32  # number of bits in FP32 data type
BITS_FP16 = 16  # number of bits in FP16 data type
BITS_INT8 = 8  # number of bits in INT8 data type
BITS_INT4 = 4  # number of bits in INT4 data type

BYTES_FP32 = BITS_FP32 // BITS_PER_BYTE  # number of bytes in FP32 data type
BYTES_FP16 = BITS_FP16 // BITS_PER_BYTE  # number of bytes in FP16 data type
BYTES_INT8 = BITS_INT8 // BITS_PER_BYTE  # number of bytes in INT8 data type
BYTES_INT4 = BITS_INT4 // BITS_PER_BYTE  # number of bytes in INT4 data type

PRINT_LINE_WIDTH = 100

GPUS = [1, 2, 4, 8]
interface.py
ADDED
@@ -0,0 +1,175 @@
import gradio as gr
import io
import logging

from llm_profiler import *
import sys
from contextlib import redirect_stdout

# List of supported models
model_names = [
    "opt-1.3b",
    "opt-6.7b",
    "opt-13b",
    "opt-66b",
    "opt-175b",
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "bloom-560m",
    "bloom-7b",
    "bloom-175b",
    "llama-7b",
    "llama-13b",
    "llama-30b",
    "llama-65b",
    "llama2-13b",
    "llama2-70b",
    "internlm-20b",
    "baichuan2-13b",
]
# List of supported GPUs
gpu_names = [
    "t4-pcie-15gb",
    "v100-pcie-32gb",
    "v100-sxm-32gb",
    "br104p",
    "a100-pcie-40gb",
    "a100-sxm-40gb",
    "a100-pcie-80gb",
    "a100-sxm-80gb",
    "910b-64gb",
    "h100-sxm-80gb",
    "h100-pcie-80gb",
    "a30-pcie-24gb",
    "a30-sxm-24gb",
    "a40-pcie-48gb",
]


# A log handler that writes log messages into a StringIO object
class StringHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.stream = io.StringIO()
        self.setFormatter(logging.Formatter("%(message)s"))

    def emit(self, record):
        self.stream.write(self.format(record) + "\n")

    def get_value(self):
        return self.stream.getvalue()


# Create a logger and attach the StringHandler
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
string_handler = StringHandler()
logger.addHandler(string_handler)


def gradio_interface(
    model_name="llama2-70b",
    gpu_name: str = "t4-pcie-15gb",
    bytes_per_param: int = BYTES_FP16,
    batch_size_per_gpu: int = 2,
    seq_len: int = 300,
    generate_len: int = 40,
    ds_zero: int = 0,
    dp_size: int = 1,
    tp_size: int = 4,
    pp_size: int = 1,
    sp_size: int = 1,
    use_kv_cache: bool = True,
    layernorm_dtype_bytes: int = BYTES_FP16,
    kv_cache_dtype_bytes: int = BYTES_FP16,
    flops_efficiency: float = FLOPS_EFFICIENCY,
    hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
    intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
    inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
    mode: str = "inference",
    print_flag: bool = True,
) -> tuple:
    # Reset the StringIO buffer
    string_handler.stream.seek(0)
    string_handler.stream.truncate()

    # Redirect sys.stdout into the StringHandler's buffer
    original_stdout = sys.stdout
    sys.stdout = string_handler.stream

    # Call the inference profiling function
    results = llm_profile_infer(
        model_name,
        gpu_name,
        bytes_per_param,
        batch_size_per_gpu,
        seq_len,
        generate_len,
        ds_zero,
        dp_size,
        tp_size,
        pp_size,
        sp_size,
        use_kv_cache,
        layernorm_dtype_bytes,
        kv_cache_dtype_bytes,
        flops_efficiency,
        hbm_memory_efficiency,
        intra_node_memory_efficiency,
        inter_node_memory_efficiency,
        mode,
        print_flag,
    )

    # Restore sys.stdout
    sys.stdout = original_stdout

    # Fetch the captured log messages
    log_output = string_handler.get_value()

    # Return the profiling results and the log output
    return results, log_output


# Build the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
        gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
        gr.Number(label="Bytes per Param", value=BYTES_FP16),
        gr.Number(label="Batch Size per GPU", value=2),
        gr.Number(label="Sequence Length", value=300),
        gr.Number(label="Generate Length", value=40),
        gr.Number(label="DS Zero", value=0),
        gr.Number(label="DP Size", value=1),
        gr.Number(label="TP Size", value=4),
        gr.Number(label="PP Size", value=1),
        gr.Number(label="SP Size", value=1),
        gr.Checkbox(label="Use KV Cache", value=True),
        gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
        gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
        gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
        gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
        gr.Number(
            label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
        ),
        gr.Number(
            label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
        ),
        gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
        gr.Checkbox(label="Print Flag", value=True),
    ],
    outputs=[
        gr.Textbox(label="Inference Results"),  # profiling results, labeled
        gr.Textbox(label="Detailed Analysis"),  # captured log output, labeled
    ],
    title="LLM Profiler",
    description="Input parameters to profile your LLM.",
)

# Launch the Gradio app
iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)
# iface.launch()
llm_profiler.py
ADDED
@@ -0,0 +1,1274 @@
# -*- coding : utf-8 -*-
# author : honggao.zhang
# Create : 2023-7-19
# Version : 0.1.0
# Description : transformer model (LLM) profiling tool; can be used to profile a model's flops, memory, and latency.
# Reference : https://github.com/cli99/llm-analysis

import logging
from pprint import pformat
import pprint
import pandas as pd
import os

from config import *
from utils import *
from math import floor

logger = logging.getLogger()

class CountCausalLMParams(object):
    def __init__(self, model_config: ModelConfig) -> None:
        self.h = model_config.hidden_dim
        self.l = model_config.num_layers
        self.V = model_config.vocab_size

        self.model_config = model_config

    def count_params_embedding(self, shared_embedding: bool = True) -> int:
        """Get the number of parameters in the embedding layer. params_te = vocab_size * d_model
        Args:
            shared_embedding (bool, optional): whether the output embedding \
                shares weights with the input embedding. Defaults to True.

        Returns:
            int: the number of parameters in the embedding layer
        """
        num_params_input_embedding = self.V * self.h
        num_params_output_embedding = self.V * self.h if not shared_embedding else 0

        return num_params_input_embedding + num_params_output_embedding

    def count_params_per_layer_attn(self) -> int:
        """Get the number of parameters per layer in the attention module,
        which includes 4 linear layers: the query/key/value projections and the output matrix.
        params_attn(mha) = params_q + params_k + params_v + params_o = 4 * d_model**2

        Returns:
            int: the number of parameters per layer in the attention module (MHA)
        """
        return 4 * self.h ** 2

    def count_params_per_layer_mlp(self) -> int:
        """Get the number of parameters in the MLP linear layers, including the
        intermediate and output matrices.
        params_mlp = params_fc1 + params_fc2 = d_model * 4*d_model + 4*d_model * d_model = 8 * d_model**2

        Returns:
            int: the number of parameters in the two MLP linear layers
        """

        return 8 * self.h ** 2

    def count_params_per_layer_ln(self) -> int:
        """Get the number of parameters per layer in the two layer-normalization modules.
        params_ln = 4 * d_model

        Returns:
            int: the number of parameters per layer in the two layer-normalization modules
        """
        return 4 * self.h

    def count_params_per_layer(self, ln_ignore=True) -> tuple:
        """Get the number of params per layer in the transformer decoder blocks,
        mainly including the attention and MLP layers.

        params_per_layer = params_attn + params_mlp + params_ln
                         = 4d_model^2 + 8d_model^2 + 2*4d_model = 12d_model^2 + 8d_model

        Return:
            int: the number of params per layer in the transformer decoder blocks
        """
        params_per_layer_attn = self.count_params_per_layer_attn()
        params_per_layer_mlp = self.count_params_per_layer_mlp()
        params_per_layer_ln = 0 if ln_ignore else 2 * self.count_params_per_layer_ln()

        params_per_layer = (
            params_per_layer_attn
            + params_per_layer_mlp
            + params_per_layer_ln
        )

        dict_params_per_layer = {
            "params_per_layer": params_per_layer,
            "params_attn": params_per_layer_attn,
            "params_mlp": params_per_layer_mlp,
            "params_layernorm": params_per_layer_ln,
        }

        return params_per_layer, dict_params_per_layer

    def count_params_model(self) -> int:
        """Get the total number of parameters in the model, including all layers and the token embedding layer.
        params_model = params_embedding + params_per_layer * num_layers
                     = V * d_model + 12 * d_model**2 * num_layers
        Returns:
            int: the total number of parameters in the model
        """
        params_per_layer, dict_params_per_layer = self.count_params_per_layer()

        return (params_per_layer * self.l
                + self.count_params_embedding()
                )

    def __call__(self, hidden_dim, num_layers, vocab_size) -> int:

        return (vocab_size * hidden_dim
                + 12 * hidden_dim ** 2 * num_layers
                )


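# Worked example (added note): for llama-13b (h=5120, l=40, V=32000) the
# approximation above gives
#   CountCausalLMParams(model_config)(5120, 40, 32000)
#     = 32000*5120 + 12*40*5120**2 ≈ 12.75e9,
# i.e. about 12.7B parameters, consistent with the advertised model size.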
class CountCausalLMFlops(object):
    """The count is model-specific and does not depend on the parallelism strategy.
    Layer normalization and other element-wise operations are ignored."""
    def __init__(self, model_config: ModelConfig, batch_size: int, seq_len: int, simp_count=False) -> None:
        self.h = model_config.hidden_dim
        self.l = model_config.num_layers
        self.V = model_config.vocab_size

        self.b = batch_size
        self.s = seq_len

        if not simp_count:
            llm_params = CountCausalLMParams(model_config)
            self.model_flops = llm_params(self.h, self.l, self.V) * 2

    def count_flops_fwd_per_layer_attn(self, batch_size: int, seq_len: int) -> int:
        """Get the number of floating point operations (flops) for the forward
        pass of the attention module in a transformer layer, given the batch
        size and sequence length.

        Mainly includes four linear projections (the query/key/value and output
        matrix multiplications) plus the self-attention internal matmuls; element-wise operations are ignored.

        flops_attn = flops_q + flops_k + flops_v + flops_output + flops_self_attention
                   = 4(2bsh^2) + 2(2bs^2h) = 8bsh^2 + 4bs^2h
        Args:
            batch_size (int): batch size
            seq_len (int): sequence length

        Returns:
            int: flops for the forward pass of the attention module in a transformer layer
        """
        return (
            8 * batch_size * seq_len * self.h ** 2
            + 4 * batch_size * seq_len ** 2 * self.h
        )

    def count_flops_fwd_per_layer_mlp(self, batch_size: int, seq_len: int) -> int:
        """Count the flops of the two matrix multiplications (the two linear layers in the MLP module).

        flops_mlp = flops_fc1 + flops_fc2 = 2bs(4h^2) + 2bs(4h^2) = 16bsh^2
        """
        return 16 * batch_size * seq_len * self.h ** 2

    def count_flops_fwd_per_layer(self, batch_size: int, seq_len: int, ln_ignore=True) -> tuple:
        flops_fwd_per_layer_attn = self.count_flops_fwd_per_layer_attn(batch_size, seq_len)
        flops_fwd_per_layer_mlp = self.count_flops_fwd_per_layer_mlp(batch_size, seq_len)
        flops_fwd_per_layer_ln = 0

        flops_fwd_per_layer = (
            flops_fwd_per_layer_attn
            + flops_fwd_per_layer_mlp
            + flops_fwd_per_layer_ln
        )

        dict_flops_fwd_per_layer = {
            "flops_fwd_per_layer": flops_fwd_per_layer,
            "flops_attn": flops_fwd_per_layer_attn,
            "flops_mlp": flops_fwd_per_layer_mlp,
            "flops_layernorm": flops_fwd_per_layer_ln,
        }

        return flops_fwd_per_layer, dict_flops_fwd_per_layer

    def count_flops_logits_layer(self) -> int:
        """flops of the output token logits layer"""
        return 2 * self.b * self.s * self.h * self.V

    def count_flops_fwd_model(self, batch_size: int, seq_len: int) -> int:
        """Count flops of the forward pass of the transformer model, given the batch size and sequence length."""
        num_flops_fwd_model = (
            self.count_flops_fwd_per_layer(batch_size, seq_len)[0] * self.l
            + self.count_flops_logits_layer()
        )

        # validate
        assert within_range(
            num_flops_fwd_model,
            (
                24 * self.b * self.s * self.l * self.h**2
                * (1 + self.s / (6 * self.h) + self.V / (12 * self.l * self.h))
            ),
            TOLERANCE,
        )

        return num_flops_fwd_model

    def count_flops_bwd_model(self, batch_size: int, seq_len: int) -> int:
        """Get the number of floating point operations (flops) for the backward
        pass of the entire transformer model, given the batch size and sequence length."""
        return 2 * self.count_flops_fwd_model(batch_size, seq_len)


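# Worked example (added note): per-layer forward flops for b=1, s=1024,
# h=5120 are 8bsh^2 + 4bs^2h (attention) + 16bsh^2 (MLP)
#   ≈ 2.15e11 + 2.15e10 + 4.29e11 ≈ 6.6e11 flops,
# so the quadratic-in-s attention term is still minor next to the GEMMs here.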
214 |
+
class CountCausalLMMemory(object):
|
215 |
+
"""Count memory of the model and layers."""
|
216 |
+
def __init__(self, llm_configs: LLMConfigs) -> None:
|
217 |
+
self.model_config = llm_configs.model_config
|
218 |
+
self.h = self.model_config.hidden_dim
|
219 |
+
self.l = self.model_config.num_layers
|
220 |
+
self.V = self.model_config.vocab_size
|
221 |
+
|
222 |
+
self.b = llm_configs.inference_config.batch_size_per_gpu
|
223 |
+
self.s = llm_configs.inference_config.seq_len
|
224 |
+
self.o = llm_configs.inference_config.generate_len
|
225 |
+
|
226 |
+
self.bytes_per_param = llm_configs.inference_config.bytes_per_param
|
227 |
+
|
228 |
+
self.tp_size = llm_configs.parallelism_config.tp_size
|
229 |
+
self.pp_size = llm_configs.parallelism_config.pp_size
|
230 |
+
self.num_layers_per_gpu = int(self.l / self.pp_size)
|
231 |
+
|
232 |
+
self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9 # 单位 GB
|
233 |
+
|
234 |
+
self.llm_params = CountCausalLMParams(self.model_config)
|
235 |
+
|
236 |
+
def count_memory_weights(self, embedding_dtype_bytes: int = BYTES_FP16):
|
237 |
+
"""Get the memory of the model weights"""
|
238 |
+
params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer()
|
239 |
+
params_embedding = self.llm_params.count_params_embedding()
|
240 |
+
|
241 |
+
memory_weight_per_layer = (
|
242 |
+
(params_per_layer / self.tp_size) * self.bytes_per_param
|
243 |
+
)
|
244 |
+
memory_weight_per_gpu = memory_weight_per_layer * self.num_layers_per_gpu
|
245 |
+
|
246 |
+
memory_embedding = (params_embedding / self.tp_size) * embedding_dtype_bytes
|
247 |
+
memory_weight_per_gpu = memory_weight_per_gpu + memory_embedding
|
248 |
+
|
249 |
+
return memory_weight_per_gpu
|
250 |
+
|
251 |
+
def count_memory_activation_per_layer_attn(
|
252 |
+
self,
|
253 |
+
batch_size: int,
|
254 |
+
seq_len: int,
|
255 |
+
is_inference: bool = True,
|
256 |
+
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL
|
257 |
+
) -> float:
|
258 |
+
"""Count the memory (in bytes) required to store the activations of the
|
259 |
+
attention in a transformer layer, given the batch size, sequence length,
|
260 |
+
whether it is inference or training, the activation recomputation strategy,
|
261 |
+
and the activation data type.
|
262 |
+
"""
|
263 |
+
if activation_recomputation == ActivationRecomputation.FULL:
|
264 |
+
return (batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param
|
265 |
+
|
266 |
+
def count_memory_activation_per_layer_mlp(
|
267 |
+
self,
|
268 |
+
is_inference: bool = True,
|
269 |
+
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
|
270 |
+
) -> float:
|
271 |
+
""" The `mlp` activations include the input to the two linear layers."""
|
272 |
+
if activation_recomputation == ActivationRecomputation.FULL:
|
273 |
+
return 0
|
274 |
+
|
275 |
+
return 0
|
276 |
+
def count_memory_activation_per_layer_layernorm(
|
277 |
+
self,
|
278 |
+
is_inference: bool = True,
|
279 |
+
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
|
280 |
+
layernorm_dtype_bytes: int = BYTES_FP16
|
281 |
+
) -> float:
|
282 |
+
if activation_recomputation == ActivationRecomputation.FULL:
|
283 |
+
return 0
|
284 |
+
return 0
|
285 |
+
|
286 |
+
def count_memory_activation_per_layer(
|
287 |
+
self,
|
288 |
+
batch_size: int,
|
289 |
+
seq_len: int,
|
290 |
+
is_inference: bool = True,
|
291 |
+
activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
|
292 |
+
layernorm_dtype_bytes: int = BYTES_FP16
|
293 |
+
) -> float:
|
294 |
+
|
295 |
+
if activation_recomputation == ActivationRecomputation.FULL:
|
296 |
+
return (
|
297 |
+
(batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param
|
298 |
+
)
|
299 |
+
return 0
|
300 |
+
|
301 |
+
def count_memory_kv_cache_per_layer(
|
302 |
+
self,
|
303 |
+
batch_size: int,
|
304 |
+
seq_len: int,
|
305 |
+
generate_len: int,
|
306 |
+
kv_cache_dtype_bytes: int = BYTES_FP16,
|
307 |
+
) -> float:
|
308 |
+
"""Get the memory (in bytes) required to store the key and value cache
|
309 |
+
for a transformer layer in inference, given the batch size, sequence
|
310 |
+
length, activation data type, and tensor parallelism size.
|
311 |
+
|
312 |
+
memory_kv_cache = 4blh(s+o) unit is byte
|
313 |
+
Args:
|
314 |
+
batch_size (int): batch size
|
315 |
+
context_len (int): seq_len + generate_len
|
316 |
+
|
317 |
+
Returns:
|
318 |
+
float: the memory (in bytes) required to store the key and value cache for a transformer layer in inference
|
319 |
+
"""
|
320 |
+
|
321 |
+
return (
|
322 |
+
(2 * batch_size * (seq_len + generate_len) * self.h) / self.tp_size
|
323 |
+
) * kv_cache_dtype_bytes
|
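
    # A minimal numeric sketch of the kv-cache formula above (the llama2-70b-like
    # numbers below are illustrative assumptions, not read from the configs):
    # per layer, 2 (k and v) * b * context_len * h * 2 bytes (fp16), split by tp.
    @staticmethod
    def _demo_kv_cache_per_layer_bytes():
        b, context_len, h, tp, dtype_bytes = 32, 2048, 8192, 8, 2
        per_layer = 2 * b * context_len * h / tp * dtype_bytes
        print(f"kv cache per layer per gpu: {per_layer / 1024**2:.1f} MiB")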

    def count_memory_per_gpu(
        self,
        batch_size: int,
        seq_len: int,
        generate_len: int,
        is_inference: bool = True,
        use_kv_cache: bool = True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        layernorm_dtype_bytes: int = BYTES_FP16,
        kv_cache_dtype_bytes: int = BYTES_FP16,
    ) -> tuple:

        # 1, prefill stage: count memory and max_batch_size
        weight_memory_per_gpu = self.count_memory_weights()  # model weights memory
        memory_left = self.gpu_memory_in_GB - weight_memory_per_gpu

        prefill_activation_memory_batch_size_1 = (  # activation memory of the prefill stage at batch size 1
            self.count_memory_activation_per_layer(
                1, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
            )
            * self.num_layers_per_gpu
        )

        prefill_max_batch_size_per_gpu = int(
            memory_left / prefill_activation_memory_batch_size_1
        )

        prefill_activation_memory_per_gpu = (
            self.count_memory_activation_per_layer(
                batch_size, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
            )
            * self.num_layers_per_gpu
        )

        assert memory_left > prefill_activation_memory_per_gpu, (
            f"weight_memory_per_gpu {num_to_string(weight_memory_per_gpu)}, activation memory {num_to_string(prefill_activation_memory_per_gpu)} is too large to fit in GPU memory! memory_left is {num_to_string(memory_left)}!"
        )

        # 2, decode stage: count memory and max_batch_size
        if use_kv_cache:
            kv_cache_memory_batch_size_1 = (
                self.count_memory_kv_cache_per_layer(
                    1,
                    seq_len + generate_len,
                    kv_cache_dtype_bytes
                )
                * self.num_layers_per_gpu
            )

            kv_cache_memory_per_gpu = (
                self.count_memory_kv_cache_per_layer(
                    batch_size,
                    seq_len + generate_len,
                    kv_cache_dtype_bytes
                )
                * self.num_layers_per_gpu
            )

            decode_activation_memory_batch_size_1 = (
                # seq_len 1 is used for decoding
                self.count_memory_activation_per_layer(
                    1, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
                )
                * self.num_layers_per_gpu
            )

            decode_activation_memory_per_gpu = (
                # seq_len 1 is used for decoding
                self.count_memory_activation_per_layer(
                    batch_size, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
                )
                * self.num_layers_per_gpu
            )

            decode_max_batch_size_per_gpu = int(
                memory_left / (decode_activation_memory_batch_size_1 + kv_cache_memory_batch_size_1)
            )
            max_batch_total_tokens = decode_max_batch_size_per_gpu * (seq_len + generate_len)

            # llama2-70b uses GQA, so its kv cache only has 8 heads; the
            # max_batch_total_tokens parameter can therefore be as large as 16384 * 8.
            if self.model_config.model_name == "llama2-70b":
                max_batch_total_tokens *= 8

            assert batch_size <= decode_max_batch_size_per_gpu, (
                f"batch_size_per_gpu {batch_size} is too large to fit"
                " in GPU memory, decode_max_batch_size_per_gpu:"
                f" {decode_max_batch_size_per_gpu}"
            )

            assert memory_left > (
                kv_cache_memory_per_gpu + decode_activation_memory_per_gpu
            ), ("kv_cache and activation memory with batch_size_per_gpu ="
                f" {batch_size} is too large to fit in GPU memory"
            )
        else:
            # Without kv cache, the context is not just the newly generated token
            # but the full seq_len + generate_len.
            decode_activation_memory_batch_size_1 = (
                self.count_memory_activation_per_layer(
                    1, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes
                )
                * self.num_layers_per_gpu
            )
            decode_max_batch_size_per_gpu = int(
                memory_left / decode_activation_memory_batch_size_1
            )
            max_batch_total_tokens = decode_max_batch_size_per_gpu * (seq_len + generate_len)
            assert batch_size <= decode_max_batch_size_per_gpu, (
                f"batch_size {batch_size} is too large to fit"
                " in GPU memory, decode_max_batch_size_per_gpu:"
                f" {decode_max_batch_size_per_gpu}"
            )

            decode_activation_memory_per_gpu = (
                self.count_memory_activation_per_layer(
                    batch_size, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes
                )
                * self.num_layers_per_gpu
            )
            kv_cache_memory_per_gpu = 0

        decode_memory_total = (weight_memory_per_gpu + decode_activation_memory_per_gpu + kv_cache_memory_per_gpu)

        # memory summary
        memory_prefill_summary_dict = {
            "weight_memory_per_gpu": weight_memory_per_gpu,
            "prefill_activation_memory_batch_size_1": prefill_activation_memory_batch_size_1,
            "prefill_max_batch_size_per_gpu": prefill_max_batch_size_per_gpu,
            "prefill_activation_memory_per_gpu": prefill_activation_memory_per_gpu,
        }

        memory_decode_summary_dict = {
            "weight_memory_per_gpu": weight_memory_per_gpu,
            "decode_activation_memory_per_gpu": decode_activation_memory_per_gpu,
            "kv_cache_memory_per_gpu": kv_cache_memory_per_gpu,
            "decode_memory_total": decode_memory_total,
            "decode_max_batch_size_per_gpu": decode_max_batch_size_per_gpu,
            "max_batch_total_tokens": max_batch_total_tokens * 0.97,
        }

        return memory_prefill_summary_dict, memory_decode_summary_dict
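

# A minimal back-of-the-envelope sketch of the weight-memory budget computed by
# CountCausalLMMemory.count_memory_weights (the 70B parameter count, fp16 bytes
# and tp=8 below are illustrative assumptions, not read from the configs):
def _demo_weight_memory_budget():
    params, bytes_per_param, tp_size = 70e9, 2, 8
    weight_memory_per_gpu = params * bytes_per_param / tp_size
    gpu_memory = 80e9  # e.g. an 80 GB GPU
    memory_left = gpu_memory - weight_memory_per_gpu
    print(f"weights per gpu: {weight_memory_per_gpu / 1e9:.1f} GB, "
          f"left for activations/kv cache: {memory_left / 1e9:.1f} GB")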


class CountCausalLMLatency(object):
    """Count latency by the roofline performance model."""
    def __init__(self, llm_configs: LLMConfigs, data_type="fp16") -> None:
        self.model_config = llm_configs.model_config
        self.gpu_config = llm_configs.gpu_config
        self.inference_config = llm_configs.inference_config
        self.parallelism_config = llm_configs.parallelism_config

        self.h = self.model_config.hidden_dim
        self.l = self.model_config.num_layers
        self.V = self.model_config.vocab_size

        self.b = llm_configs.inference_config.batch_size_per_gpu
        self.s = llm_configs.inference_config.seq_len
        self.o = llm_configs.inference_config.generate_len
        self.bytes_per_param = llm_configs.inference_config.bytes_per_param

        self.tp_size = self.parallelism_config.tp_size
        self.pp_size = self.parallelism_config.pp_size
        self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)

        self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9  # bytes/s (config value is in GB/s)
        self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9  # intra-node interconnect bandwidth, bytes/s (config value is in GB/s)
        self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12  # FLOPS (config value is in TFLOPS)

        self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9  # in bytes (config value is in GB)

        self.llm_params = CountCausalLMParams(self.model_config)
        self.llm_memory = CountCausalLMMemory(llm_configs)
        self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.s)

    def common_count_latency_for_ops(
        self,
        batch_size: int,
        seq_len: int,
        is_inference=True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        ops_type: str = "attn",
        stage="decode_"
    ) -> float:
        """Count the latency for the forward layer or model, assuming the compute and memory operations are perfectly overlapped.

        Args:
            flops (float): flops of the forward layer or model
            memory (float): r/w memory (bytes) of the forward layer or model
            tp_size (float): tensor parallelism size
            gpu_TFLOPS (float): GPU TFLOPS in T(10^12) FLOPS
            gpu_hbm_bandwidth (float): GPU HBM bandwidth in GB/s (10^9)

        Returns:
            float: the latency in seconds for the forward pass
        """

        if ops_type == "attn":
            flops = self.llm_flops.count_flops_fwd_per_layer_attn(batch_size, seq_len)
            weight_memory = self.llm_params.count_params_per_layer_attn() * self.bytes_per_param
            activation_memory = self.llm_memory.count_memory_activation_per_layer_attn(
                batch_size, seq_len, is_inference, activation_recomputation
            )
        elif ops_type == "mlp":
            flops = self.llm_flops.count_flops_fwd_per_layer_mlp(batch_size, seq_len)
            weight_memory = self.llm_params.count_params_per_layer_mlp() * self.bytes_per_param
            activation_memory = self.llm_memory.count_memory_activation_per_layer_mlp(is_inference, activation_recomputation)
        elif ops_type == "layernorm":
            activation_memory = self.llm_memory.count_memory_activation_per_layer_layernorm(
                is_inference, activation_recomputation)
            weight_memory = 0  # layernorm has no matrix weight, only a vector weight, which is ignored
            flops = 0  # layernorm is not compute bound, its flops count is very small
        else:
            print("error! unsupported ops_type")
            flops = weight_memory = activation_memory = 0

        memory = weight_memory + activation_memory

        compute_latency = flops / (self.tp_size * self.gpu_TFLOPS)  # in seconds
        memory_latency = memory / (self.tp_size * self.gpu_hbm_bandwidth)

        if memory_latency > compute_latency:
            print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} > compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is memory bound!")
        else:
            print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} <= compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is compute bound!")

        return max(compute_latency, memory_latency)
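
    # A minimal sketch of the roofline decision above (the op shapes and the
    # A100-like peak numbers below are illustrative assumptions): an op is
    # memory bound when memory_bytes / bandwidth exceeds flops / peak_flops.
    @staticmethod
    def _demo_roofline_bound():
        gpu_flops, hbm_bw = 312e12, 2.0e12  # ~A100-like fp16 peak and HBM bytes/s
        flops, memory_bytes = 2 * 8192 * 8192, 2 * 8192 * 8192  # one b=1 GEMV-like op, fp16 weights
        compute_latency = flops / gpu_flops
        memory_latency = memory_bytes / hbm_bw
        bound = "memory" if memory_latency > compute_latency else "compute"
        print(f"compute {compute_latency:.2e}s vs memory {memory_latency:.2e}s -> {bound} bound")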

    def count_latency_fwd_per_layer_tp_comm(self, batch_size: int, seq_len: int) -> float:
        """Count the latency of a single allreduce communication across the
        tensor parallel group in the forward pass of a transformer layer.
        The latency is the max of the latency for the allreduce and the minimum
        message latency through the intra-node interconnect.
        """
        is_ring_allreduce = False

        if self.tp_size == 1:
            return 0

        # \phi is the communication volume; if tp_size is large enough, num_data_per_all_reduce approaches 2bsh
        if is_ring_allreduce:
            num_data_per_all_reduce = (
                2 * batch_size * seq_len * self.h *
                (self.tp_size - 1) / (self.tp_size)
            )
        else:
            bsh = batch_size * seq_len * self.h
            num_data_per_all_reduce = (
                6 * bsh * (self.tp_size - 1) / (self.tp_size) +
                3 * bsh
            )

        latency_per_all_reduce = (
            num_data_per_all_reduce * self.bytes_per_param
            / (self.gpu_intra_node_bandwidth)
        )

        # intra_node_min_message_latency: the minimum message latency of the intra-node interconnect
        return max(
            latency_per_all_reduce,
            self.gpu_config.intra_node_min_message_latency,
        )

    def count_latency_fwd_per_layer(
        self,
        batch_size: int,
        seq_len: int,
        is_inference: bool = True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        layernorm_dtype_bytes: int = BYTES_FP16,
        stage="decode_"
    ) -> tuple:
        latency_fwd_per_layer_attn = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="attn", stage=stage)
        latency_fwd_per_layer_mlp = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="mlp", stage=stage)
        latency_fwd_per_layer_layernorm = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, "layernorm", stage=stage)

        latency_fwd_per_layer_tp_comm = self.count_latency_fwd_per_layer_tp_comm(batch_size, seq_len)

        latency_per_layer = (
            latency_fwd_per_layer_attn
            + latency_fwd_per_layer_mlp
            + 2 * latency_fwd_per_layer_layernorm  # two layernorms per transformer layer
            + 2 * latency_fwd_per_layer_tp_comm  # two AllReduces per layer; one AllReduce moves a volume of about 2bsh
        )

        dict_latency_per_layer = {
            "latency_per_layer": (latency_per_layer),
            "latency_attn": (latency_fwd_per_layer_attn),
            "latency_mlp": (latency_fwd_per_layer_mlp),
            "latency_layernorm": (2 * latency_fwd_per_layer_layernorm),
            "latency_tp_comm": (2 * latency_fwd_per_layer_tp_comm),
        }

        return latency_per_layer, dict_latency_per_layer

    def count_latency_fwd_input_embedding(
        self, batch_size: int, seq_len: int
    ) -> float:
        """Get the latency for the forward pass of the input embedding layer,
        given the batch size, sequence length, and data type of the embedding
        weight.

        Args:
            batch_size (int): batch size
            seq_len (int): sequence length

        Returns:
            float: the latency in seconds for the forward pass of the input embedding layer
        """
        memory_latency = (
            self.model_config.vocab_size
            * self.model_config.hidden_dim
            * self.bytes_per_param
            / (self.gpu_hbm_bandwidth)
        )
        comm_latency = self.count_latency_fwd_per_layer_tp_comm(
            batch_size, seq_len
        )
        return memory_latency + comm_latency

    def count_latency_fwd_output_embedding_loss(
        self, batch_size: int, seq_len: int
    ) -> float:
        """Get the latency for the forward pass of the output embedding layer (computing the logits). The operation is compute bound. With tensor parallelism size > 1, an allgather communicates `batch_size * seq_len` elements, which is ignored here. Refer to https://arxiv.org/abs/1909.08053 for more details.

        Args:
            batch_size (int): batch size
            seq_len (int): sequence length

        Returns:
            float: the latency in seconds for the forward pass of the output embedding layer
        """

        compute_latency = (
            2 * batch_size * seq_len * self.h * self.V
            / self.tp_size
            / self.gpu_TFLOPS
        )

        return compute_latency
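
    # A minimal numeric sketch of the logits latency formula above (the shapes
    # and A100-like peak below are illustrative assumptions):
    @staticmethod
    def _demo_output_embedding_latency():
        b, s, h, V, tp, gpu_flops = 1, 1, 8192, 32000, 8, 312e12
        latency = 2 * b * s * h * V / tp / gpu_flops
        print(f"logits compute latency per decode step: {latency * 1e6:.2f} us")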

    def count_latency_kv_cache(
        self,
        batch_size: int,
        seq_len: int,
        generate_len: int,
        use_kv_cache: bool = True,
        kv_cache_dtype_bytes: int = BYTES_FP16
    ) -> tuple:
        """Get the latency for the forward pass of the key and value cache in a transformer layer, given the batch size, sequence length, and whether the key and value cache is used.

        Args:
            batch_size (int): batch size
            seq_len (int): sequence length
            generate_len (int): number of tokens to generate
            use_kv_cache (bool, optional): whether the key and value cache is used. Defaults to True.

        Returns:
            float: the latency in seconds for the forward pass of the key and value cache in a transformer layer
        """
        if not use_kv_cache:
            return 0
        kv_cache_memory_list_per_gpu, kv_cache_latency_list = [], []

        for context_len in range(seq_len, seq_len + generate_len + 1):
            kv_cache_memory_per_gpu = (
                self.llm_memory.count_memory_kv_cache_per_layer(
                    batch_size,
                    context_len,
                    kv_cache_dtype_bytes
                ) * self.num_layers_per_gpu
            )

            kv_cache_latency = (
                kv_cache_memory_per_gpu / self.gpu_hbm_bandwidth
            )

            kv_cache_memory_list_per_gpu.append(kv_cache_memory_per_gpu)
            kv_cache_latency_list.append(kv_cache_latency)

        kv_cache_avg_latency = average(kv_cache_latency_list)
        kv_cache_peak_latency = max(kv_cache_latency_list)

        return kv_cache_avg_latency, kv_cache_peak_latency

    def count_latency_fwd_model(
        self,
        batch_size: int,
        seq_len: int,
        is_inference: bool = True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        layernorm_dtype_bytes: int = BYTES_FP32,
        breakdown_prefix: str = "",
    ) -> tuple:
        latency_fwd_per_layer, breakdown_per_layer = self.count_latency_fwd_per_layer(
            batch_size,
            seq_len,
            is_inference,
            activation_recomputation,
            layernorm_dtype_bytes,
            stage=breakdown_prefix
        )
        num_layers_per_gpu = self.num_layers_per_gpu

        latency_fwd_all_layers = latency_fwd_per_layer * self.num_layers_per_gpu
        latency_fwd_input_embedding = self.count_latency_fwd_input_embedding(batch_size, seq_len)
        latency_fwd_output_embedding_loss = self.count_latency_fwd_output_embedding_loss(batch_size, seq_len)

        model_latency = (
            latency_fwd_all_layers
            + latency_fwd_input_embedding
            + latency_fwd_output_embedding_loss
        )

        model_latency_breakdown = {
            breakdown_prefix + "latency_fwd_per_layer": breakdown_per_layer,
            breakdown_prefix + "latency_fwd_attn": (breakdown_per_layer["latency_attn"] * num_layers_per_gpu),
            breakdown_prefix + "latency_fwd_mlp": (breakdown_per_layer["latency_mlp"] * num_layers_per_gpu),
            breakdown_prefix + "latency_fwd_layernorm": (breakdown_per_layer["latency_layernorm"] * num_layers_per_gpu),
            breakdown_prefix + "latency_fwd_tp_comm": (breakdown_per_layer["latency_tp_comm"] * num_layers_per_gpu),
            breakdown_prefix + "latency_fwd_input_embedding": (latency_fwd_input_embedding),
            breakdown_prefix + "latency_fwd_output_embedding_loss": (latency_fwd_output_embedding_loss),
        }

        return model_latency, model_latency_breakdown

    def count_latency_fwd(
        self,
        batch_size: int,
        seq_len: int,
        generate_len: int,
        use_kv_cache: bool = True,
        kv_cache_dtype_bytes: int = BYTES_FP16,
        is_inference: bool = True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        layernorm_dtype_bytes: int = BYTES_FP32,
    ) -> tuple:
        # 1, prefill stage
        prefill_latency, prefill_latency_breakdown = self.count_latency_fwd_model(
            batch_size,
            seq_len,
            is_inference=is_inference,
            layernorm_dtype_bytes=layernorm_dtype_bytes,
            breakdown_prefix="prefill_",
        )

        prefill_latency_breakdown.update(
            {
                "prefill_latency": prefill_latency,
            }
        )

        # 2, decode stage
        kv_cache_avg_latency, kv_cache_peak_latency = self.count_latency_kv_cache(
            batch_size,
            seq_len,
            generate_len,
            use_kv_cache,
            kv_cache_dtype_bytes
        )

        decode_model_latency, decode_latency_breakdown = self.count_latency_fwd_model(
            batch_size,
            1 if use_kv_cache else (seq_len + generate_len) * (2/3),  # without kv cache, k and v (2/3 of the context work) are recomputed over seq_len + generate_len
            is_inference=is_inference,
            activation_recomputation=activation_recomputation,
            layernorm_dtype_bytes=layernorm_dtype_bytes,
            breakdown_prefix="decode_",
        )

        decode_avg_latency = decode_model_latency + kv_cache_avg_latency
        decode_peak_latency = decode_model_latency + kv_cache_peak_latency

        decode_latency_breakdown.update(
            {
                "kv_cache_avg_latency": (kv_cache_avg_latency),
                "kv_cache_peak_latency": (kv_cache_peak_latency),
                "decode_avg_latency": (decode_avg_latency),
                "decode_peak_latency": (decode_peak_latency)
            }
        )

        return prefill_latency_breakdown, decode_latency_breakdown
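
    # A minimal sketch of how the two breakdowns above combine into an
    # end-to-end latency (the numbers below are illustrative assumptions):
    @staticmethod
    def _demo_total_infer_latency():
        prefill_latency, decode_per_token_latency, generate_len = 0.25, 0.02, 1024
        total = prefill_latency + decode_per_token_latency * generate_len
        print(f"total_infer_latency ~ {total:.2f} s for {generate_len} generated tokens")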


class LLMProfiler(object):
    """Measures the latency, memory, number of estimated floating-point operations and parameters of each module in a PyTorch model."""
    def __init__(self, llm_configs: LLMConfigs) -> None:
        self.model_config = llm_configs.model_config
        self.gpu_config = llm_configs.gpu_config
        self.inference_config = llm_configs.inference_config
        self.parallelism_config = llm_configs.parallelism_config
        self.gpu_efficiency_config = llm_configs.gpu_efficiency_config

        self.h = self.model_config.hidden_dim
        self.l = self.model_config.num_layers
        self.V = self.model_config.vocab_size

        self.b = llm_configs.inference_config.batch_size_per_gpu
        self.s = llm_configs.inference_config.seq_len
        self.o = llm_configs.inference_config.generate_len
        self.bytes_per_param = llm_configs.inference_config.bytes_per_param

        self.tp_size = self.parallelism_config.tp_size
        self.pp_size = self.parallelism_config.pp_size
        self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)

        self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9  # bytes/s (config value is in GB/s)
        self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9  # intra-node interconnect bandwidth, bytes/s (config value is in GB/s)
        self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12  # FLOPS (config value is in TFLOPS)

        self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9  # in bytes (config value is in GB)

        self.llm_params = CountCausalLMParams(self.model_config)
        self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.s)
        self.llm_memory = CountCausalLMMemory(llm_configs)
        self.llm_latency = CountCausalLMLatency(llm_configs)
        self.inference_results = []

    def infer_profile(
        self,
        batch_size_per_gpu: int = 1,
        seq_len: int = 522,
        generate_len: int = 1526,
        use_kv_cache: bool = True,
        activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
        layernorm_dtype_bytes: int = 2,
        kv_cache_dtype_bytes: int = 2,
        flops_efficiency: float = None,
        hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
        intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
        inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
        print_flag=True
    ) -> dict:
        """LLM inference analysis given the llm configs and inputs.

        Args:
            generate_len (int, optional): number of tokens to generate for generative models. Defaults to 1526.
            use_kv_cache (bool, optional): whether to use kv_cache. Defaults to True.
            layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations.
                Often has to be at least FP16 in inference to maintain model accuracy.

        Returns:
            dict: a summary dict of the inference analysis
        """
        if self.model_config.max_seq_len is not None:
            assert (
                seq_len + generate_len <= self.model_config.max_seq_len
            ), f"seq_len + generate_len {seq_len + generate_len} exceeds the max_seq_len {self.model_config.max_seq_len}"

        if self.l % self.pp_size != 0:
            logger.warning(
                "Warning: the number of layers is not divisible by pp_size, taking the floor!"
            )

        pp_instance_factor = self.pp_size

        infer_config_dict = {
            "inference_config": {
                "model_name": self.model_config.model_name,
                "batch_size_per_gpu": batch_size_per_gpu,
                "seq_len": seq_len,
                "tp_size": self.tp_size,
                "pp_size": self.pp_size,
                "generate_len": generate_len,
                "use_kv_cache": use_kv_cache,
            },
            "gpu_config": {
                "name": self.gpu_config.name,
                "memory_GPU_in_GB": f"{self.gpu_config.memory_GPU_in_GB} GB",
                "gpu_hbm_bandwidth": f"{get_gpu_hbm_bandwidth(self.gpu_config)} GB/s",
                "gpu_intra_node_bandwidth": f"{get_intra_node_bandwidth(self.gpu_config)} GB/s",
                "gpu_TFLOPS": f"{get_TFLOPS_per_gpu(self.gpu_config)} TFLOPS",
            }
        }

        params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer()
        num_params_model = self.llm_params.count_params_model()

        flops_fwd_per_layer, dict_flops_fwd_per_layer = self.llm_flops.count_flops_fwd_per_layer(self.b, self.s)
        num_flops_fwd_model = self.llm_flops.count_flops_fwd_model(self.b, self.s)

        memory_prefill_summary_dict, memory_decode_summary_dict = self.llm_memory.count_memory_per_gpu(
            batch_size_per_gpu,
            seq_len,
            generate_len,
            is_inference=True,
            use_kv_cache=use_kv_cache,
            activation_recomputation=activation_recomputation,
            layernorm_dtype_bytes=layernorm_dtype_bytes,
            kv_cache_dtype_bytes=kv_cache_dtype_bytes
        )

        prefill_latency_breakdown, decode_latency_breakdown = self.llm_latency.count_latency_fwd(
            batch_size_per_gpu,
            seq_len,
            generate_len,
            use_kv_cache=use_kv_cache,
            activation_recomputation=activation_recomputation,
            layernorm_dtype_bytes=layernorm_dtype_bytes,
            kv_cache_dtype_bytes=kv_cache_dtype_bytes
        )

        infer_result_dict = {
            "model_params": num_params_model,
            "model_flops": num_flops_fwd_model,
            "prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"],
            "decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"],
            "kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"],
            "total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len,
        }

        gb_factor = 1024 ** 3

        inference_result_dict = {
            "model_params": num_params_model,
            "prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"],
            "decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"],
            "kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"],
            "total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len,
            "weight_memory_per_gpu": memory_decode_summary_dict["weight_memory_per_gpu"] / gb_factor,
            "decode_activation_memory_per_gpu": memory_decode_summary_dict["decode_activation_memory_per_gpu"] / gb_factor,
            "kv_cache_memory_per_gpu": memory_decode_summary_dict["kv_cache_memory_per_gpu"] / gb_factor,
            "decode_max_batch_size_per_gpu": memory_decode_summary_dict["decode_max_batch_size_per_gpu"],
            "max_batch_total_tokens": memory_decode_summary_dict["max_batch_total_tokens"],
        }
        pp_specific_dict = {
            "pp_decode_latency": inference_result_dict["decode_per_token_latency"] / pp_instance_factor,
            "pp_prefill_latency": inference_result_dict["prefill_first_token_latency"] / pp_instance_factor,
            "pp_kv_cache_latency": inference_result_dict["kv_cache_latency"] / pp_instance_factor,
            "pp_e2e_latency": inference_result_dict["total_infer_latency"] / pp_instance_factor,
            "pp_max_batch_total_tokens": inference_result_dict["max_batch_total_tokens"] / pp_instance_factor,
            "pp_max_batch_size": inference_result_dict["decode_max_batch_size_per_gpu"] / pp_instance_factor,
            "pp_kv_cache_memory_per_gpu": inference_result_dict["kv_cache_memory_per_gpu"] * pp_instance_factor,
        }
        inference_result_dict.update(pp_specific_dict)
        inference_result_dict.update(infer_config_dict["inference_config"].copy())
        inference_result_dict.update(infer_config_dict["gpu_config"].copy())

        self.inference_results.append(inference_result_dict)

        if print_flag:
            print("\n-------------------------- LLM main infer config --------------------------")
            pprint.pprint(infer_config_dict, indent=4, sort_dicts=False)

            print("\n---------------------------- LLM Params analysis ----------------------------")
            self.print_format_summary_dict(dict_params_per_layer, get_dict_depth(dict_params_per_layer))
            pprint.pprint({"params_model": num_to_string(num_params_model)}, indent=4, sort_dicts=False)

            print("\n---------------------------- LLM Flops analysis -----------------------------")
            self.print_format_summary_dict(dict_flops_fwd_per_layer, get_dict_depth(dict_flops_fwd_per_layer))
            pprint.pprint({"prefill flops_model": num_to_string(num_flops_fwd_model)}, indent=4, sort_dicts=False)

            print("\n---------------------------- LLM Memory analysis -----------------------------")
            self.print_format_summary_dict(memory_prefill_summary_dict, get_dict_depth(memory_prefill_summary_dict))
            self.print_format_summary_dict(memory_decode_summary_dict, get_dict_depth(memory_decode_summary_dict))

            print("\n-------------------------- LLM infer performance analysis --------------------------")
            self.print_format_summary_dict(infer_result_dict, get_dict_depth(infer_result_dict))

            print("\n-------------------------- LLM detailed latency analysis --------------------------")
            pprint.pprint([prefill_latency_breakdown, decode_latency_breakdown], indent=4, sort_dicts=False)

            print("prefill_latency_breakdown depth is ", get_dict_depth(prefill_latency_breakdown), prefill_latency_breakdown)
            self.print_format_summary_dict(prefill_latency_breakdown, get_dict_depth(prefill_latency_breakdown))
            self.print_format_summary_dict(decode_latency_breakdown, get_dict_depth(decode_latency_breakdown))

        return memory_decode_summary_dict["max_batch_total_tokens"]

    def get_inference_results(self):
        return self.inference_results

    def print_format_summary_dict(self, summary_dict: dict, depth: int) -> str:
        for key, value in summary_dict.items():
            if "params" in key or "flops" in key:
                if not isinstance(value, dict):
                    summary_dict.update({key: num_to_string(value)})
                else:
                    self.print_format_summary_dict(value, get_dict_depth(value) - 1)  # recurse into the nested dict
            if "latency" in key:
                if not isinstance(value, dict):
                    summary_dict.update({key: latency_to_string(value)})
                else:
                    self.print_format_summary_dict(value, get_dict_depth(value) - 1)
            if "memory" in key:
                if not isinstance(value, dict):
                    summary_dict.update({key: f"{num_to_string(value)}B"})
                else:
                    self.print_format_summary_dict(value, get_dict_depth(value) - 1)
        if depth >= 1:
            pprint.pprint(summary_dict, indent=4, sort_dicts=False)


def llm_profile(model_name="llama2-70b",
                gpu_name: str = "t4-pcie-15gb",
                bytes_per_param: int = BYTES_FP16,
                batch_size_per_gpu: int = 2,
                seq_len: int = 300,
                generate_len=40,
                ds_zero: int = 0,
                dp_size: int = 1,
                tp_size: int = 4,
                pp_size: int = 1,
                sp_size: int = 1,
                use_kv_cache: bool = True,
                layernorm_dtype_bytes: int = BYTES_FP16,
                kv_cache_dtype_bytes: int = BYTES_FP16,
                flops_efficiency: float = FLOPS_EFFICIENCY,
                hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
                intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
                inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
                mode: str = "inference",
                print_flag: bool = True,
                ) -> dict:
    """Returns a dict of the total floating-point operations, MACs, parameters and latency of an llm.

    Args:
        model_name (str, optional): model name to query the pre-defined `model_configs.json`. Defaults to "llama2-70b".
        gpu_name (str, optional): gpu name to query the pre-defined `gpu_configs.json`. Defaults to "t4-pcie-15gb".
        batch_size_per_gpu (int, optional): batch size per GPU. Defaults to 2.
        seq_len (int, optional): sequence length. Defaults to 300.
        generate_len (int, optional): the maximum number of tokens to generate, ignoring the number of tokens in the prompt. Defaults to 40.
        ds_zero (int, optional): which DeepSpeed ZeRO stage to use. Defaults to 0.
        dp_size (int, optional): data parallelism size. Defaults to 1.
        tp_size (int, optional): tensor parallelism size. Defaults to 4.
        pp_size (int, optional): pipeline parallelism size. Defaults to 1.
        sp_size (int, optional): sequence parallelism size. Defaults to 1.
        use_kv_cache (bool, optional): whether or not the model should use the past last key/values attentions (if applicable to the model) to
            speed up decoding. Defaults to True.
        layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations. Defaults to BYTES_FP16.
        kv_cache_dtype_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to BYTES_FP16.
        flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to FLOPS_EFFICIENCY.
        hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. Defaults to HBM_MEMORY_EFFICIENCY.
        intra_node_memory_efficiency (float, optional): intra-node memory efficiency, ranging from 0 to 1. Defaults to INTRA_NODE_MEMORY_EFFICIENCY.
        inter_node_memory_efficiency (float, optional): inter-node memory efficiency, ranging from 0 to 1. Defaults to INTER_NODE_MEMORY_EFFICIENCY.
        mode (str, optional): model training or inference. Defaults to "inference".

    Returns:
        dict: a summary dictionary of the inference analysis
    """
    model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)

    parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size,
                                           dp_size=dp_size, sp_size=sp_size
                                           )

    inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
                                       generate_len=generate_len, use_kv_cache=use_kv_cache,
                                       bytes_per_param=bytes_per_param,
                                       layernorm_dtype_bytes=layernorm_dtype_bytes,
                                       kv_cache_dtype_bytes=kv_cache_dtype_bytes
                                       )

    gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency,
                                                hbm_memory_efficiency=hbm_memory_efficiency,
                                                intra_node_memory_efficiency=intra_node_memory_efficiency,
                                                inter_node_memory_efficiency=inter_node_memory_efficiency
                                                )

    llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config,
                             parallelism_config=parallelism_config, inference_config=inference_config,
                             gpu_efficiency_config=gpu_efficiency_config
                             )

    profiler = LLMProfiler(llm_configs)

    max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
                                                    generate_len=generate_len, use_kv_cache=use_kv_cache,
                                                    layernorm_dtype_bytes=layernorm_dtype_bytes,
                                                    flops_efficiency=flops_efficiency,
                                                    hbm_memory_efficiency=hbm_memory_efficiency,
                                                    print_flag=print_flag)

    return max_batch_total_tokens
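

# Example usage (a minimal sketch): profile llama2-70b on 8 x a100-sxm-80gb.
# Both names exist in the pre-defined config files; the batch size and lengths
# below are illustrative assumptions.
def _demo_llm_profile():
    max_batch_total_tokens = llm_profile(model_name="llama2-70b",
                                         gpu_name="a100-sxm-80gb",
                                         batch_size_per_gpu=32,
                                         seq_len=1024, generate_len=1024,
                                         tp_size=8, pp_size=1,
                                         print_flag=True)
    print(f"max_batch_total_tokens: {int(max_batch_total_tokens)}")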


def llm_profile_infer(model_name="llama2-70b",
                      gpu_name: str = "t4-pcie-15gb",
                      bytes_per_param: int = BYTES_FP16,
                      batch_size_per_gpu: int = 2,
                      seq_len: int = 300,
                      generate_len=40,
                      ds_zero: int = 0,
                      dp_size: int = 1,
                      tp_size: int = 4,
                      pp_size: int = 1,
                      sp_size: int = 1,
                      use_kv_cache: bool = True,
                      layernorm_dtype_bytes: int = BYTES_FP16,
                      kv_cache_dtype_bytes: int = BYTES_FP16,
                      flops_efficiency: float = FLOPS_EFFICIENCY,
                      hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
                      intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
                      inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
                      mode: str = "inference",
                      print_flag: bool = True,
                      ) -> list:
    model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)

    parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size,
                                           dp_size=dp_size, sp_size=sp_size
                                           )

    inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
                                       generate_len=generate_len, use_kv_cache=use_kv_cache,
                                       bytes_per_param=bytes_per_param,
                                       layernorm_dtype_bytes=layernorm_dtype_bytes,
                                       kv_cache_dtype_bytes=kv_cache_dtype_bytes
                                       )

    gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency,
                                                hbm_memory_efficiency=hbm_memory_efficiency,
                                                intra_node_memory_efficiency=intra_node_memory_efficiency,
                                                inter_node_memory_efficiency=inter_node_memory_efficiency
                                                )

    llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config,
                             parallelism_config=parallelism_config, inference_config=inference_config,
                             gpu_efficiency_config=gpu_efficiency_config
                             )

    profiler = LLMProfiler(llm_configs)

    max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
                                                    generate_len=generate_len, use_kv_cache=use_kv_cache,
                                                    layernorm_dtype_bytes=layernorm_dtype_bytes,
                                                    flops_efficiency=flops_efficiency,
                                                    hbm_memory_efficiency=hbm_memory_efficiency,
                                                    print_flag=print_flag)
    return max_batch_total_tokens, profiler.get_inference_results()


def to_csv(inference_results: list, name: str = "infer_results"):
    df = pd.DataFrame(inference_results)
    csv_path = name + ".csv"
    pprint.pprint(f"Saving inference results to: {csv_path}")
    df.to_csv(csv_path, index=False)


def profile_pp():
    # model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"]
    model_name_list = ["llama2-70b"]
    # gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"]
    gpu_name_list = ["a100-sxm-80gb"]
    batch_size_per_gpu = 32
    tp_pp_nums = [
        [1, 1],  # tp
        [2, 1],
        [4, 1],
        [8, 1],
        # tp / pp
        [2, 4],
        [4, 2],
        # pp
        [1, 2],
        [1, 4],
        [1, 8],
    ]
    tgi_service_dict_list = []
    seq_len, generate_len = 1024, 1024
    inference_results = []

    for model_name in model_name_list:
        if model_name in ["llama2-70b", "internlm-20b"]:
            seq_len, generate_len = 1024, 1024

        for gpu_name in gpu_name_list:
            for tp_size, pp_size in tp_pp_nums:
                try:
                    max_batch_total_tokens, infer_result = llm_profile_infer(
                        model_name=model_name,
                        gpu_name=gpu_name,
                        batch_size_per_gpu=batch_size_per_gpu,
                        tp_size=tp_size,
                        pp_size=pp_size,
                        seq_len=seq_len,
                        generate_len=generate_len,
                        print_flag=False,
                    )
                    inference_results += infer_result
                except Exception as e:
                    print(
                        f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}"
                    )
                    continue

                tgi_service_dict = {
                    "model_name": model_name,
                    "gpu_name": gpu_name,
                    "pp_size": pp_size,
                    "tp_size": tp_size,
                    "max_batch_total_tokens": max_batch_total_tokens,
                    "max_batch_size": floor(
                        max_batch_total_tokens / (seq_len + generate_len)
                    ),
                }
                tgi_service_dict_list.append(tgi_service_dict)

    print(
        "================================== TGI+LightLLM service max_batch_total_tokens params list ============================="
    )
    print_list(tgi_service_dict_list)

    to_csv(inference_results, f"bs{batch_size_per_gpu}_in{seq_len}_out{generate_len}_centralize_allreduce")


def demo():
    # llm_profile(print_flag=True)

    # model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"]
    model_name_list = ["llama2-70b"]
    # gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"]
    gpu_name_list = ["a100-sxm-80gb", "910b-64gb"]
    batch_size_per_gpu = 32
    tp_pp_nums = [
        [8, 1],
        [1, 8],
        [4, 2]
    ]
    tgi_service_dict_list = []
    seq_len, generate_len = 1024, 1024

    for model_name in model_name_list:
        if model_name in ["llama2-70b", "internlm-20b"]:
            seq_len, generate_len = 1024, 1024

        for gpu_name in gpu_name_list:
            for (tp_size, pp_size) in tp_pp_nums:
                try:
                    max_batch_total_tokens = int(llm_profile(model_name=model_name, gpu_name=gpu_name, batch_size_per_gpu=batch_size_per_gpu, tp_size=tp_size, pp_size=pp_size,
                                                             seq_len=seq_len, generate_len=generate_len, print_flag=True))
                except Exception as e:
                    print(f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}")
                    continue

                tgi_service_dict = {"model_name": model_name, "gpu_name": gpu_name, "pp_size": pp_size, "tp_size": tp_size, "max_batch_total_tokens": max_batch_total_tokens, "max_batch_size": floor(max_batch_total_tokens / (seq_len + generate_len))}
                tgi_service_dict_list.append(tgi_service_dict)

    print("================================== TGI+LightLLM service max_batch_total_tokens params list =============================")
    print_list(tgi_service_dict_list)


if __name__ == "__main__":
    profile_pp()
utils.py
ADDED
@@ -0,0 +1,82 @@
from constants import *


def print_list(lst):
    """Print a one-dimensional list, one item per line.

    :param lst: List[int]
    :return: None
    """
    for x in lst:
        print(x)


def get_dict_depth(d, depth=0):
    if not isinstance(d, dict):
        return depth
    if not d:
        return depth

    return max(get_dict_depth(v, depth + 1) for v in d.values())


def latency_to_string(latency_in_s, precision=2):
    if latency_in_s is None:
        return "None"
    day = 24 * 60 * 60
    hour = 60 * 60
    minute = 60
    ms = 1 / 1000
    us = 1 / 1000000
    if latency_in_s // day > 0:
        return str(round(latency_in_s / day, precision)) + " days"
    elif latency_in_s // hour > 0:
        return str(round(latency_in_s / hour, precision)) + " hours"
    elif latency_in_s // minute > 0:
        return str(round(latency_in_s / minute, precision)) + " minutes"
    elif latency_in_s > 1:
        return str(round(latency_in_s, precision)) + " s"
    elif latency_in_s > ms:
        return str(round(latency_in_s / ms, precision)) + " ms"
    else:
        return str(round(latency_in_s / us, precision)) + " us"


def num_to_string(num, precision=2):
    if num is None:
        return "None"
    if num // 10**12 > 0:
        return str(round(num / 10.0**12, precision)) + " T"
    elif num // 10**9 > 0:
        return str(round(num / 10.0**9, precision)) + " G"
    elif num // 10**6 > 0:
        return str(round(num / 10.0**6, precision)) + " M"
    elif num // 10**3 > 0:
        return str(round(num / 10.0**3, precision)) + " K"
    else:
        return str(num)
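

# A quick usage sketch of the two formatters above (expected outputs are shown
# in the comments):
def _demo_formatters():
    print(num_to_string(70_000_000_000))  # "70.0 G"
    print(num_to_string(4096))            # "4.1 K"
    print(latency_to_string(0.0025))      # "2.5 ms"
    print(latency_to_string(3.5))         # "3.5 s"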


def get_readable_summary_dict(summary_dict: dict, title="Summary") -> str:
    log_str = f"\n{title.center(PRINT_LINE_WIDTH, '-')}\n"
    for key, value in summary_dict.items():
        if "num_tokens" in key or "num_params" in key or "flops" in key:
            log_str += f"{key}: {num_to_string(value)}\n"
        elif "gpu_hours" == key:
            log_str += f"{key}: {int(value)}\n"
        elif "memory" in key and "efficiency" not in key:
            log_str += f"{key}: {num_to_string(value)}B\n"
        elif "latency" in key:
            log_str += f"{key}: {latency_to_string(value)}\n"
        else:
            log_str += f"{key}: {value}\n"
    log_str += f"{'-' * PRINT_LINE_WIDTH}\n"
    return log_str


def within_range(val, target, tolerance):
    return abs(val - target) / target < tolerance
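

# A minimal sketch of the relative-tolerance check above: within_range compares
# |val - target| / target against the tolerance, so it is a relative error test.
def _demo_within_range():
    print(within_range(102, 100, 0.05))  # True:  2% relative error < 5%
    print(within_range(110, 100, 0.05))  # False: 10% relative error >= 5%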


def average(lst):
    if not lst:
        return None
    return sum(lst) / len(lst)


def max_value(lst):
    if not lst:
        return None
    return max(lst)