zenghaolun02 committed on
Commit
0c4803b
·
1 Parent(s): e23ddae
Files changed (10)
  1. __init__.py +13 -0
  2. app.py +175 -0
  3. config.py +213 -0
  4. configs/gpu_configs.json +163 -0
  5. configs/gpu_perf.ini +25 -0
  6. configs/model_configs.json +204 -0
  7. constants.py +28 -0
  8. interface.py +175 -0
  9. llm_profiler.py +1274 -0
  10. utils.py +82 -0
__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright 2023 Cheng Li
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
app.py ADDED
@@ -0,0 +1,175 @@
1
+ import gradio as gr
2
+ import io
3
+ import logging
4
+
5
+ from llm_profiler import *
6
+ import sys
7
+ from contextlib import redirect_stdout
8
+
9
+ # Model list
10
+ model_names = [
11
+ "opt-1.3b",
12
+ "opt-6.7b",
13
+ "opt-13b",
14
+ "opt-66b",
15
+ "opt-175b",
16
+ "gpt2",
17
+ "gpt2-medium",
18
+ "gpt2-large",
19
+ "gpt2-xl",
20
+ "bloom-560m",
21
+ "bloom-7b",
22
+ "bloom-175b",
23
+ "llama-7b",
24
+ "llama-13b",
25
+ "llama-30b",
26
+ "llama-65b",
27
+ "llama2-13b",
28
+ "llama2-70b",
29
+ "internlm-20b",
30
+ "baichuan2-13b",
31
+ ]
32
+ # GPU list
33
+ gpu_names = [
34
+ "t4-pcie-15gb",
35
+ "v100-pcie-32gb",
36
+ "v100-sxm-32gb",
37
+ "br104p",
38
+ "a100-pcie-40gb",
39
+ "a100-sxm-40gb",
40
+ "a100-pcie-80gb",
41
+ "a100-sxm-80gb",
42
+ "910b-64gb",
43
+ "h100-sxm-80gb",
44
+ "h100-pcie-80gb",
45
+ "a30-pcie-24gb",
46
+ "a30-sxm-24gb",
47
+ "a40-pcie-48gb",
48
+ ]
49
+
50
+
51
+ # Create a logging handler that writes log messages into a StringIO object
52
+ class StringHandler(logging.Handler):
53
+ def __init__(self):
54
+ super().__init__()
55
+ self.stream = io.StringIO()
56
+ self.setFormatter(logging.Formatter("%(message)s"))
57
+
58
+ def emit(self, record):
59
+ self.stream.write(self.format(record) + "\n")
60
+
61
+ def get_value(self):
62
+ return self.stream.getvalue()
63
+
64
+
65
+ # Create a logger and attach the StringHandler
66
+ logger = logging.getLogger(__name__)
67
+ logger.setLevel(logging.INFO)
68
+ string_handler = StringHandler()
69
+ logger.addHandler(string_handler)
70
+
71
+
72
+ def gradio_interface(
73
+ model_name="llama2-70b",
74
+ gpu_name: str = "t4-pcie-15gb",
75
+ bytes_per_param: int = BYTES_FP16,
76
+ batch_size_per_gpu: int = 2,
77
+ seq_len: int = 300,
78
+ generate_len: int = 40,
79
+ ds_zero: int = 0,
80
+ dp_size: int = 1,
81
+ tp_size: int = 4,
82
+ pp_size: int = 1,
83
+ sp_size: int = 1,
84
+ use_kv_cache: bool = True,
85
+ layernorm_dtype_bytes: int = BYTES_FP16,
86
+ kv_cache_dtype_bytes: int = BYTES_FP16,
87
+ flops_efficiency: float = FLOPS_EFFICIENCY,
88
+ hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
89
+ intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
90
+ inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
91
+ mode: str = "inference",
92
+ print_flag: bool = True,
93
+ ) -> list:
94
+ # Clear the StringIO buffer
95
+ string_handler.stream.seek(0)
96
+ string_handler.stream.truncate()
97
+
98
+ # Redirect sys.stdout to the StringHandler's stream
99
+ original_stdout = sys.stdout
100
+ sys.stdout = string_handler.stream
101
+
102
+ # Call the inference profiling function
103
+ results = llm_profile_infer(
104
+ model_name,
105
+ gpu_name,
106
+ bytes_per_param,
107
+ batch_size_per_gpu,
108
+ seq_len,
109
+ generate_len,
110
+ ds_zero,
111
+ dp_size,
112
+ tp_size,
113
+ pp_size,
114
+ sp_size,
115
+ use_kv_cache,
116
+ layernorm_dtype_bytes,
117
+ kv_cache_dtype_bytes,
118
+ flops_efficiency,
119
+ hbm_memory_efficiency,
120
+ intra_node_memory_efficiency,
121
+ inter_node_memory_efficiency,
122
+ mode,
123
+ print_flag,
124
+ )
125
+
126
+ # Restore sys.stdout
127
+ sys.stdout = original_stdout
128
+
129
+ # Get the captured log messages
130
+ log_output = string_handler.get_value()
131
+
132
+ # Return the inference results and the log output
133
+ return results, log_output
134
+
135
+
136
+ # Create the Gradio interface
137
+ iface = gr.Interface(
138
+ fn=gradio_interface,
139
+ inputs=[
140
+ gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
141
+ gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
142
+ gr.Number(label="Bytes per Param", value=BYTES_FP16),
143
+ gr.Number(label="Batch Size per GPU", value=2),
144
+ gr.Number(label="Sequence Length", value=300),
145
+ gr.Number(label="Generate Length", value=40),
146
+ gr.Number(label="DS Zero", value=0),
147
+ gr.Number(label="DP Size", value=1),
148
+ gr.Number(label="TP Size", value=4),
149
+ gr.Number(label="PP Size", value=1),
150
+ gr.Number(label="SP Size", value=1),
151
+ gr.Checkbox(label="Use KV Cache", value=True),
152
+ gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
153
+ gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
154
+ gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
155
+ gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
156
+ gr.Number(
157
+ label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
158
+ ),
159
+ gr.Number(
160
+ label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
161
+ ),
162
+ gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
163
+ gr.Checkbox(label="Print Flag", value=True),
164
+ ],
165
+ outputs=[
166
+ gr.Textbox(label="Inference Results"), # labeled inference results output
167
+ gr.Textbox(label="Detailed Analysis"), # labeled log output
168
+ ],
169
+ title="LLM Profiler",
170
+ description="Input parameters to profile your LLM.",
171
+ )
172
+
173
+ # Launch the Gradio interface
174
+ iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)
175
+ # iface.launch()
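A minimal sketch of the same stdout capture using contextlib.redirect_stdout, which app.py imports but does not actually use; the context manager restores sys.stdout automatically even if the profiled call raises. The capture_stdout helper name is illustrative and not part of this commit:

import io
from contextlib import redirect_stdout

def capture_stdout(fn, *args, **kwargs):
    buffer = io.StringIO()
    with redirect_stdout(buffer):       # everything printed inside is written to buffer
        result = fn(*args, **kwargs)
    return result, buffer.getvalue()    # (return value, captured text)

# e.g. results, log_output = capture_stdout(llm_profile_infer, model_name, gpu_name, ...)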
config.py ADDED
@@ -0,0 +1,213 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Description : GPU, model, parallelism, data, training and inference config definitions
3
+
4
+ import math, json
5
+ from constants import *
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from functools import total_ordering
9
+
10
+
11
+ class ActivationRecomputation(Enum):
12
+ NONE = 0
13
+ """No activation recomputation; requires the most memory."""
14
+
15
+ SELECTIVE = 1
16
+ """Selectively checkpoints and recomputes only parts of each transformer
17
+ layer that take up a considerable amount of memory but are not
18
+ computationally expensive to recompute, i.e. Q K V matrix multiplies,
19
+ QK^T matrix multiply, softmax, softmax dropout, and attention over V."""
20
+
21
+ FULL = 2
22
+ """Full activation recomputation stores the input to EVERY transformer
23
+ layer, which is sharded across the tensor parallel group, thus requiring an
24
+ extra all-gather (ignored for now) per layer and add communication
25
+ overhead; requires the least memory; requires an extra forward
26
+ pass."""
27
+
28
+ @total_ordering
29
+ class DSZeRO(Enum):
30
+ NONE = 0
31
+ """No DeepSpeed ZeRO; requires the most memory."""
32
+
33
+ STAGE_1 = 1
34
+ """ZeRO stage 1 shards the optimizer states across the data parallel
35
+ group."""
36
+
37
+ STAGE_2 = 2
38
+ """ZeRO stage 2 shards the optimizer states and gradients across the data
39
+ parallel group."""
40
+
41
+ STAGE_3 = 3
42
+ """ZeRO stage 3 shards the optimizer states, gradients, and model weights
43
+ across the data parallel group."""
44
+
45
+ def __lt__(self, other):
46
+ # a slightly show-off way to implement ordering
47
+ if other.__class__ is self.__class__:
48
+ return self.value < other.value # compare the auto-assigned enum values
49
+ return NotImplemented
50
+
51
+ def __eq__(self, other):
52
+ if isinstance(other, DSZeRO):
53
+ return self.value == other.value
54
+ return NotImplemented
55
+
56
+ @dataclass
57
+ class GPUEfficiencyConfig:
58
+ flops_efficiency: float = 1.0
59
+ hbm_memory_efficiency: float = 1.0
60
+ intra_node_memory_efficiency: float = 1.0
61
+ inter_node_memory_efficiency: float = 1.0
62
+
63
+ @dataclass
64
+ class InferenceConfig:
65
+ """Inference configuration dataclass."""
66
+ batch_size_per_gpu: int = None # batch size
67
+ seq_len: int = 522 # input sequence length
68
+ generate_len: int = 1526 # number of tokens to generate
69
+ context_len: int = None # context length
70
+ use_kv_cache: bool = True # whether to use key/value cache
71
+ bytes_per_param: int = BYTES_FP16 # model weight bytes
72
+ layernorm_dtype_bytes: int = BYTES_FP16 # layernorm data type bytes
73
+ kv_cache_dtype_bytes: int = BYTES_FP16 # key/value cache data type bytes
74
+ def __post_init__(self):
75
+ if self.context_len is None:
76
+ self.context_len = self.seq_len + self.generate_len
77
+
78
+ @dataclass
79
+ class ParallelismConfig:
80
+ """Parallelism configuration: tensor/pipeline/data/sequence parallel sizes."""
83
+ tp_size: int = 1 # tensor parallelism size, Megatron-LM tensor parallelism implementation
84
+ pp_size: int = 1 # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
85
+ dp_size: int = 1 # data parallelism size, DeepSpeed Zero parallelism implementation
86
+ sp_size: int = 1 # sequence parallelism size, Megatron-LM sequence parallelism implementation
87
+
88
+ @dataclass
89
+ class ModelConfig:
90
+ num_layers: int # number of transformer layers (blocks)
91
+ n_head: int # number of attention heads
92
+ hidden_dim: int # hidden dimension
93
+ vocab_size: int # vocabulary size
94
+ num_key_value_heads: int = None
95
+ max_seq_len: int = None # max sequence length
96
+ ffn_embed_dim: int = None # hidden dimension of FFN, default to 4 * hidden_dim
97
+ model_type: str = None # model type as tagged on Hugging Face (e.g., gpt2, opt, llama.)
98
+ model_name: str = None # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b.)
99
+
100
+ def __post_init__(self):
101
+ if self.num_key_value_heads is None: # default to n_head when not specified
102
+ self.num_key_value_heads = self.n_head
103
+
104
+ if self.ffn_embed_dim is None:
105
+ self.ffn_embed_dim = self.hidden_dim * 4
106
+
107
+ @dataclass
108
+ class GPUConfig:
109
+ # 1. GPU model and device memory size
110
+ name: str # GPU config name
111
+ memory_GPU_in_GB: float # memory per GPU in GB
112
+
113
+ # 2. GPU HBM bandwidth, intra-node bandwidth, inter-node bandwidth
114
+ hbm_bandwidth_in_GB_per_sec: float # GPU HBM bandwidth in GB/s
115
+ intra_node_bandwidth_in_GB_per_sec: float # intra node GPU bandwidth in GB/s.(PCIE/NVLINK)
116
+ intra_node_min_message_latency: float # minimum intra node message latency in seconds
117
+
118
+ inter_node_bandwidth_in_GB_per_sec: float = 200 # inter node bandwidth in GB/s, assuming Mellanox 200Gbps HDR Infiniband
119
+
120
+ # 3. Tensor Core compute performance at different precisions
121
+ peak_fp32_TFLOPS: float = None # peak Tensor TFLOPS for FP32
122
+ peak_fp16_TFLOPS: float = None # peak Tensor TFLOPS for FP16
123
+ peak_int8_TFLOPS: float = None # peak Tensor TFLOPS for INT8
124
+ peak_int4_TFLOPS: float = None # peak Tensor TFLOPS for INT4
125
+
126
+ FLOPS_EFFICIENCY = 0.7
127
+ HBM_MEMORY_EFFICIENCY = 0.9
128
+
129
+ def __post_init__(self):
130
+ """object creation of DataClass starts with __init__() (constructor-calling) and
131
+ ends with __post_init__() (post-init processing).
132
+ """
133
+ if self.peak_fp32_TFLOPS is None:
134
+ self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2)
135
+ if self.peak_int8_TFLOPS is None:
136
+ self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
137
+ if self.peak_int4_TFLOPS is None:
138
+ self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS
139
+
140
+ if self.FLOPS_EFFICIENCY:
141
+ self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY
142
+ self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY
143
+ self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY
144
+ self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY
145
+ if self.HBM_MEMORY_EFFICIENCY:
146
+ self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
147
+ self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
148
+ class LLMConfigs(object):
149
+ def __init__(self, gpu_config: GPUConfig,
150
+ model_config: ModelConfig,
151
+ parallelism_config: ParallelismConfig = ParallelismConfig(),
152
+ inference_config: InferenceConfig = InferenceConfig(),
153
+ gpu_efficiency_config: GPUEfficiencyConfig = GPUEfficiencyConfig()
154
+ ) -> None:
155
+ self.model_config = model_config
156
+ self.gpu_config = gpu_config
157
+ self.parallelism_config = parallelism_config
158
+ self.inference_config = inference_config # user-specified configuration
159
+ self.gpu_efficiency_config = gpu_efficiency_config # user-specified configuration
160
+
161
+ def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> dict:
162
+ """Read model and gpu configs from a json file."""
163
+ config_files = ["configs/model_configs.json", "configs/gpu_configs.json"]
164
+ model_config, gpu_config = {}, {}
165
+
166
+ for config_filename in config_files:
167
+ with open(config_filename, "r") as f:
168
+ config_json = json.load(f)
169
+
170
+ if "model" in config_filename:
171
+ assert model_name in config_json, f"model name {model_name} not found in {config_filename}"
172
+ config_dict = config_json[model_name]
173
+ model_config = ModelConfig(**config_dict)
174
+
175
+ elif "gpu" in config_filename:
176
+ assert gpu_name in config_json, f"gpu name {gpu_name} not found in {config_filename}"
177
+ config_dict = config_json[gpu_name]
178
+ gpu_config = GPUConfig(**config_dict)
179
+ else:
180
+ assert False, f"unknown config file: {config_filename}"
181
+
182
+ return model_config, gpu_config
183
+
184
+ def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float:
185
+ """Get the expected TFLOPS per GPU for the specified data type
186
+ configuration/GPU (adjusted by flops_efficiency)
187
+
188
+ Returns:
189
+ float: TFLOPS per GPU and unit is T.
190
+ """
191
+ if data_type == "int8":
192
+ gemm_TFLOPS = gpu_config.peak_int8_TFLOPS
193
+ elif data_type == "fp16":
194
+ gemm_TFLOPS = gpu_config.peak_fp16_TFLOPS
195
+ else:
196
+ print("data_type must be 'int8' or 'fp16'!")
197
+
198
+ return gemm_TFLOPS * flops_efficiency
199
+
200
+ def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float:
201
+ return (
202
+ gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency
203
+ )
204
+
205
+ def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float:
206
+ return (
207
+ gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency
208
+ )
209
+
210
+ def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float:
211
+ return (
212
+ gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency
213
+ )
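A minimal usage sketch of the helpers defined in config.py above, assuming it is run from the repo root so the relative configs/*.json paths resolve; the variable names are illustrative only:

from config import (
    LLMConfigs,
    get_model_and_gpu_config_by_name,
    get_TFLOPS_per_gpu,
    get_gpu_hbm_bandwidth,
)

model_config, gpu_config = get_model_and_gpu_config_by_name("llama-13b", "a100-sxm-80gb")
llm_configs = LLMConfigs(gpu_config=gpu_config, model_config=model_config)

# Note: GPUConfig.__post_init__ already scales peak TFLOPS and HBM bandwidth by its
# class-level FLOPS_EFFICIENCY / HBM_MEMORY_EFFICIENCY, so the efficiency arguments
# below are applied on top of that.
print(get_TFLOPS_per_gpu(gpu_config, data_type="fp16", flops_efficiency=0.7))  # effective TFLOPS
print(get_gpu_hbm_bandwidth(gpu_config, hbm_memory_efficiency=0.9))            # effective GB/s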
configs/gpu_configs.json ADDED
@@ -0,0 +1,163 @@
1
+ {
2
+ "t4-pcie-15gb": {
3
+ "name": "t4-pcie-15gb",
4
+ "memory_GPU_in_GB": 15,
5
+ "hbm_bandwidth_in_GB_per_sec": 300,
6
+ "intra_node_bandwidth_in_GB_per_sec": 32,
7
+ "peak_fp16_TFLOPS": 65,
8
+ "peak_int8_TFLOPS": 130,
9
+ "peak_int4_TFLOPS": 260,
10
+ "intra_node_min_message_latency": 8e-06
11
+ },
12
+ "v100-pcie-32gb": {
13
+ "name": "v100-pcie-32gb",
14
+ "memory_GPU_in_GB": 32,
15
+ "hbm_bandwidth_in_GB_per_sec": 900,
16
+ "intra_node_bandwidth_in_GB_per_sec": 32,
17
+ "inter_node_bandwidth_in_GB_per_sec": 200,
18
+ "peak_fp16_TFLOPS": 112,
19
+ "peak_int8_TFLOPS": 224,
20
+ "peak_int4_TFLOPS": 448,
21
+ "intra_node_min_message_latency": 8e-06
22
+ },
23
+ "v100-sxm-32gb": {
24
+ "name": "v100-sxm-32gb",
25
+ "memory_GPU_in_GB": 32,
26
+ "hbm_bandwidth_in_GB_per_sec": 900,
27
+ "intra_node_bandwidth_in_GB_per_sec": 300,
28
+ "inter_node_bandwidth_in_GB_per_sec": 200,
29
+ "peak_fp16_TFLOPS": 112,
30
+ "peak_int8_TFLOPS": 224,
31
+ "peak_int4_TFLOPS": 448,
32
+ "intra_node_min_message_latency": 8e-06
33
+ },
34
+ "br104p": {
35
+ "name": "br104p",
36
+ "memory_GPU_in_GB": 32,
37
+ "hbm_bandwidth_in_GB_per_sec": 819,
38
+ "intra_node_bandwidth_in_GB_per_sec": 192,
39
+ "inter_node_bandwidth_in_GB_per_sec": 200,
40
+ "peak_fp32_TFLOPS": 256,
41
+ "peak_fp16_TFLOPS": 512,
42
+ "peak_int8_TFLOPS": 1024,
43
+ "intra_node_min_message_latency": 8e-06
44
+ },
45
+ "a100-pcie-40gb": {
46
+ "name": "a100-pcie-40gb",
47
+ "memory_GPU_in_GB": 40,
48
+ "hbm_bandwidth_in_GB_per_sec": 1555,
49
+ "intra_node_bandwidth_in_GB_per_sec": 64,
50
+ "inter_node_bandwidth_in_GB_per_sec": 200,
51
+ "peak_fp32_TFLOPS": 156,
52
+ "peak_fp16_TFLOPS": 312,
53
+ "peak_int8_TFLOPS": 624,
54
+ "peak_int4_TFLOPS": 1248,
55
+ "intra_node_min_message_latency": 8e-06
56
+ },
57
+ "a100-sxm-40gb": {
58
+ "name": "a100-sxm-40gb",
59
+ "memory_GPU_in_GB": 40,
60
+ "hbm_bandwidth_in_GB_per_sec": 1555,
61
+ "intra_node_bandwidth_in_GB_per_sec": 600,
62
+ "inter_node_bandwidth_in_GB_per_sec": 200,
63
+ "peak_fp32_TFLOPS": 156,
64
+ "peak_fp16_TFLOPS": 312,
65
+ "peak_int8_TFLOPS": 624,
66
+ "peak_int4_TFLOPS": 1248,
67
+ "intra_node_min_message_latency": 8e-06
68
+ },
69
+ "a100-pcie-80gb": {
70
+ "name": "a100-pcie-80gb",
71
+ "memory_GPU_in_GB": 80,
72
+ "hbm_bandwidth_in_GB_per_sec": 1935,
73
+ "intra_node_bandwidth_in_GB_per_sec": 64,
74
+ "inter_node_bandwidth_in_GB_per_sec": 200,
75
+ "peak_fp32_TFLOPS": 156,
76
+ "peak_fp16_TFLOPS": 312,
77
+ "peak_int8_TFLOPS": 624,
78
+ "peak_int4_TFLOPS": 1248,
79
+ "intra_node_min_message_latency": 8e-06
80
+ },
81
+ "a100-sxm-80gb": {
82
+ "name": "a100-sxm-80gb",
83
+ "memory_GPU_in_GB": 80,
84
+ "hbm_bandwidth_in_GB_per_sec": 2039,
85
+ "intra_node_bandwidth_in_GB_per_sec": 600,
86
+ "inter_node_bandwidth_in_GB_per_sec": 200,
87
+ "peak_fp32_TFLOPS": 156,
88
+ "peak_fp16_TFLOPS": 312,
89
+ "peak_int8_TFLOPS": 624,
90
+ "peak_int4_TFLOPS": 1248,
91
+ "intra_node_min_message_latency": 8e-06
92
+ },
93
+ "910b-64gb": {
94
+ "name": "910b-64gb",
95
+ "memory_GPU_in_GB": 64,
96
+ "hbm_bandwidth_in_GB_per_sec": 460,
97
+ "intra_node_bandwidth_in_GB_per_sec": 392,
98
+ "inter_node_bandwidth_in_GB_per_sec": 200,
99
+ "peak_fp32_TFLOPS": 188,
100
+ "peak_fp16_TFLOPS": 376,
101
+ "peak_int8_TFLOPS": 752,
102
+ "peak_int4_TFLOPS": 1504,
103
+ "intra_node_min_message_latency": 9e-06
104
+ },
105
+ "h100-sxm-80gb": {
106
+ "name": "h100-sxm-80gb",
107
+ "memory_GPU_in_GB": 80,
108
+ "hbm_bandwidth_in_GB_per_sec": 3430,
109
+ "intra_node_bandwidth_in_GB_per_sec": 900,
110
+ "inter_node_bandwidth_in_GB_per_sec": 200,
111
+ "peak_fp32_TFLOPS": 989,
112
+ "peak_fp16_TFLOPS": 1979,
113
+ "peak_int8_TFLOPS": 3958,
114
+ "intra_node_min_message_latency": 8e-06
115
+ },
116
+ "h100-pcie-80gb": {
117
+ "name": "h100-pcie-80gb",
118
+ "memory_GPU_in_GB": 80,
119
+ "hbm_bandwidth_in_GB_per_sec": 2048,
120
+ "intra_node_bandwidth_in_GB_per_sec": 128,
121
+ "inter_node_bandwidth_in_GB_per_sec": 200,
122
+ "peak_fp32_TFLOPS": 756,
123
+ "peak_fp16_TFLOPS": 1513,
124
+ "peak_int8_TFLOPS": 3026,
125
+ "intra_node_min_message_latency": 8e-06
126
+ },
127
+ "a30-pcie-24gb": {
128
+ "name": "a30-pcie-24gb",
129
+ "memory_GPU_in_GB": 24,
130
+ "hbm_bandwidth_in_GB_per_sec": 933,
131
+ "intra_node_bandwidth_in_GB_per_sec": 64,
132
+ "inter_node_bandwidth_in_GB_per_sec": 200,
133
+ "peak_fp32_TFLOPS": 82,
134
+ "peak_fp16_TFLOPS": 165,
135
+ "peak_int8_TFLOPS": 330,
136
+ "peak_int4_TFLOPS": 661,
137
+ "intra_node_min_message_latency": 8e-06
138
+ },
139
+ "a30-sxm-24gb": {
140
+ "name": "a30-sxm-24gb",
141
+ "memory_GPU_in_GB": 24,
142
+ "hbm_bandwidth_in_GB_per_sec": 933,
143
+ "intra_node_bandwidth_in_GB_per_sec": 200,
144
+ "inter_node_bandwidth_in_GB_per_sec": 200,
145
+ "peak_fp32_TFLOPS": 82,
146
+ "peak_fp16_TFLOPS": 165,
147
+ "peak_int8_TFLOPS": 330,
148
+ "peak_int4_TFLOPS": 661,
149
+ "intra_node_min_message_latency": 8e-06
150
+ },
151
+ "a40-pcie-48gb": {
152
+ "name": "a40-pcie-48gb",
153
+ "memory_GPU_in_GB": 44.98,
154
+ "hbm_bandwidth_in_GB_per_sec": 696,
155
+ "intra_node_bandwidth_in_GB_per_sec": 64,
156
+ "inter_node_bandwidth_in_GB_per_sec": 200,
157
+ "peak_fp32_TFLOPS": 74.8,
158
+ "peak_fp16_TFLOPS": 149.7,
159
+ "peak_int8_TFLOPS": 299.3,
160
+ "peak_int4_TFLOPS": 598.7,
161
+ "intra_node_min_message_latency": 8e-06
162
+ }
163
+ }
configs/gpu_perf.ini ADDED
@@ -0,0 +1,25 @@
+ [T4]
+ gpu_memory=16GB
+ single_precision=8.1TFLOPS
+ gpu_memory_bandwidth=300GB/s
+ interconnect_bandwidth=32GB/s
+ [L4]
+ gpu_memory=24GB
+ single_precision=24TFLOPS
+ gpu_memory_bandwidth=300GB/s
+ interconnect_bandwidth=64GB/s
+ [L40]
+ gpu_memory=48GB
+ single_precision=90.5TFLOPS
+ gpu_memory_bandwidth=864GB/s
+ interconnect_bandwidth=64GB/s
+ [V100]
+ gpu_memory=32GB
+ single_precision=14TFLOPS
+ gpu_memory_bandwidth=900GB/s
+ interconnect_bandwidth=32GB/s
+ [A100]
+ gpu_memory=80GB
+ single_precision=19.5TFLOPS
+ gpu_memory_bandwidth=1935GB/s
+ interconnect_bandwidth=64GB/s
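gpu_perf.ini is not referenced by the other files in this commit; a minimal sketch, assuming it is meant to be read with the standard library's configparser:

import configparser

parser = configparser.ConfigParser()
parser.read("configs/gpu_perf.ini")

for gpu in parser.sections():                  # T4, L4, L40, V100, A100
    memory = parser[gpu]["gpu_memory"]         # values keep their unit suffix, e.g. "16GB"
    bandwidth = parser[gpu]["gpu_memory_bandwidth"]
    print(f"{gpu}: memory={memory}, hbm_bandwidth={bandwidth}")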
configs/model_configs.json ADDED
@@ -0,0 +1,204 @@
1
+ {
2
+ "opt-1.3b":{
3
+ "num_layers": 24,
4
+ "n_head": 32,
5
+ "hidden_dim": 2048,
6
+ "vocab_size": 50272,
7
+ "max_seq_len": 2048,
8
+ "ffn_embed_dim": 8192,
9
+ "model_type": "opt",
10
+ "model_name": "opt-1.3b"
11
+ },
12
+ "opt-6.7b":{
13
+ "num_layers": 32,
14
+ "n_head": 32,
15
+ "hidden_dim": 4096,
16
+ "vocab_size": 50272,
17
+ "max_seq_len": 2048,
18
+ "ffn_embed_dim": 16384,
19
+ "model_type": "opt",
20
+ "model_name": "opt-6.7b"
21
+ },
22
+ "opt-13b":{
23
+ "num_layers": 40,
24
+ "n_head": 40,
25
+ "hidden_dim": 5120,
26
+ "vocab_size": 50272,
27
+ "max_seq_len": 2048,
28
+ "ffn_embed_dim": 20480,
29
+ "model_type": "opt",
30
+ "model_name": "opt-13b"
31
+ },
32
+ "opt-66b":{
33
+ "num_layers": 64,
34
+ "n_head": 72,
35
+ "hidden_dim": 9216,
36
+ "vocab_size": 50272,
37
+ "max_seq_len": 2048,
38
+ "ffn_embed_dim": 36864,
39
+ "model_type": "opt",
40
+ "model_name": "opt-66b"
41
+ },
42
+ "opt-175b":{
43
+ "max_seq_len": 2048,
44
+ "num_layers": 96,
45
+ "n_head": 96,
46
+ "hidden_dim": 12288,
47
+ "vocab_size": 50272,
48
+ "ffn_embed_dim": 49152,
49
+ "model_type": "opt",
50
+ "model_name": "opt-175b"
51
+ },
52
+ "gpt2":{
53
+ "num_layers": 12,
54
+ "n_head": 12,
55
+ "hidden_dim": 768,
56
+ "vocab_size": 50257,
57
+ "max_seq_len": 1024,
58
+ "ffn_embed_dim": 3072,
59
+ "model_type": "gpt2",
60
+ "model_name": "gpt2"
61
+ },
62
+ "gpt2-medium":{
63
+ "num_layers": 24,
64
+ "n_head": 16,
65
+ "hidden_dim": 1024,
66
+ "vocab_size": 50257,
67
+ "max_seq_len": 1024,
68
+ "ffn_embed_dim": 4096,
69
+ "model_type": "gpt2",
70
+ "model_name": "gpt2-medium"
71
+ },
72
+ "gpt2-large":{
73
+ "num_layers": 36,
74
+ "n_head": 20,
75
+ "hidden_dim": 1280,
76
+ "vocab_size": 50257,
77
+ "max_seq_len": 1024,
78
+ "ffn_embed_dim": 5120,
79
+ "model_type": "gpt2",
80
+ "model_name": "gpt2-large"
81
+ },
82
+ "gpt2-xl":{
83
+ "num_layers": 48,
84
+ "n_head": 25,
85
+ "hidden_dim": 1600,
86
+ "vocab_size": 50257,
87
+ "max_seq_len": 1024,
88
+ "ffn_embed_dim": 6400,
89
+ "model_type": "gpt2",
90
+ "model_name": "gpt2-xl"
91
+ },
92
+ "bloom-560m":{
93
+ "num_layers": 24,
94
+ "n_head": 16,
95
+ "hidden_dim": 1024,
96
+ "vocab_size": 250880,
97
+ "max_seq_len": null,
98
+ "ffn_embed_dim": 4096,
99
+ "model_type": "bloom",
100
+ "model_name": "bloom-560m"
101
+ },
102
+ "bloom-7b":{
103
+ "num_layers": 30,
104
+ "n_head": 32,
105
+ "hidden_dim": 4096,
106
+ "vocab_size": 250880,
107
+ "max_seq_len": null,
108
+ "ffn_embed_dim": 16384,
109
+ "model_type": "bloom",
110
+ "model_name": "bloom-7b"
111
+ },
112
+ "bloom-175b":{
113
+ "num_layers": 96,
114
+ "n_head": 96,
115
+ "hidden_dim": 12288,
116
+ "vocab_size": 250880,
117
+ "ffn_embed_dim": 49152,
118
+ "model_type": "bloom",
119
+ "model_name": "bloom-175b"
120
+ },
121
+ "llama-7b":{
122
+ "num_layers": 32,
123
+ "n_head": 32,
124
+ "hidden_dim": 4096,
125
+ "vocab_size": 32000,
126
+ "max_seq_len": 2048,
127
+ "ffn_embed_dim": 16384,
128
+ "model_type": "llama"
129
+ },
130
+ "llama-13b":{
131
+ "num_layers": 40,
132
+ "n_head": 40,
133
+ "hidden_dim": 5120,
134
+ "vocab_size": 32000,
135
+ "max_seq_len": 2048,
136
+ "ffn_embed_dim": 20480,
137
+ "model_type": "llama",
138
+ "model_name": "llama-13b"
139
+ },
140
+ "llama-30b":{
141
+ "num_layers": 60,
142
+ "n_head": 52,
143
+ "hidden_dim": 6656,
144
+ "vocab_size": 32000,
145
+ "max_seq_len": 2048,
146
+ "ffn_embed_dim": 26624,
147
+ "model_type": "llama",
148
+ "model_name": "llama-30b"
149
+ },
150
+ "llama-65b":{
151
+ "num_layers": 80,
152
+ "n_head": 64,
153
+ "hidden_dim": 8192,
154
+ "vocab_size": 32000,
155
+ "max_seq_len": 2048,
156
+ "ffn_embed_dim": 32768,
157
+ "model_type": "llama",
158
+ "model_name": "llama-65b"
159
+ },
160
+ "llama2-13b":{
161
+ "num_layers": 40,
162
+ "n_head": 40,
163
+ "num_key_value_heads": 40,
164
+ "hidden_dim": 5120,
165
+ "ffn_embed_dim": 20480,
166
+ "vocab_size": 32000,
167
+ "max_seq_len": 4096,
168
+ "model_type": "llama",
169
+ "model_name": "llama2-13b"
170
+ },
171
+ "llama2-70b":{
172
+ "num_layers": 80,
173
+ "n_head": 64,
174
+ "num_key_value_heads": 8,
175
+ "hidden_dim": 8192,
176
+ "ffn_embed_dim": 32768,
177
+ "vocab_size": 49960,
178
+ "max_seq_len": 4096,
179
+ "model_type": "llama2",
180
+ "model_name": "llama2-70b"
181
+ },
182
+ "baichuan2-13b": {
183
+ "num_layers": 40,
184
+ "n_head": 40,
185
+ "num_key_value_heads": 40,
186
+ "hidden_dim": 5120,
187
+ "ffn_embed_dim": 13696,
188
+ "vocab_size": 125696,
189
+ "max_seq_len": 4096,
190
+ "model_type": "baichuan",
191
+ "model_name": "baichuan2-13b"
192
+ },
193
+ "internlm-20b": {
194
+ "num_layers": 60,
195
+ "n_head": 40,
196
+ "num_key_value_heads": 40,
197
+ "hidden_dim": 5120,
198
+ "ffn_embed_dim": 20480,
199
+ "vocab_size": 103168,
200
+ "max_seq_len": 16384,
201
+ "model_type": "llama",
202
+ "model_name": "internlm-20b"
203
+ }
204
+ }
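A quick worked example (illustrative, not part of the commit): estimating a model's parameter count from one of these entries with the approximation used later in llm_profiler.py, params ~ vocab_size * hidden_dim + 12 * hidden_dim**2 * num_layers:

import json

with open("configs/model_configs.json") as f:
    cfg = json.load(f)["llama-13b"]

h, l, V = cfg["hidden_dim"], cfg["num_layers"], cfg["vocab_size"]
params = V * h + 12 * h**2 * l
print(f"llama-13b ~= {params / 1e9:.1f}B parameters")   # ~12.7B, consistent with the 13B name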
constants.py ADDED
@@ -0,0 +1,28 @@
+ #########################################
+ ####### llm profiler ############
+ #########################################
+
+ FLOPS_EFFICIENCY = 1.0 # FLOPS efficiency achieved by Megatron-LM is ~0.5 for LLM training
+ HBM_MEMORY_EFFICIENCY = 1 # GPU HBM memory efficiency
+ INTRA_NODE_MEMORY_EFFICIENCY = 1.0 # intra-node (nvlink) memory efficiency
+ INTER_NODE_MEMORY_EFFICIENCY = 1.0 # inter-node memory efficiency
+
+ NUM_GPUS_PER_NODE = 8 # number of GPUs per node
+
+ TOLERANCE = 0.01 # tolerance for floating point comparisons
+
+ BITS_PER_BYTE = 8 # number of bits in a byte
+
+ BITS_FP32 = 32 # number of bits in FP32 data type
+ BITS_FP16 = 16 # number of bits in FP16 data type
+ BITS_INT8 = 8 # number of bits in INT8 data type
+ BITS_INT4 = 4 # number of bits in INT4 data type
+
+ BYTES_FP32 = BITS_FP32 // BITS_PER_BYTE # number of bytes in FP32 data type
+ BYTES_FP16 = BITS_FP16 // BITS_PER_BYTE # number of bytes in FP16 data type
+ BYTES_INT8 = BITS_INT8 // BITS_PER_BYTE # number of bytes in INT8 data type
+ BYTES_INT4 = BITS_INT4 // BITS_PER_BYTE # number of bytes in INT4 data type
+
+ PRINT_LINE_WIDTH = 100
+
+ GPUS = [1, 2, 4, 8]
interface.py ADDED
@@ -0,0 +1,175 @@
1
+ import gradio as gr
2
+ import io
3
+ import logging
4
+
5
+ from llm_profiler import *
6
+ import sys
7
+ from contextlib import redirect_stdout
8
+
9
+ # Model list
10
+ model_names = [
11
+ "opt-1.3b",
12
+ "opt-6.7b",
13
+ "opt-13b",
14
+ "opt-66b",
15
+ "opt-175b",
16
+ "gpt2",
17
+ "gpt2-medium",
18
+ "gpt2-large",
19
+ "gpt2-xl",
20
+ "bloom-560m",
21
+ "bloom-7b",
22
+ "bloom-175b",
23
+ "llama-7b",
24
+ "llama-13b",
25
+ "llama-30b",
26
+ "llama-65b",
27
+ "llama2-13b",
28
+ "llama2-70b",
29
+ "internlm-20b",
30
+ "baichuan2-13b",
31
+ ]
32
+ # GPU list
33
+ gpu_names = [
34
+ "t4-pcie-15gb",
35
+ "v100-pcie-32gb",
36
+ "v100-sxm-32gb",
37
+ "br104p",
38
+ "a100-pcie-40gb",
39
+ "a100-sxm-40gb",
40
+ "a100-pcie-80gb",
41
+ "a100-sxm-80gb",
42
+ "910b-64gb",
43
+ "h100-sxm-80gb",
44
+ "h100-pcie-80gb",
45
+ "a30-pcie-24gb",
46
+ "a30-sxm-24gb",
47
+ "a40-pcie-48gb",
48
+ ]
49
+
50
+
51
+ # Create a logging handler that writes log messages into a StringIO object
52
+ class StringHandler(logging.Handler):
53
+ def __init__(self):
54
+ super().__init__()
55
+ self.stream = io.StringIO()
56
+ self.setFormatter(logging.Formatter("%(message)s"))
57
+
58
+ def emit(self, record):
59
+ self.stream.write(self.format(record) + "\n")
60
+
61
+ def get_value(self):
62
+ return self.stream.getvalue()
63
+
64
+
65
+ # Create a logger and attach the StringHandler
66
+ logger = logging.getLogger(__name__)
67
+ logger.setLevel(logging.INFO)
68
+ string_handler = StringHandler()
69
+ logger.addHandler(string_handler)
70
+
71
+
72
+ def gradio_interface(
73
+ model_name="llama2-70b",
74
+ gpu_name: str = "t4-pcie-15gb",
75
+ bytes_per_param: int = BYTES_FP16,
76
+ batch_size_per_gpu: int = 2,
77
+ seq_len: int = 300,
78
+ generate_len: int = 40,
79
+ ds_zero: int = 0,
80
+ dp_size: int = 1,
81
+ tp_size: int = 4,
82
+ pp_size: int = 1,
83
+ sp_size: int = 1,
84
+ use_kv_cache: bool = True,
85
+ layernorm_dtype_bytes: int = BYTES_FP16,
86
+ kv_cache_dtype_bytes: int = BYTES_FP16,
87
+ flops_efficiency: float = FLOPS_EFFICIENCY,
88
+ hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
89
+ intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
90
+ inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
91
+ mode: str = "inference",
92
+ print_flag: bool = True,
93
+ ) -> list:
94
+ # Clear the StringIO buffer
95
+ string_handler.stream.seek(0)
96
+ string_handler.stream.truncate()
97
+
98
+ # Redirect sys.stdout to the StringHandler's stream
99
+ original_stdout = sys.stdout
100
+ sys.stdout = string_handler.stream
101
+
102
+ # Call the inference profiling function
103
+ results = llm_profile_infer(
104
+ model_name,
105
+ gpu_name,
106
+ bytes_per_param,
107
+ batch_size_per_gpu,
108
+ seq_len,
109
+ generate_len,
110
+ ds_zero,
111
+ dp_size,
112
+ tp_size,
113
+ pp_size,
114
+ sp_size,
115
+ use_kv_cache,
116
+ layernorm_dtype_bytes,
117
+ kv_cache_dtype_bytes,
118
+ flops_efficiency,
119
+ hbm_memory_efficiency,
120
+ intra_node_memory_efficiency,
121
+ inter_node_memory_efficiency,
122
+ mode,
123
+ print_flag,
124
+ )
125
+
126
+ # Restore sys.stdout
127
+ sys.stdout = original_stdout
128
+
129
+ # Get the captured log messages
130
+ log_output = string_handler.get_value()
131
+
132
+ # Return the inference results and the log output
133
+ return results, log_output
134
+
135
+
136
+ # Create the Gradio interface
137
+ iface = gr.Interface(
138
+ fn=gradio_interface,
139
+ inputs=[
140
+ gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
141
+ gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
142
+ gr.Number(label="Bytes per Param", value=BYTES_FP16),
143
+ gr.Number(label="Batch Size per GPU", value=2),
144
+ gr.Number(label="Sequence Length", value=300),
145
+ gr.Number(label="Generate Length", value=40),
146
+ gr.Number(label="DS Zero", value=0),
147
+ gr.Number(label="DP Size", value=1),
148
+ gr.Number(label="TP Size", value=4),
149
+ gr.Number(label="PP Size", value=1),
150
+ gr.Number(label="SP Size", value=1),
151
+ gr.Checkbox(label="Use KV Cache", value=True),
152
+ gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
153
+ gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
154
+ gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
155
+ gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
156
+ gr.Number(
157
+ label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
158
+ ),
159
+ gr.Number(
160
+ label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
161
+ ),
162
+ gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
163
+ gr.Checkbox(label="Print Flag", value=True),
164
+ ],
165
+ outputs=[
166
+ gr.Textbox(label="Inference Results"), # labeled inference results output
167
+ gr.Textbox(label="Detailed Analysis"), # labeled log output
168
+ ],
169
+ title="LLM Profiler",
170
+ description="Input parameters to profile your LLM.",
171
+ )
172
+
173
+ # Launch the Gradio interface
174
+ iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)
175
+ # iface.launch()
llm_profiler.py ADDED
@@ -0,0 +1,1274 @@
1
+ # -*- coding: utf-8 -*-
2
+ # author : honggao.zhang
3
+ # Create : 2023-7-19
4
+ # Version : 0.1.0
5
+ # Description : transformer model (LLM) profiling tool; estimates a model's FLOPs, memory usage, and latency.
6
+ # Reference : https://github.com/cli99/llm-analysis
7
+
8
+ import logging
9
+ from pprint import pformat
10
+ import pprint
11
+ import pandas as pd
12
+ import os
13
+
14
+ from config import *
15
+ from utils import *
16
+ from math import floor
17
+
18
+ logger = logging.getLogger()
19
+
20
+ class CountCausalLMParams(object):
21
+ def __init__(self, model_config: ModelConfig) -> None:
22
+ self.h = model_config.hidden_dim
23
+ self.l = model_config.num_layers
24
+ self.V = model_config.vocab_size
25
+
26
+ self.model_config = model_config
27
+
28
+ def count_params_embedding(self, shared_embedding: bool = True) -> int:
29
+ """Get the number of parameters in the embedding layer. params_te = vocab_size * d_model
30
+ Args:
31
+ shared_embedding (bool, optional): whether the output embedding \
32
+ shares weights with the input embedding. Defaults to True.
33
+
34
+ Returns:
35
+ int: the number of parameters in the embedding layer
36
+ """
37
+ num_params_input_embedding = self.V * self.h
38
+ num_params_output_embedding = self.V * self.h if not shared_embedding else 0
39
+
40
+ return num_params_input_embedding + num_params_output_embedding
41
+
42
+ def count_params_per_layer_attn(self) -> int:
43
+ """Get the number of parameters per layer in the attention module
44
+ which include 4 linear layer: query/key/value projection and output matrices.
45
+ params_attn(mha) = params_q + params_k + params_v + params_o = 4 * d_model**2
46
+
47
+ Returns:
48
+ int: the number of parameters per layer in the attention module(mha)
49
+ """
50
+ return 4 * self.h ** 2
51
+
52
+ def count_params_per_layer_mlp(self) -> int:
53
+ """Get the number of parameters in the MLP linear layers, including the
54
+ intermediate and output matrices.
55
+ params_mlp = params_fc1 + params_fc2 = d_model * 4*d_model + 4*d_model * d_model = 8 * d_model**2
56
+
57
+ Returns:
58
+ int: the number of parameters in the two MLP linear layers
59
+ """
60
+
61
+ return 8 * self.h ** 2
62
+
63
+ def count_params_per_layer_ln(self) -> int:
64
+ """Get the number of parameters per layer in the two layer normalization module.
65
+ params_ln = 4 * d_model
66
+
67
+ Returns:
68
+ int: the number of parameters per layer in the two layer normalization module
69
+ """
70
+ return 4 * self.h
71
+
72
+ def count_params_per_layer(self, ln_ignore=True) -> tuple:
73
+ """Get the number of params per layer in the transformer decoder blocks,
74
+ mainly including the attention and MLP layers
75
+
76
+ params_per_layer = params_attn + params_mlp + params_ln
77
+ = 4d_model^2 + 8d_model^2 + 2*4d_model = 12d_model^2 + 8d_model
78
+
79
+ Return:
80
+ int: the number of params per layer in the transformer decoder blocks
81
+ """
82
+ params_per_layer_attn = self.count_params_per_layer_attn()
83
+ params_per_layer_mlp = self.count_params_per_layer_mlp()
84
+ params_per_layer_ln = 0 if ln_ignore else 2 * self.count_params_per_layer_ln()
85
+
86
+ params_per_layer = (
87
+ params_per_layer_attn
88
+ + params_per_layer_mlp
89
+ + params_per_layer_ln
90
+ )
91
+
92
+ dict_params_per_layer = {
93
+ "params_per_layer": params_per_layer,
94
+ "params_attn": params_per_layer_attn,
95
+ "params_mlp": params_per_layer_mlp,
96
+ "params_layernorm": params_per_layer_ln,
97
+ }
98
+
99
+ return params_per_layer, dict_params_per_layer
100
+
101
+ def count_params_model(self) -> int:
102
+ """Get the total number of parameters in the model including all layers and token embedding layer.
103
+ params_model = params_embedding + params_per_layer * num_layers
104
+ = V * d_model + 12 * d_model**2 * num_layers
105
+ Returns:
106
+ int: the total number of parameters in the model
107
+ """
108
+ params_per_layer, dict_params_per_layer = self.count_params_per_layer()
109
+
110
+ return (params_per_layer * self.l
111
+ + self.count_params_embedding()
112
+ )
113
+
114
+ def __call__(self, hidden_dim, num_layers, vocab_size) -> int:
115
+
116
+ return (vocab_size * hidden_dim
117
+ + 12 * hidden_dim ** 2 * num_layers
118
+ )
119
+
120
+
121
+ class CountCausalLMFlops(object):
122
+ """The count is model-specific and does not depend on the parallelism strategy.
123
+ And ignore layer normalization and other element-wise operations."""
124
+ def __init__(self, model_config: ModelConfig, batch_size: int, seq_len: int, simp_count=False) -> None:
125
+ self.h = model_config.hidden_dim
126
+ self.l = model_config.num_layers
127
+ self.V = model_config.vocab_size
128
+
129
+ self.b = batch_size
130
+ self.s = seq_len
131
+
132
+ if not simp_count:
133
+ llm_params = CountCausalLMParams(model_config)
134
+ self.model_flops = llm_params(self.h, self.l, self.V) * 2
135
+
136
+ def count_flops_fwd_per_layer_attn(self, batch_size: int, seq_len: int) -> int:
137
+ """Get the number of floating point operations (flops) for the forward
138
+ pass of the attention module in a transformer layer, given the batch
139
+ size and sequence length.
140
+
141
+ mainly including four linear calculations: query/key/value projection and output
142
+ matrices multiplication、self-attention internal operation, and element-wise operations are ignored.
143
+
144
+ flops_attn = flops_q + flops_k + flops_v + flops_output + flops_self_attention
145
+ = 4(bsh^2) + 2(2bs^2h)
146
+ Args:
147
+ batch_size (int): batch size
148
+ seq_len (int): sequence length
149
+
150
+ Returns:
151
+ int: flops for the forward pass of the attention module in a transformer layer
152
+ """
153
+ return (
154
+ 8 * batch_size * seq_len * self.h ** 2
155
+ + 4 * batch_size * seq_len ** 2 * self.h
156
+ )
157
+
158
+ def count_flops_fwd_per_layer_mlp(self, batch_size: int, seq_len: int) -> int:
159
+ """Count two flops of matrices multiplication(two linear layers in the MLP module.)
160
+
161
+ flops_mlp = flops_fc1 + flops_fc2 = 2bs(4h^2) + 2bs(4h^2) = 16bsh^2
162
+ """
163
+ return 16 * batch_size * seq_len * self.h ** 2
164
+
165
+ def count_flops_fwd_per_layer(self, batch_size: int, seq_len: int, ln_ignore=True) -> tuple:
166
+ flops_fwd_per_layer_attn = self.count_flops_fwd_per_layer_attn(batch_size, seq_len)
167
+ flops_fwd_per_layer_mlp = self.count_flops_fwd_per_layer_mlp(batch_size, seq_len)
168
+ flops_fwd_per_layer_ln = 0
169
+
170
+ flops_fwd_per_layer = (
171
+ flops_fwd_per_layer_attn
172
+ + flops_fwd_per_layer_mlp
173
+ + flops_fwd_per_layer_ln
174
+ )
175
+
176
+ dict_flops_fwd_per_layer = {
177
+ "flops_fwd_per_layer": flops_fwd_per_layer,
178
+ "flops_attn": flops_fwd_per_layer_attn,
179
+ "flops_mlp": flops_fwd_per_layer_mlp,
180
+ "flops_layernorm": flops_fwd_per_layer_ln,
181
+ }
182
+
183
+ return flops_fwd_per_layer, dict_flops_fwd_per_layer
184
+
185
+ def count_flops_logits_layer(self,) -> int:
186
+ """flops of output token logits layer"""
187
+ return 2 * self.b * self.s * self.h * self.V
188
+
189
+ def count_flops_fwd_model(self, batch_size: int, seq_len: int) -> int:
190
+ """Count flops of the forward pass of the transformer model, given the batch size and sequence length."""
191
+ num_flops_fwd_model = (
192
+ self.count_flops_fwd_per_layer(batch_size, seq_len)[0] * self.l
193
+ + self.count_flops_logits_layer()
194
+ )
195
+
196
+ # validate
197
+ assert within_range(
198
+ num_flops_fwd_model,
199
+ (
200
+ 24 * self.b * self.s * self.l * self.h**2
201
+ * (1 + self.s / (6 * self.h) + self.V / (12 * self.l * self.h))
202
+ ),
203
+ TOLERANCE,
204
+ )
205
+
206
+ return num_flops_fwd_model
207
+
208
+ def count_flops_bwd_model(self, batch_size: int, seq_len: int) -> int:
209
+ """Get the number of floating point operations (flops) for the backward
210
+ pass of the entire transformer model, given the batch size and sequence"""
211
+ return 2 * self.count_flops_fwd_model(batch_size, seq_len)
212
+
213
+
214
+ class CountCausalLMMemory(object):
215
+ """Count memory of the model and layers."""
216
+ def __init__(self, llm_configs: LLMConfigs) -> None:
217
+ self.model_config = llm_configs.model_config
218
+ self.h = self.model_config.hidden_dim
219
+ self.l = self.model_config.num_layers
220
+ self.V = self.model_config.vocab_size
221
+
222
+ self.b = llm_configs.inference_config.batch_size_per_gpu
223
+ self.s = llm_configs.inference_config.seq_len
224
+ self.o = llm_configs.inference_config.generate_len
225
+
226
+ self.bytes_per_param = llm_configs.inference_config.bytes_per_param
227
+
228
+ self.tp_size = llm_configs.parallelism_config.tp_size
229
+ self.pp_size = llm_configs.parallelism_config.pp_size
230
+ self.num_layers_per_gpu = int(self.l / self.pp_size)
231
+
232
+ self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9 # converted from GB to bytes
233
+
234
+ self.llm_params = CountCausalLMParams(self.model_config)
235
+
236
+ def count_memory_weights(self, embedding_dtype_bytes: int = BYTES_FP16):
237
+ """Get the memory of the model weights"""
238
+ params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer()
239
+ params_embedding = self.llm_params.count_params_embedding()
240
+
241
+ memory_weight_per_layer = (
242
+ (params_per_layer / self.tp_size) * self.bytes_per_param
243
+ )
244
+ memory_weight_per_gpu = memory_weight_per_layer * self.num_layers_per_gpu
245
+
246
+ memory_embedding = (params_embedding / self.tp_size) * embedding_dtype_bytes
247
+ memory_weight_per_gpu = memory_weight_per_gpu + memory_embedding
248
+
249
+ return memory_weight_per_gpu
250
+
251
+ def count_memory_activation_per_layer_attn(
252
+ self,
253
+ batch_size: int,
254
+ seq_len: int,
255
+ is_inference: bool = True,
256
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL
257
+ ) -> float:
258
+ """Count the memory (in bytes) required to store the activations of the
259
+ attention in a transformer layer, given the batch size, sequence length,
260
+ whether it is inference or training, the activation recomputation strategy,
261
+ and the activation data type.
262
+ """
263
+ if activation_recomputation == ActivationRecomputation.FULL:
264
+ return (batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param
265
+
266
+ def count_memory_activation_per_layer_mlp(
267
+ self,
268
+ is_inference: bool = True,
269
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
270
+ ) -> float:
271
+ """ The `mlp` activations include the input to the two linear layers."""
272
+ if activation_recomputation == ActivationRecomputation.FULL:
273
+ return 0
274
+
275
+ return 0
276
+ def count_memory_activation_per_layer_layernorm(
277
+ self,
278
+ is_inference: bool = True,
279
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
280
+ layernorm_dtype_bytes: int = BYTES_FP16
281
+ ) -> float:
282
+ if activation_recomputation == ActivationRecomputation.FULL:
283
+ return 0
284
+ return 0
285
+
286
+ def count_memory_activation_per_layer(
287
+ self,
288
+ batch_size: int,
289
+ seq_len: int,
290
+ is_inference: bool = True,
291
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
292
+ layernorm_dtype_bytes: int = BYTES_FP16
293
+ ) -> float:
294
+
295
+ if activation_recomputation == ActivationRecomputation.FULL:
296
+ return (
297
+ (batch_size * seq_len * self.h / self.tp_size) * self.bytes_per_param
298
+ )
299
+ return 0
300
+
301
+ def count_memory_kv_cache_per_layer(
302
+ self,
303
+ batch_size: int,
304
+ seq_len: int,
305
+ generate_len: int,
306
+ kv_cache_dtype_bytes: int = BYTES_FP16,
307
+ ) -> float:
308
+ """Get the memory (in bytes) required to store the key and value cache
309
+ for a transformer layer in inference, given the batch size, sequence
310
+ length, activation data type, and tensor parallelism size.
311
+
312
+ memory_kv_cache = 4blh(s+o) unit is byte
313
+ Args:
314
+ batch_size (int): batch size
315
+ context_len (int): seq_len + generate_len
316
+
317
+ Returns:
318
+ float: the memory (in bytes) required to store the key and value cache for a transformer layer in inference
319
+ """
320
+
321
+ return (
322
+ (2 * batch_size * (seq_len + generate_len) * self.h) / self.tp_size
323
+ ) * kv_cache_dtype_bytes
324
+
325
+ def count_memory_per_gpu(
326
+ self,
327
+ batch_size: int,
328
+ seq_len: int,
329
+ generate_len: int,
330
+ is_inference: bool = True,
331
+ use_kv_cache: bool = True,
332
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
333
+ layernorm_dtype_bytes: int = BYTES_FP16,
334
+ kv_cache_dtype_bytes: int = BYTES_FP16
335
+ ) -> tuple:
336
+
337
+ # 1, prefill stage count memory and max_batch_size
338
+
339
+ weight_memory_per_gpu = self.count_memory_weights() # count model weights memory
340
+ memory_left = self.gpu_memory_in_GB - weight_memory_per_gpu
341
+
342
+ prefill_activation_memory_batch_size_1 = ( # count model activations and kv cache memory of prefill stage
343
+ self.count_memory_activation_per_layer(
344
+ 1, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
345
+ )
346
+ * self.num_layers_per_gpu
347
+ )
348
+
349
+ prefill_max_batch_size_per_gpu = int(
350
+ memory_left / prefill_activation_memory_batch_size_1
351
+ )
352
+
353
+ prefill_activation_memory_per_gpu = (
354
+ self.count_memory_activation_per_layer(
355
+ batch_size, seq_len, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
356
+ )
357
+ * self.num_layers_per_gpu
358
+ )
359
+
360
+ assert memory_left > prefill_activation_memory_per_gpu, (
361
+ f"weight_memory_per_gpu {num_to_string(weight_memory_per_gpu)}, activation memory {num_to_string(prefill_activation_memory_per_gpu)} is too large can't fit in GPU memory! memory_left is {num_to_string(memory_left)}!"
362
+ )
363
+
364
+ # 2, decode stage count memory and max_batch_size
365
+ if use_kv_cache:
366
+ kv_cache_memory_batch_size_1 = (
367
+ self.count_memory_kv_cache_per_layer(
368
+ 1,
369
+ seq_len,
+ generate_len,
370
+ kv_cache_dtype_bytes
371
+ )
372
+ * self.num_layers_per_gpu
373
+ )
374
+
375
+ kv_cache_memory_per_gpu = (
376
+ self.count_memory_kv_cache_per_layer(
377
+ batch_size,
378
+ seq_len,
+ generate_len,
379
+ kv_cache_dtype_bytes
380
+ )
381
+ * self.num_layers_per_gpu
382
+ )
383
+
384
+ decode_activation_memory_batch_size_1 = (
385
+ # seq_len 1 is used for decoding
386
+ self.count_memory_activation_per_layer(
387
+ 1, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
388
+ )
389
+ * self.num_layers_per_gpu
390
+ )
391
+
392
+ decode_activation_memory_per_gpu = (
393
+ # seq_len 1 is used for decoding
394
+ self.count_memory_activation_per_layer(
395
+ batch_size, 1, is_inference, ActivationRecomputation.FULL, layernorm_dtype_bytes
396
+ )
397
+ * self.num_layers_per_gpu
398
+ )
399
+
400
+ decode_max_batch_size_per_gpu = int(
401
+ memory_left / (decode_activation_memory_batch_size_1 + kv_cache_memory_batch_size_1)
402
+ )
403
+ max_batch_total_tokens = decode_max_batch_size_per_gpu * (seq_len + generate_len)
404
+
405
+ # llama2-70b uses GQA, so its KV cache only has 8 heads; max_batch_total_tokens can therefore be taken as 16384*8.
406
+ if self.model_config.model_name == "llama2-70b":
407
+ max_batch_total_tokens *= 8
408
+
409
+ assert batch_size <= decode_max_batch_size_per_gpu, (
410
+ f"batch_size_per_gpu {batch_size} is too large to fit"
411
+ " in GPU memory, decode_max_batch_size_per_gpu:"
412
+ f" {decode_max_batch_size_per_gpu}"
413
+ )
414
+
415
+ assert memory_left > (
416
+ kv_cache_memory_per_gpu + decode_activation_memory_per_gpu
417
+ ), ("kv_cache and activation memory with batch_size_per_gpu ="
418
+ f" {batch_size} is too large to fit in GPU memory"
419
+ )
420
+ else:
421
+ # Without a KV cache, the context length is no longer just the newly generated token but the full seq_len + generate_len.
422
+ decode_activation_memory_batch_size_1 = (
423
+ self.count_memory_activation_per_layer(
424
+ 1, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes
425
+ )
426
+ * self.num_layers_per_gpu
427
+ )
428
+ decode_max_batch_size_per_gpu = int(
429
+ memory_left / decode_activation_memory_batch_size_1
430
+ )
431
+ assert batch_size <= decode_max_batch_size_per_gpu, (
432
+ f"batch_size {batch_size} is too large to fit"
433
+ " in GPU memory, decode_max_batch_size_per_gpu:"
434
+ f" {decode_max_batch_size_per_gpu}"
435
+ )
436
+
437
+ decode_activation_memory_per_gpu = (
438
+ self.count_memory_activation_per_layer(
439
+ batch_size, seq_len + generate_len, True, ActivationRecomputation.FULL, layernorm_dtype_bytes
440
+ )
441
+ * self.num_layers_per_gpu
442
+ )
443
+ kv_cache_memory_per_gpu = 0
444
+
445
+ decode_memory_total = (weight_memory_per_gpu + decode_activation_memory_per_gpu + kv_cache_memory_per_gpu)
446
+
447
+ # memory summary
448
+ memory_prefill_summary_dict = {
449
+ "weight_memory_per_gpu": weight_memory_per_gpu,
450
+ "prefill_activation_memory_batch_size_1": prefill_activation_memory_batch_size_1,
451
+ "prefill_max_batch_size_per_gpu": prefill_max_batch_size_per_gpu,
452
+ "prefill_activation_memory_per_gpu": prefill_activation_memory_per_gpu,
453
+ }
454
+
455
+ memory_decode_summary_dict = {
456
+ "weight_memory_per_gpu": weight_memory_per_gpu,
457
+ "decode_activation_memory_per_gpu": decode_activation_memory_per_gpu,
458
+ "kv_cache_memory_per_gpu": kv_cache_memory_per_gpu,
459
+ "decode_memory_total": decode_memory_total,
460
+ "decode_max_batch_size_per_gpu": decode_max_batch_size_per_gpu,
461
+ "max_batch_total_tokens": max_batch_total_tokens * 0.97,
462
+ }
463
+
464
+ return memory_prefill_summary_dict, memory_decode_summary_dict
465
+
466
+
467
+ class CountCausalLMLatency(object):
468
+ """Count latency by roof-line performance model."""
469
+ def __init__(self, llm_configs: LLMConfigs, data_type="fp16") -> None:
470
+ self.model_config = llm_configs.model_config
471
+ self.gpu_config = llm_configs.gpu_config
472
+ self.inference_config = llm_configs.inference_config
473
+ self.parallelism_config = llm_configs.parallelism_config
474
+
475
+ self.h = self.model_config.hidden_dim
476
+ self.l = self.model_config.num_layers
477
+ self.V = self.model_config.vocab_size
478
+
479
+ self.b = llm_configs.inference_config.batch_size_per_gpu
480
+ self.s = llm_configs.inference_config.seq_len
481
+ self.o = llm_configs.inference_config.generate_len
482
+ self.bytes_per_param = llm_configs.inference_config.bytes_per_param
483
+
484
+ self.tp_size = self.parallelism_config.tp_size
485
+ self.pp_size = self.parallelism_config.pp_size
486
+ self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)
487
+
488
+ self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9 # HBM bandwidth, converted from GB/s to bytes/s
489
+ self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9 # interconnect bandwidth, converted from GB/s to bytes/s
490
+ self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12 # converted from TFLOPS to FLOPS
491
+
492
+ self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9 # converted from GB to bytes
493
+
494
+ self.llm_params = CountCausalLMParams(self.model_config)
495
+ self.llm_memory = CountCausalLMMemory(llm_configs)
496
+ self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.o)
497
+
498
+ def common_count_latency_for_ops(
499
+ self,
500
+ batch_size: int,
501
+ seq_len: int,
502
+ is_inference=True,
503
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
504
+ ops_type: str="attn",
505
+ stage="decode_"
506
+ ) -> float:
507
+ """Count the latency for the forward layer or model, assuming the compute and memory operations are perfectly overlapped.
508
+
509
+ Args:
510
+ flops (float): flops of the forward layer or model
511
+ memory (float): r/w memory(bytes) of the forward layer or model
512
+ tp_size (float): tensor parallelism size
513
+ gpu_TFLOPS (float): GPU TFLOPS in T(10^12)FLOPS
514
+ gpu_hbm_bandwidth (float): GPU HBM bandwidth in GB/s(10^9)
515
+
516
+ Returns:
517
+ float: the latency in seconds for the forward pass
518
+ """
519
+
520
+ if ops_type=="attn":
521
+
522
+ flops = self.llm_flops.count_flops_fwd_per_layer_attn(batch_size, seq_len)
523
+ weight_memory = self.llm_params.count_params_per_layer_attn() * self.bytes_per_param
524
+ activation_memory = self.llm_memory.count_memory_activation_per_layer_attn(
525
+ batch_size, seq_len, is_inference, activation_recomputation
526
+ )
527
+ elif ops_type=="mlp":
528
+ flops = self.llm_flops.count_flops_fwd_per_layer_mlp(batch_size, seq_len)
529
+ weight_memory = self.llm_params.count_params_per_layer_mlp() * self.bytes_per_param
530
+ activation_memory = self.llm_memory.count_memory_activation_per_layer_mlp(is_inference, activation_recomputation)
531
+ elif ops_type=="layernorm":
532
+ activation_memory = self.llm_memory.count_memory_activation_per_layer_layernorm(
533
+ is_inference, activation_recomputation) # activation_memory
534
+ weight_memory = 0 # layernorm has no matrix weights, only a small vector weight, which is ignored here
+ flops = 0 # layernorm FLOPs are negligible; the op is memory bound
536
+ else:
+ raise ValueError(f"unsupported ops_type: {ops_type}")
540
+
541
+ memory = weight_memory + activation_memory
542
+
543
+ compute_latency = flops / (self.tp_size * self.gpu_TFLOPS) # in seconds
544
+ memory_latency = memory / (self.tp_size * self.gpu_hbm_bandwidth)
545
+
546
+ if memory_latency > compute_latency:
547
+ print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} > compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is memory bound!")
548
+ else:
549
+ print(f"{stage} stage: memory_latency {latency_to_string(memory_latency)} <= compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is compute bound!")
550
+
551
+ return max(compute_latency, memory_latency)
552
+
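
The roofline rule used above boils down to taking the larger of `flops / achievable_FLOPS` and `bytes / achievable_bandwidth`. A standalone sketch with assumed A100-like hardware numbers (312 TFLOPS fp16, 2 TB/s HBM; both hypothetical here, not read from this repo's configs):

# Roofline sketch: an op is memory bound when its memory latency exceeds its compute latency.
def roofline_latency(flops, bytes_moved, gpu_flops_per_s, hbm_bytes_per_s):
    compute_latency = flops / gpu_flops_per_s
    memory_latency = bytes_moved / hbm_bytes_per_s
    bound = "memory" if memory_latency > compute_latency else "compute"
    return max(compute_latency, memory_latency), bound

# A single decode-step GEMM of shape [1, 4096] x [4096, 4096] in fp16 (assumed sizes).
flops = 2 * 1 * 4096 * 4096            # 2 FLOPs per multiply-accumulate
bytes_moved = 4096 * 4096 * 2          # weight bytes read from HBM
latency, bound = roofline_latency(flops, bytes_moved, 312e12, 2e12)
print(f"{bound}-bound, latency ~= {latency * 1e6:.2f} us")
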
553
+ def count_latency_fwd_per_layer_tp_comm(self, batch_size: int, seq_len: int) -> float:
554
+ """Count the latency of a single allreduce communication across the
555
+ tensor parallel group in the forward pass of a transformer layer.
556
+ The latency is the max of the all-reduce latency and the minimum
+ message latency through the intra-node interconnect.
558
+ """
559
+ is_ring_allreduce = False # use the centralized all-reduce volume estimate below; set True for the ring all-reduce estimate
560
+
561
+ if self.tp_size == 1:
562
+ return 0
563
+
564
+ # \phi is the communicated data volume; for large tp_size the ring all-reduce volume approaches 2*b*s*h
565
+ if is_ring_allreduce:
566
+ num_data_per_all_reduce = (
567
+ 2 * batch_size * seq_len * self.h *
568
+ (self.tp_size - 1) / (self.tp_size)
569
+ )
570
+ else:
571
+ bsh = batch_size * seq_len * self.h
572
+ num_data_per_all_reduce = (
573
+ 6 * bsh * (self.tp_size - 1) / (self.tp_size) +
574
+ 3 * bsh
575
+ )
576
+
577
+ latency_per_all_reduce = (
578
+ num_data_per_all_reduce * self.bytes_per_param
579
+ / (self.gpu_intra_node_bandwidth)
580
+ )
581
+
582
+ # intra_node_min_message_latency: minimum message latency of the intra-node interconnect
583
+ return max(
584
+ latency_per_all_reduce,
585
+ self.gpu_config.intra_node_min_message_latency,
586
+ )
587
+
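
To make the communication term concrete, here is a minimal sketch that turns the same volume estimates (ring vs. the centralized variant used above) into a latency bound. Bandwidth, element size, and the minimum-message-latency floor are assumed illustrative values:

# Per-layer tensor-parallel all-reduce latency sketch (assumed hardware numbers).
def allreduce_latency(batch_size, seq_len, hidden_dim, tp_size,
                      bytes_per_elem=2, intra_node_bw=600e9,
                      min_message_latency=8e-6, ring=False):
    if tp_size == 1:
        return 0.0
    bsh = batch_size * seq_len * hidden_dim
    if ring:
        num_elems = 2 * bsh * (tp_size - 1) / tp_size
    else:  # centralized estimate, mirroring the expression above
        num_elems = 6 * bsh * (tp_size - 1) / tp_size + 3 * bsh
    return max(num_elems * bytes_per_elem / intra_node_bw, min_message_latency)

print(allreduce_latency(2, 1024, 8192, tp_size=8))              # centralized estimate
print(allreduce_latency(2, 1024, 8192, tp_size=8, ring=True))   # ring estimate
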
588
+ def count_latency_fwd_per_layer(
589
+ self,
590
+ batch_size: int,
591
+ seq_len: int,
592
+ is_inference: bool=True,
593
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
594
+ layernorm_dtype_bytes: int = BYTES_FP16,
595
+ stage="decode_"
596
+ ) -> tuple:
597
+ latency_fwd_per_layer_attn = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="attn", stage=stage)
598
+ latency_fwd_per_layer_mlp = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, ops_type="mlp", stage=stage)
599
+ latency_fwd_per_layer_layernorm = self.common_count_latency_for_ops(batch_size, seq_len, is_inference, activation_recomputation, "layernorm", stage=stage)
600
+
601
+ latency_fwd_per_layer_tp_comm = self.count_latency_fwd_per_layer_tp_comm(batch_size, seq_len)
602
+
603
+ latency_per_layer = (
604
+ latency_fwd_per_layer_attn
605
+ + latency_fwd_per_layer_mlp
606
+ + 2 * latency_fwd_per_layer_layernorm # two layernorm layers per transformer block
+ + 2 * latency_fwd_per_layer_tp_comm # two all-reduces per layer, each moving about 2*b*s*h elements
608
+ )
609
+
610
+ dict_latency_per_layer = {
611
+ "latency_per_layer": (latency_per_layer),
612
+ "latency_attn": (latency_fwd_per_layer_attn),
613
+ "latency_mlp": (latency_fwd_per_layer_mlp),
614
+ "latency_layernorm": (2 * latency_fwd_per_layer_layernorm),
615
+ "latency_tp_comm": (2 * latency_fwd_per_layer_tp_comm),
616
+ }
617
+
618
+ return latency_per_layer, dict_latency_per_layer
619
+
620
+ def count_latency_fwd_input_embedding(
621
+ self, batch_size: int, seq_len: int
622
+ ) -> float:
623
+ """Get the latency for the forward pass of the input embedding layer,
624
+ given the batch size, sequence length, and data type of the embedding
625
+ weight.
626
+
627
+ Args:
628
+ batch_size (int): batch size
629
+ seq_len (int): sequence length
631
+
632
+ Returns:
633
+ float: the latency in seconds for the forward pass of the input embedding layer
634
+ """
635
+ memory_latency = (
636
+ self.model_config.vocab_size
637
+ * self.model_config.hidden_dim
638
+ * self.bytes_per_param
639
+ / (self.gpu_hbm_bandwidth)
640
+ )
641
+ comm_latency = self.count_latency_fwd_per_layer_tp_comm(
642
+ batch_size, seq_len
643
+ )
644
+ return memory_latency + comm_latency
645
+
646
+ def count_latency_fwd_output_embedding_loss(
647
+ self, batch_size: int, seq_len: int
648
+ ) -> float:
649
+ """Get the latency for the forward pass of the output embedding layer (computing the logits). The operation is compute bound. With tensor parallelism size > 1, an allgather communicates `batch_size * seq_len` elements, which is ignored here. Refer to https://arxiv.org/abs/1909.08053 for more details.
650
+
651
+ Args:
652
+ batch_size (int): batch size
653
+ seq_len (int): sequence length
654
+
655
+ Returns:
656
+ float: the latency in seconds for the forward pass of the output embedding layer
657
+ """
658
+
659
+ compute_latency = (
660
+ 2 * batch_size * seq_len * self.h * self.V
661
+ / self.tp_size
662
+ / self.gpu_TFLOPS
663
+ )
664
+
665
+ return compute_latency
666
+
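
As a sanity check on the logits term, the same `2 * b * s * h * V / tp` FLOP count can be evaluated standalone. The GPU throughput and efficiency below are assumed, not taken from this repo's configs:

# Output-embedding (logits) latency sketch for one decode step (assumed numbers).
b, s, h, V, tp = 1, 1, 8192, 32000, 8
gpu_flops_per_s = 312e12 * 0.7          # assumed peak fp16 FLOPS times an assumed flops efficiency
compute_latency = 2 * b * s * h * V / tp / gpu_flops_per_s
print(f"logits latency ~= {compute_latency * 1e6:.2f} us per decode step")
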
667
+ def count_latency_kv_cache(
668
+ self,
669
+ batch_size: int,
670
+ seq_len: int,
671
+ generate_len: int,
672
+ use_kv_cache: bool = True,
673
+ kv_cache_dtype_bytes: int = BYTES_FP16
674
+ ) -> tuple:
675
+ """Get the latency for the forward pass of the key and value cache in a transformer layer, given the batch size, sequence length, and whether the key and value cache is used.
676
+
677
+ Args:
678
+ batch_size (int): batch size
679
+ seq_len (int): sequence length
680
+ generate_len (int): number of tokens to generate
681
+ use_kv_cache (bool, optional): whether the key and value cache is used. Defaults to True.
682
+
683
+ Returns:
684
+ float: the latency in seconds for the forward pass of the key and value cache in a transformer layer
685
+ """
686
+ if not use_kv_cache:
687
+ return 0, 0 # callers unpack (kv_cache_avg_latency, kv_cache_peak_latency)
688
+ kv_cache_memory_list_per_gpu, kv_cache_latency_list = [], []
689
+
690
+ for context_len in range(seq_len, seq_len + generate_len + 1):
691
+ kv_cache_memory_per_gpu = (
692
+ self.llm_memory.count_memory_kv_cache_per_layer(
693
+ batch_size,
694
+ context_len,
695
+ kv_cache_dtype_bytes
696
+ ) * self.num_layers_per_gpu
697
+ )
698
+
699
+ kv_cache_latency = (
700
+ kv_cache_memory_per_gpu / self.gpu_hbm_bandwidth
701
+ )
702
+
703
+ kv_cache_memory_list_per_gpu.append(kv_cache_memory_per_gpu)
704
+ kv_cache_latency_list.append(kv_cache_latency)
705
+
706
+ kv_cache_avg_latency = average(kv_cache_latency_list)
707
+ kv_cache_peak_latency = max(kv_cache_latency_list)
708
+
709
+ return kv_cache_avg_latency, kv_cache_peak_latency
710
+
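
The loop above averages the KV-cache read latency as the context grows from `seq_len` to `seq_len + generate_len`. A self-contained sketch of that averaging, using a common per-layer KV-cache size estimate (two tensors of `[batch, context_len, hidden]`) and assumed hardware numbers:

# KV-cache read latency, averaged over the growing context (illustrative numbers).
batch, hidden, num_layers, kv_bytes = 2, 8192, 80, 2
hbm_bytes_per_s = 2e12 * 0.9            # assumed HBM bandwidth times an assumed efficiency
seq_len, generate_len = 1024, 64

latencies = []
for context_len in range(seq_len, seq_len + generate_len + 1):
    kv_cache_bytes = 2 * batch * context_len * hidden * kv_bytes * num_layers
    latencies.append(kv_cache_bytes / hbm_bytes_per_s)

print("avg:", sum(latencies) / len(latencies), "peak:", max(latencies))
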
711
+ def count_latency_fwd_model(
712
+ self,
713
+ batch_size: int,
714
+ seq_len: int,
715
+ is_inference: bool = True,
716
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
717
+ layernorm_dtype_bytes: int = BYTES_FP32,
718
+ breakdown_prefix: str = "",
719
+ ) -> tuple:
720
+ latency_fwd_per_layer, breakdown_per_layer = self.count_latency_fwd_per_layer(
721
+ batch_size,
722
+ seq_len,
723
+ is_inference,
724
+ activation_recomputation,
725
+ layernorm_dtype_bytes,
726
+ stage=breakdown_prefix
727
+ )
728
+ num_layers_per_gpu = self.num_layers_per_gpu
729
+
730
+ latency_fwd_all_layers = latency_fwd_per_layer * self.num_layers_per_gpu
731
+ latency_fwd_input_embedding = self.count_latency_fwd_input_embedding(batch_size, seq_len)
732
+ latency_fwd_output_embedding_loss = self.count_latency_fwd_output_embedding_loss(batch_size, seq_len)
733
+
734
+ model_latency = (
735
+ latency_fwd_all_layers
736
+ + latency_fwd_input_embedding
737
+ + latency_fwd_output_embedding_loss
738
+ )
739
+
740
+ model_latency_breakdown = {
741
+ breakdown_prefix + "latency_fwd_per_layer": breakdown_per_layer,
742
+ breakdown_prefix + "latency_fwd_attn": (breakdown_per_layer["latency_attn"] * num_layers_per_gpu),
743
+ breakdown_prefix + "latency_fwd_mlp": (breakdown_per_layer["latency_mlp"] * num_layers_per_gpu),
744
+ breakdown_prefix + "latency_fwd_layernorm": (breakdown_per_layer["latency_layernorm"] * num_layers_per_gpu),
745
+ breakdown_prefix + "latency_fwd_tp_comm": (breakdown_per_layer["latency_tp_comm"] * num_layers_per_gpu),
746
+ breakdown_prefix + "latency_fwd_input_embedding": (latency_fwd_input_embedding),
747
+ breakdown_prefix + "latency_fwd_output_embedding_loss": (latency_fwd_output_embedding_loss),
748
+ }
749
+
750
+ return model_latency, model_latency_breakdown
751
+
752
+ def count_latency_fwd(
753
+ self,
754
+ batch_size: int,
755
+ seq_len: int,
756
+ generate_len: int,
757
+ use_kv_cache: bool = True,
758
+ kv_cache_dtype_bytes: int = BYTES_FP16,
759
+ is_inference: bool = True,
760
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
761
+ layernorm_dtype_bytes: int = BYTES_FP32,
762
+ ) -> tuple:
763
+ # 1. prefill stage
764
+ prefill_latency, prefill_latency_breakdown = self.count_latency_fwd_model(
765
+ batch_size,
766
+ seq_len,
767
+ is_inference=is_inference,
768
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
769
+ breakdown_prefix="prefill_",
770
+ )
771
+
772
+ prefill_latency_breakdown.update(
773
+ {
774
+ "prefill_latency": prefill_latency,
775
+ }
776
+ )
777
+
778
+ # 2. decode stage
779
+ kv_cache_avg_latency, kv_cache_peak_latency = self.count_latency_kv_cache(
780
+ batch_size,
781
+ seq_len,
782
+ generate_len,
783
+ use_kv_cache,
784
+ kv_cache_dtype_bytes
785
+ )
786
+
787
+ decode_model_latency, decode_latency_breakdown = self.count_latency_fwd_model(
788
+ batch_size,
789
+ 1 if use_kv_cache else (seq_len + generate_len) * (2/3), # without kv cache the full context is recomputed each step, approximated here as 2/3 of (seq_len + generate_len)
790
+ is_inference=is_inference,
791
+ activation_recomputation=activation_recomputation,
792
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
793
+ breakdown_prefix="decode_",
794
+ )
795
+
796
+ decode_avg_latency = decode_model_latency + kv_cache_avg_latency
797
+ decode_peak_latency = decode_model_latency + kv_cache_peak_latency
798
+
799
+ decode_latency_breakdown.update(
800
+ {
801
+ "kv_cache_avg_latency": (kv_cache_avg_latency),
802
+ "kv_cache_peak_latency": (kv_cache_peak_latency),
803
+ "decode_avg_latency": (decode_avg_latency),
804
+ "decode_peak_latency": (decode_peak_latency)
805
+ }
806
+ )
807
+
808
+ return prefill_latency_breakdown, decode_latency_breakdown
809
+
810
+
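
Downstream, these two breakdowns are combined into an end-to-end estimate as one prefill pass plus `generate_len` decode steps (see `total_infer_latency` below). A tiny sketch of that composition with placeholder latencies:

# End-to-end latency = one prefill pass + generate_len decode steps (placeholder values).
prefill_latency = 0.35        # seconds for the first token, assumed
decode_avg_latency = 0.03     # seconds per generated token (includes the KV-cache read), assumed
generate_len = 40
total_infer_latency = prefill_latency + decode_avg_latency * generate_len
throughput = generate_len / total_infer_latency
print(f"total ~= {total_infer_latency:.2f} s, ~{throughput:.1f} tokens/s")
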
811
+ class LLMProfiler(object):
812
+ """Measures the latency, memory, number of estimated floating-point operations and parameters of each module in a PyTorch model."""
813
+ def __init__(self, llm_configs: LLMConfigs) -> None:
814
+ self.model_config = llm_configs.model_config
815
+ self.gpu_config = llm_configs.gpu_config
816
+ self.inference_config = llm_configs.inference_config
817
+ self.parallelism_config = llm_configs.parallelism_config
818
+ self.gpu_efficiency_config = llm_configs.gpu_efficiency_config
819
+
820
+ self.h = self.model_config.hidden_dim
821
+ self.l = self.model_config.num_layers
822
+ self.V = self.model_config.vocab_size
823
+
824
+ self.b = llm_configs.inference_config.batch_size_per_gpu
825
+ self.s = llm_configs.inference_config.seq_len
826
+ self.o = llm_configs.inference_config.generate_len
827
+ self.bytes_per_param = llm_configs.inference_config.bytes_per_param
828
+
829
+ self.tp_size = self.parallelism_config.tp_size
830
+ self.pp_size = self.parallelism_config.pp_size
831
+ self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)
832
+
833
+ self.gpu_hbm_bandwidth = get_gpu_hbm_bandwidth(self.gpu_config) * 10**9 # HBM bandwidth, converted from GB/s to bytes/s
+ self.gpu_intra_node_bandwidth = get_intra_node_bandwidth(self.gpu_config) * 10**9 # intra-node interconnect bandwidth, converted from GB/s to bytes/s
+ self.gpu_TFLOPS = get_TFLOPS_per_gpu(self.gpu_config) * 10**12 # converted from TFLOPS to FLOPS
+
+ self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9 # GPU memory, converted from GB to bytes
838
+
839
+ self.llm_params = CountCausalLMParams(self.model_config)
840
+ self.llm_flops = CountCausalLMFlops(self.model_config, self.b, self.s)
841
+ self.llm_memory = CountCausalLMMemory(llm_configs)
842
+ self.llm_latency = CountCausalLMLatency(llm_configs)
843
+ self.inference_results = []
844
+
845
+ def infer_profile(
846
+ self,
847
+ batch_size_per_gpu: int = 1,
848
+ seq_len: int = 522,
849
+ generate_len: int = 1526,
850
+ use_kv_cache: bool = True,
851
+ activation_recomputation: ActivationRecomputation = ActivationRecomputation.FULL,
852
+ layernorm_dtype_bytes: int = 2,
853
+ kv_cache_dtype_bytes: int = 2,
854
+ flops_efficiency: float = None,
855
+ hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
856
+ intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
857
+ inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
858
+ print_flag=True
859
+ ) -> float:
860
+ """LLM inference analysis given the llm configs and inputs.
861
+
862
+ Args:
863
+ generate_len (int, optional): number of tokens to generate for generative models. Defaults to 1526.
864
+ use_kv_cache (bool, optional): whether to use kv_cache. Defaults to True.
865
+ layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations. Defaults to 2 (fp16).
866
+ Often has to be at least FP16 in inference to maintain model accuracy.
867
+
868
+ Returns:
869
+ float: the estimated max_batch_total_tokens for this configuration
870
+ """
871
+ if self.model_config.max_seq_len is not None:
872
+ assert(
873
+ seq_len + generate_len <= self.model_config.max_seq_len
874
+ ), f"seq_len {seq_len} exceeds the max_seq_len {self.model_config.max_seq_len}"
875
+
876
+ if self.l % self.pp_size != 0:
877
+ logger.warning(
878
+ "Warning: the number of layers is not divisible by pp_size, please taking the floor!"
879
+ )
880
+
881
+ pp_instance_factor = self.pp_size
882
+
883
+ infer_config_dict = {
884
+ "inference_config":{
885
+ "model_name": self.model_config.model_name,
886
+ "batch_size_per_gpu": batch_size_per_gpu,
887
+ "seq_len": seq_len,
888
+ "tp_size": self.tp_size,
889
+ "pp_size": self.pp_size,
890
+ "generate_len": generate_len,
891
+ "use_kv_cache": use_kv_cache,
892
+ },
893
+ "gpu_config": {
894
+ "name": self.gpu_config.name,
895
+ "memory_GPU_in_GB": f"{self.gpu_config.memory_GPU_in_GB} GB",
896
+ "gpu_hbm_bandwidth": f"{get_gpu_hbm_bandwidth(self.gpu_config)} GB/s",
897
+ "gpu_intra_node_bandwidth": f"{get_intra_node_bandwidth(self.gpu_config)} GB/s",
898
+ "gpu_TFLOPS": f"{get_TFLOPS_per_gpu(self.gpu_config)} TFLOPS",
899
+ }
900
+ }
901
+
902
+ params_per_layer, dict_params_per_layer = self.llm_params.count_params_per_layer()
903
+ num_params_model = self.llm_params.count_params_model()
904
+
905
+ flops_fwd_per_layer, dict_flops_fwd_per_layer = self.llm_flops.count_flops_fwd_per_layer(self.b, self.s)
906
+ num_flops_fwd_model = self.llm_flops.count_flops_fwd_model(self.b, self.s)
907
+
908
+ memory_prefill_summary_dict, memory_decode_summary_dict = self.llm_memory.count_memory_per_gpu(
909
+ batch_size_per_gpu,
910
+ seq_len,
911
+ generate_len,
912
+ is_inference=True,
913
+ use_kv_cache=use_kv_cache,
914
+ activation_recomputation=activation_recomputation,
915
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
916
+ kv_cache_dtype_bytes=kv_cache_dtype_bytes
917
+ )
918
+
919
+ prefill_latency_breakdown, decode_latency_breakdown = self.llm_latency.count_latency_fwd(
920
+ batch_size_per_gpu,
921
+ seq_len,
922
+ generate_len,
923
+ use_kv_cache=use_kv_cache,
924
+ activation_recomputation=activation_recomputation,
925
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
926
+ kv_cache_dtype_bytes=kv_cache_dtype_bytes
927
+ )
928
+
929
+ infer_result_dict = {
930
+ "model_params": num_params_model,
931
+ "model_flops": num_flops_fwd_model,
932
+ "prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"],
933
+ "decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"],
934
+ "kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"],
935
+ "total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len,
936
+ }
937
+
938
+ gb_factor = 1024 ** 3
939
+
940
+ inference_result_dict = {
941
+ "model_params": num_params_model,
942
+ "prefill_first_token_latency": prefill_latency_breakdown["prefill_latency"],
943
+ "decode_per_token_latency": decode_latency_breakdown["decode_avg_latency"],
944
+ "kv_cache_latency": decode_latency_breakdown["kv_cache_avg_latency"],
945
+ "total_infer_latency": prefill_latency_breakdown["prefill_latency"] + decode_latency_breakdown["decode_avg_latency"] * generate_len,
946
+ "weight_memory_per_gpu": memory_decode_summary_dict["weight_memory_per_gpu"] / gb_factor,
947
+ "decode_activation_memory_per_gpu": memory_decode_summary_dict["decode_activation_memory_per_gpu"] / gb_factor,
948
+ "kv_cache_memory_per_gpu": memory_decode_summary_dict["kv_cache_memory_per_gpu"] / gb_factor,
949
+ "decode_max_batch_size_per_gpu": memory_decode_summary_dict["decode_max_batch_size_per_gpu"],
950
+ "max_batch_total_tokens": memory_decode_summary_dict["max_batch_total_tokens"],
951
+ }
952
+ pp_specific_dict = {
953
+ "pp_decode_latency": inference_result_dict["decode_per_token_latency"] / pp_instance_factor,
954
+ "pp_prefill_latency": inference_result_dict["prefill_first_token_latency"] / pp_instance_factor,
955
+ "pp_kv_cache_latency": inference_result_dict["kv_cache_latency"] / pp_instance_factor,
956
+ "pp_e2e_latency": inference_result_dict["total_infer_latency"] / pp_instance_factor,
957
+ "pp_max_batch_total_tokens": inference_result_dict["decode_per_token_latency"] / pp_instance_factor,
958
+ "pp_max_batch_size": inference_result_dict["decode_max_batch_size_per_gpu"] / pp_instance_factor,
959
+ "pp_kv_cache_memory_per_gpu": inference_result_dict["kv_cache_memory_per_gpu"] * pp_instance_factor,
960
+ }
961
+ inference_result_dict.update(pp_specific_dict)
962
+ inference_result_dict.update(infer_config_dict["inference_config"].copy())
963
+ inference_result_dict.update(infer_config_dict["gpu_config"].copy())
964
+
965
+ self.inference_results.append(inference_result_dict)
966
+
967
+ if print_flag:
968
+ print("\n-------------------------- LLM main infer config --------------------------")
969
+ pprint.pprint(infer_config_dict, indent=4, sort_dicts=False)
970
+
971
+ print("\n---------------------------- LLM Params analysis ----------------------------")
972
+ self.print_format_summary_dict(dict_params_per_layer, get_dict_depth(dict_params_per_layer))
973
+ pprint.pprint({"params_model": num_to_string(num_params_model)}, indent=4, sort_dicts=False)
974
+
975
+ print("\n---------------------------- LLM Flops analysis -----------------------------")
976
+ self.print_format_summary_dict(dict_flops_fwd_per_layer, get_dict_depth(dict_flops_fwd_per_layer))
977
+ pprint.pprint({"prefill flops_model": num_to_string(num_flops_fwd_model)}, indent=4, sort_dicts=False)
978
+
979
+ print("\n---------------------------- LLM Memory analysis -----------------------------")
980
+ self.print_format_summary_dict(memory_prefill_summary_dict, get_dict_depth(memory_prefill_summary_dict))
981
+ self.print_format_summary_dict(memory_decode_summary_dict, get_dict_depth(memory_decode_summary_dict))
982
+
983
+ print("\n-------------------------- LLM infer performance analysis --------------------------")
984
+ self.print_format_summary_dict(infer_result_dict, get_dict_depth(infer_result_dict))
985
+
986
+ print("\n-------------------------- LLM detailed's latency analysis --------------------------")
987
+ pprint.pprint([prefill_latency_breakdown, decode_latency_breakdown], indent=4, sort_dicts=False)
988
+
989
+ print("prefill_latency_breakdown depth is ", get_dict_depth(prefill_latency_breakdown), prefill_latency_breakdown)
990
+ self.print_format_summary_dict(prefill_latency_breakdown, get_dict_depth(prefill_latency_breakdown))
991
+ self.print_format_summary_dict(decode_latency_breakdown, get_dict_depth(decode_latency_breakdown))
992
+
993
+ # return memory_decode_summary_dict["max_batch_total_tokens"]
994
+ return memory_decode_summary_dict["max_batch_total_tokens"]
995
+
996
+ def get_inference_results(self):
997
+ return self.inference_results
998
+
999
+ def print_format_summary_dict(self, summary_dict: dict, depth: int) -> None:
1000
+ for key, value in summary_dict.items():
1001
+ if "params" in key or "flops" in key:
1002
+ if not isinstance(value, dict):
1003
+ summary_dict.update({key: num_to_string(value)})
1004
+ else:
1005
+ self.print_format_summary_dict(value, get_dict_depth(value)-1) # recurse into nested dicts
1006
+ if "latency" in key:
1007
+ if not isinstance(value, dict):
1008
+ summary_dict.update({key: latency_to_string(value)})
1009
+ else:
1010
+ self.print_format_summary_dict(value, get_dict_depth(value)-1)
1011
+ if "memory" in key:
1012
+ if not isinstance(value, dict):
1013
+ summary_dict.update({key: f"{num_to_string(value)}B"})
1014
+ else:
1015
+ self.print_format_summary_dict(value, get_dict_depth(value)-1)
1016
+ if depth >= 1:
1017
+ pprint.pprint(summary_dict, indent=4, sort_dicts=False)
1018
+
1019
+ def llm_profile(model_name="llama2-70b",
1020
+ gpu_name: str = "t4-pcie-15gb",
1021
+ bytes_per_param: int = BYTES_FP16,
1022
+ batch_size_per_gpu: int = 2,
1023
+ seq_len: int = 300,
1024
+ generate_len=40,
1025
+ ds_zero: int = 0,
1026
+ dp_size: int = 1,
1027
+ tp_size: int = 4,
1028
+ pp_size: int = 1,
1029
+ sp_size: int = 1,
1030
+ use_kv_cache: bool = True,
1031
+ layernorm_dtype_bytes: int = BYTES_FP16,
1032
+ kv_cache_dtype_bytes: int = BYTES_FP16,
1033
+ flops_efficiency: float = FLOPS_EFFICIENCY,
1034
+ hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
1035
+ intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
1036
+ inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
1037
+ mode: str = "inference",
1038
+ print_flag: bool = True,
1039
+ ) -> float:
1040
+ """Returns dict of the total floating-point operations, MACs, parameters and latency of a llm.
1041
+
1042
+ Args:
1043
+ model_name (str, optional): model name to query the pre-defined `model_configs.json`. Defaults to "llama-13b".
1044
+ gpu_name (str, optional): gpu name to query the pre-defined `model_configs.json`. Defaults to "v100-sxm2-32gb".
1045
+ batch_size_per_gpu (int, optional): _description_. Defaults to 1.
1046
+ seq_len (int, optional): batch size per GPU.. Defaults to 522.
1047
+ generate_len (int, optional): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. Defaults to 1526.
1048
+ ds_zero (int, optional): which DeepSpeed ZeRO stage to use.. Defaults to 0.
1049
+ dp_size (int, optional): data parallelism size. Defaults to 1.
1050
+ tp_size (int, optional): tensor parallelism size. Defaults to 1.
1051
+ pp_size (int, optional): pipeline parallelism size. Defaults to 1.
1052
+ sp_size (int, optional): sequence parallelism size. Defaults to 1.
1053
+ use_kv_cache (bool, optional): Whether or not the model should use the past last key/values attentions (if applicable to the model) to
1054
+ speed up decoding. Defaults to True.
1055
+ layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations.. Defaults to BYTES_FP16.
1056
+ kv_cache_dtype_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to None.
1057
+ flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to None.
1058
+ hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. Defaults to HBM_MEMORY_EFFICIENCY.
1059
+ intra_node_memory_efficiency (_type_, optional): intra-node memory efficiency, ranging from 0 to 1.. Defaults to INTRA_NODE_MEMORY_EFFICIENCY.
1060
+ inter_node_memory_efficiency (_type_, optional): inter-node memory efficiency, ranging from 0 to 1.. Defaults to INTER_NODE_MEMORY_EFFICIENCY.
1061
+ mode (str, optional): model training or inference. Defaults to "inference".
1062
+
1063
+ Returns:
1064
+ dict: a summary dictionary of the inference analysis
1065
+ """
1066
+ model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)
1067
+
1068
+ parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size,
1069
+ dp_size=dp_size, sp_size=sp_size
1070
+ )
1071
+
1072
+ inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
1073
+ generate_len=generate_len, use_kv_cache=use_kv_cache,
1074
+ bytes_per_param=bytes_per_param,
1075
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
1076
+ kv_cache_dtype_bytes=kv_cache_dtype_bytes
1077
+ )
1078
+
1079
+ gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency,
1080
+ hbm_memory_efficiency=hbm_memory_efficiency,
1081
+ intra_node_memory_efficiency=intra_node_memory_efficiency,
1082
+ inter_node_memory_efficiency=inter_node_memory_efficiency
1083
+ )
1084
+
1085
+ llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config,
1086
+ parallelism_config=parallelism_config, inference_config=inference_config,
1087
+ gpu_efficiency_config=gpu_efficiency_config
1088
+ )
1089
+
1090
+ profiler = LLMProfiler(llm_configs)
1091
+
1092
+ max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
1093
+ generate_len=generate_len, use_kv_cache=use_kv_cache,
1094
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
1095
+ flops_efficiency=flops_efficiency,
1096
+ hbm_memory_efficiency=hbm_memory_efficiency,
1097
+ print_flag=print_flag)
1098
+
1099
+ return max_batch_total_tokens
1100
+
1101
+
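
For reference, a hedged usage sketch of `llm_profile` as defined above; the argument names mirror its signature, and the specific model/GPU values chosen here are just examples from the lists this project ships:

# Example invocation of llm_profile (values are illustrative picks from the supported lists).
max_batch_total_tokens = llm_profile(
    model_name="llama2-70b",
    gpu_name="a100-sxm-80gb",
    batch_size_per_gpu=2,
    seq_len=300,
    generate_len=40,
    tp_size=8,
    pp_size=1,
    print_flag=True,
)
print("estimated max_batch_total_tokens:", max_batch_total_tokens)
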
1102
+ def llm_profile_infer(model_name="llama2-70b",
1103
+ gpu_name: str = "t4-pcie-15gb",
1104
+ bytes_per_param: int = BYTES_FP16,
1105
+ batch_size_per_gpu: int = 2,
1106
+ seq_len: int = 300,
1107
+ generate_len=40,
1108
+ ds_zero: int = 0,
1109
+ dp_size: int = 1,
1110
+ tp_size: int = 4,
1111
+ pp_size: int = 1,
1112
+ sp_size: int = 1,
1113
+ use_kv_cache: bool = True,
1114
+ layernorm_dtype_bytes: int = BYTES_FP16,
1115
+ kv_cache_dtype_bytes: int = BYTES_FP16,
1116
+ flops_efficiency: float = FLOPS_EFFICIENCY,
1117
+ hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
1118
+ intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
1119
+ inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
1120
+ mode: str = "inference",
1121
+ print_flag: bool = True,
1122
+ ) -> tuple:
1123
+ model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)
1124
+
1125
+ parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size,
1126
+ dp_size=dp_size, sp_size=sp_size
1127
+ )
1128
+
1129
+ inference_config = InferenceConfig(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
1130
+ generate_len=generate_len, use_kv_cache=use_kv_cache,
1131
+ bytes_per_param=bytes_per_param,
1132
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
1133
+ kv_cache_dtype_bytes=kv_cache_dtype_bytes
1134
+ )
1135
+
1136
+ gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency,
1137
+ hbm_memory_efficiency=hbm_memory_efficiency,
1138
+ intra_node_memory_efficiency=intra_node_memory_efficiency,
1139
+ inter_node_memory_efficiency=inter_node_memory_efficiency
1140
+ )
1141
+
1142
+ llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config,
1143
+ parallelism_config=parallelism_config, inference_config=inference_config,
1144
+ gpu_efficiency_config=gpu_efficiency_config
1145
+ )
1146
+
1147
+ profiler = LLMProfiler(llm_configs)
1148
+
1149
+ max_batch_total_tokens = profiler.infer_profile(batch_size_per_gpu=batch_size_per_gpu, seq_len=seq_len,
1150
+ generate_len=generate_len, use_kv_cache=use_kv_cache,
1151
+ layernorm_dtype_bytes=layernorm_dtype_bytes,
1152
+ flops_efficiency=flops_efficiency,
1153
+ hbm_memory_efficiency=hbm_memory_efficiency,
1154
+ print_flag=print_flag)
1155
+ return max_batch_total_tokens, profiler.get_inference_results()
1156
+
1157
+ def to_csv(inference_results: list, name: str = "infer_results"):
1158
+ df = pd.DataFrame(inference_results)
1159
+ csv_path = name + ".csv"
1160
+ pprint.pprint(f"Saving inference results to: {csv_path}")
1161
+ df.to_csv(csv_path, index=False)
1162
+
1163
+
1164
+ def profile_pp():
1165
+ # model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"]
1166
+ model_name_list = ["llama2-70b"]
1167
+ # gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"]
1168
+ gpu_name_list = ["a100-sxm-80gb"]
1169
+ batch_size_per_gpu = 32
1170
+ tp_pp_nums = [
1171
+ [1, 1], # tp
1172
+ [2, 1],
1173
+ [4, 1],
1174
+ [8, 1],
1175
+ # tp / pp
1176
+ [2, 4],
1177
+ [4, 2],
1178
+ # pp
1179
+ [1, 2],
1180
+ [1, 4],
1181
+ [1, 8],
1182
+ ]
1183
+ tgi_service_dict_list = []
1184
+ seq_len, generate_len = 1024, 1024
1185
+ inference_results = []
1186
+
1187
+ for model_name in model_name_list:
1188
+ if model_name in ["llama2-70b", "internlm-20b"]:
1189
+ seq_len, generate_len = 1024, 1024
1190
+
1191
+ for gpu_name in gpu_name_list:
1192
+ for tp_size, pp_size in tp_pp_nums:
1193
+ try:
1194
+ max_batch_total_tokens, infer_result = llm_profile_infer(
1195
+ model_name=model_name,
1196
+ gpu_name=gpu_name,
1197
+ batch_size_per_gpu=batch_size_per_gpu,
1198
+ tp_size=tp_size,
1199
+ pp_size=pp_size,
1200
+ seq_len=seq_len,
1201
+ generate_len=generate_len,
1202
+ print_flag=False,
1203
+ )
1204
+ inference_results += infer_result
1205
+ except Exception as e:
1206
+ print(
1207
+ f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}"
1208
+ )
1209
+ continue
1210
+
1211
+ tgi_service_dict = {
1212
+ "model_name": model_name,
1213
+ "gpu_name": gpu_name,
1214
+ "pp_size": pp_size,
1215
+ "tp_size": tp_size,
1216
+ "max_batch_total_tokens": max_batch_total_tokens,
1217
+ "max_batch_size": floor(
1218
+ max_batch_total_tokens / (seq_len + generate_len)
1219
+ ),
1220
+ }
1221
+ tgi_service_dict_list.append(tgi_service_dict)
1222
+
1223
+ print(
1224
+ "================================== TGI+LightLLM service max_batch_total_tokens params list ============================="
1225
+ )
1226
+ print_list(tgi_service_dict_list)
1227
+
1228
+ to_csv(inference_results, f"bs{batch_size_per_gpu}_in{seq_len}_out{generate_len}_centralize_allreduce")
1229
+
1230
+
1231
+ def demo():
1232
+ # llm_profile(print_flag=True)
1233
+
1234
+ # model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"]
1235
+ model_name_list = ["llama2-70b"]
1236
+ # gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb", "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"]
1237
+ gpu_name_list = ["a100-sxm-80gb", "910b-64gb"]
1238
+ batch_size_per_gpu = 32
1239
+ tp_nums_list = [8]
1240
+ pp_nums_list = [1]
1241
+ tp_pp_nums = [
1242
+ [8, 1],
1243
+ [1, 8],
1244
+ [4, 2]
1245
+ ]
1246
+ tgi_service_dict_list = []
1247
+ seq_len, generate_len = 1024, 1024
1248
+
1249
+ for model_name in model_name_list:
1250
+ if model_name in ["llama2-70b", "internlm-20b"]:
1251
+ seq_len, generate_len = 1024, 1024
1252
+
1253
+ # pp_size = 0
1254
+ # tp_size = 0
1255
+ for gpu_name in gpu_name_list:
1256
+ # for tp_size in tp_nums_list:
1257
+ # for pp_size in pp_nums_list:
1258
+ for (tp_size, pp_size) in tp_pp_nums:
1259
+ try:
1260
+ max_batch_total_tokens = int(llm_profile(model_name=model_name, gpu_name=gpu_name, batch_size_per_gpu=batch_size_per_gpu, tp_size=tp_size, pp_size=pp_size,
1261
+ seq_len=seq_len, generate_len=generate_len, print_flag=True))
1262
+ except Exception as e:
1263
+ print(f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}")
1264
+ continue
1265
+
1266
+ tgi_service_dict = {"model_name": model_name, "gpu_name": gpu_name, "pp_size": pp_size, "tp_size": tp_size, "max_batch_total_tokens": max_batch_total_tokens, "max_batch_size": floor(max_batch_total_tokens / (seq_len + generate_len))}
1267
+ tgi_service_dict_list.append(tgi_service_dict)
1268
+
1269
+ print("================================== TGI+LightLLM service max_batch_total_tokens params list =============================")
1270
+ print_list(tgi_service_dict_list)
1271
+
1272
+
1273
+ if __name__ == "__main__":
1274
+ profile_pp()
utils.py ADDED
@@ -0,0 +1,82 @@
1
+ from constants import *
2
+
3
+ def print_list(lst):
+ """Print a one-dimensional list, one item per line.
+
+ :param lst: list of items
+ :return: None
+ """
+ for x in lst:
+ print(x)
11
+
12
+ def get_dict_depth(d, depth=0):
13
+ if not isinstance(d, dict):
14
+ return depth
15
+ if not d:
16
+ return depth
17
+
18
+ return max(get_dict_depth(v, depth + 1) for v in d.values())
19
+
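
A quick illustration of what `get_dict_depth` returns for flat and nested dicts (example values chosen arbitrarily):

print(get_dict_depth({}))                               # 0
print(get_dict_depth({"a": 1}))                         # 1
print(get_dict_depth({"a": {"b": {"c": 3}}, "d": 4}))   # 3
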
20
+ def latency_to_string(latency_in_s, precision=2):
21
+ if latency_in_s is None:
22
+ return "None"
23
+ day = 24 * 60 * 60
24
+ hour = 60 * 60
25
+ minute = 60
26
+ ms = 1 / 1000
27
+ us = 1 / 1000000
28
+ if latency_in_s // day > 0:
29
+ return str(round(latency_in_s / day, precision)) + " days"
30
+ elif latency_in_s // hour > 0:
31
+ return str(round(latency_in_s / hour, precision)) + " hours"
32
+ elif latency_in_s // minute > 0:
33
+ return str(round(latency_in_s / minute, precision)) + " minutes"
34
+ elif latency_in_s > 1:
35
+ return str(round(latency_in_s, precision)) + " s"
36
+ elif latency_in_s > ms:
37
+ return str(round(latency_in_s / ms, precision)) + " ms"
38
+ else:
39
+ return str(round(latency_in_s / us, precision)) + " us"
40
+
41
+ def num_to_string(num, precision=2):
42
+ if num is None:
43
+ return "None"
44
+ if num // 10**12 > 0:
45
+ return str(round(num / 10.0**12, precision)) + " T"
46
+ elif num // 10**9 > 0:
47
+ return str(round(num / 10.0**9, precision)) + " G"
48
+ elif num // 10**6 > 0:
49
+ return str(round(num / 10.0**6, precision)) + " M"
50
+ elif num // 10**3 > 0:
51
+ return str(round(num / 10.0**3, precision)) + " K"
52
+ else:
53
+ return str(num)
54
+
55
+ def get_readable_summary_dict(summary_dict: dict, title="Summary") -> str:
56
+ log_str = f"\n{title.center(PRINT_LINE_WIDTH, '-')}\n"
57
+ for key, value in summary_dict.items():
58
+ if "num_tokens" in key or "num_params" in key or "flops" in key:
59
+ log_str += f"{key}: {num_to_string(value)}\n"
60
+ elif "gpu_hours" == key:
61
+ log_str += f"{key}: {int(value)}\n"
62
+ elif "memory" in key and "efficiency" not in key:
63
+ log_str += f"{key}: {num_to_string(value)}B\n"
64
+ elif "latency" in key:
65
+ log_str += f"{key}: {latency_to_string(value)}\n"
66
+ else:
67
+ log_str += f"{key}: {value}\n"
68
+ log_str += f"{'-' * PRINT_LINE_WIDTH}\n"
69
+ return log_str
70
+
71
+ def within_range(val, target, tolerance):
72
+ return abs(val - target) / target < tolerance
73
+
74
+ def average(lst):
75
+ if not lst:
76
+ return None
77
+ return sum(lst) / len(lst)
78
+
79
+ def max_value(lst):
80
+ if not lst:
81
+ return None
82
+ return max(lst)
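
A short, hedged usage sketch of the helpers above; the expected outputs in the comments follow directly from the definitions in this file:

if __name__ == "__main__":
    print(num_to_string(70_000_000_000))   # "70.0 G"
    print(num_to_string(1536))             # "1.54 K"
    print(latency_to_string(0.0042))       # "4.2 ms"
    print(latency_to_string(95))           # "1.58 minutes"
    print(average([1.0, 2.0, 3.0]))        # 2.0
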