File size: 4,894 Bytes
0c4803b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import gradio as gr
import io
import logging

from llm_profiler import *
import sys
from contextlib import redirect_stdout

# Model names passed as the choices of the "Model Name" dropdown,
# laid out one name-prefix group per row.
model_names = [
    # opt-*
    "opt-1.3b", "opt-6.7b", "opt-13b", "opt-66b", "opt-175b",
    # gpt2*
    "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl",
    # bloom-*
    "bloom-560m", "bloom-7b", "bloom-175b",
    # llama-*
    "llama-7b", "llama-13b", "llama-30b", "llama-65b",
    # llama2-*
    "llama2-13b", "llama2-70b",
    # others
    "internlm-20b", "baichuan2-13b",
]
# GPU names passed as the choices of the "GPU Name" dropdown.
gpu_names = [
    "t4-pcie-15gb",
    "v100-pcie-32gb", "v100-sxm-32gb",
    "br104p",
    "a100-pcie-40gb", "a100-sxm-40gb", "a100-pcie-80gb", "a100-sxm-80gb",
    "910b-64gb",
    "h100-sxm-80gb", "h100-pcie-80gb",
    "a30-pcie-24gb", "a30-sxm-24gb",
    "a40-pcie-48gb",
]


# Logging handler that writes log messages into a StringIO object.
class StringHandler(logging.Handler):
    """Collect formatted log records in an in-memory text buffer."""

    def __init__(self):
        super().__init__()
        # Keep only the message text: no timestamp, level, or logger name.
        self.setFormatter(logging.Formatter("%(message)s"))
        self.stream = io.StringIO()

    def emit(self, record):
        """Append the formatted record, followed by a newline, to the buffer."""
        text = self.format(record)
        self.stream.write(f"{text}\n")

    def get_value(self):
        """Return the full contents of the buffer as a string."""
        captured = self.stream.getvalue()
        return captured


# Create the in-memory handler and attach it to this module's logger.
# gradio_interface also uses string_handler.stream as its stdout capture buffer.
string_handler = StringHandler()
logger = logging.getLogger(__name__)
logger.addHandler(string_handler)
logger.setLevel(logging.INFO)


def gradio_interface(
    model_name="llama2-70b",
    gpu_name: str = "t4-pcie-15gb",
    bytes_per_param: int = BYTES_FP16,
    batch_size_per_gpu: int = 2,
    seq_len: int = 300,
    generate_len: int = 40,
    ds_zero: int = 0,
    dp_size: int = 1,
    tp_size: int = 4,
    pp_size: int = 1,
    sp_size: int = 1,
    use_kv_cache: bool = True,
    layernorm_dtype_bytes: int = BYTES_FP16,
    kv_cache_dtype_bytes: int = BYTES_FP16,
    flops_efficiency: float = FLOPS_EFFICIENCY,
    hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
    intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
    inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
    mode: str = "inference",
    print_flag: bool = True,
) -> tuple:
    """Run ``llm_profile_infer`` and capture everything it prints to stdout.

    Every parameter is forwarded unchanged, positionally and in declaration
    order, to ``llm_profile_infer``; their meaning is defined by
    ``llm_profiler``.

    Returns:
        A ``(results, log_output)`` pair: ``results`` is whatever
        ``llm_profile_infer`` returned, and ``log_output`` is the text that
        accumulated in ``string_handler.stream`` during the call.

    Raises:
        Whatever ``llm_profile_infer`` raises. ``sys.stdout`` is restored
        before the exception propagates.
    """
    # Empty the shared StringIO buffer so output from a previous call is dropped.
    capture = string_handler.stream
    capture.seek(0)
    capture.truncate()

    # redirect_stdout restores sys.stdout on exit even if the profiler raises;
    # a manual swap without try/finally would leave the process's stdout
    # pointing at the buffer after the first failure.
    with redirect_stdout(capture):
        # Call the profiling function.
        results = llm_profile_infer(
            model_name,
            gpu_name,
            bytes_per_param,
            batch_size_per_gpu,
            seq_len,
            generate_len,
            ds_zero,
            dp_size,
            tp_size,
            pp_size,
            sp_size,
            use_kv_cache,
            layernorm_dtype_bytes,
            kv_cache_dtype_bytes,
            flops_efficiency,
            hbm_memory_efficiency,
            intra_node_memory_efficiency,
            inter_node_memory_efficiency,
            mode,
            print_flag,
        )

    # Collect the captured text.
    log_output = string_handler.get_value()

    # Return the profiling results together with the captured output.
    return results, log_output


# Build the Gradio interface.

# Input widgets: one per gradio_interface parameter, in the same order.
_input_components = [
    gr.Dropdown(label="Model Name", choices=model_names, value="llama2-70b"),
    gr.Dropdown(label="GPU Name", choices=gpu_names, value="a100-sxm-80gb"),
    gr.Number(label="Bytes per Param", value=BYTES_FP16),
    gr.Number(label="Batch Size per GPU", value=2),
    gr.Number(label="Sequence Length", value=300),
    gr.Number(label="Generate Length", value=40),
    gr.Number(label="DS Zero", value=0),
    gr.Number(label="DP Size", value=1),
    gr.Number(label="TP Size", value=4),
    gr.Number(label="PP Size", value=1),
    gr.Number(label="SP Size", value=1),
    gr.Checkbox(label="Use KV Cache", value=True),
    gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
    gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
    gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
    gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
    gr.Number(
        label="Intra Node Memory Efficiency",
        value=INTRA_NODE_MEMORY_EFFICIENCY,
    ),
    gr.Number(
        label="Inter Node Memory Efficiency",
        value=INTER_NODE_MEMORY_EFFICIENCY,
    ),
    gr.Radio(label="Mode", choices=["inference", "other_mode"], value="inference"),
    gr.Checkbox(label="Print Flag", value=True),
]

# Output widgets, matching the (results, log_output) pair returned by
# gradio_interface.
_output_components = [
    gr.Textbox(label="Inference Results"),  # profiling results
    gr.Textbox(label="Detailed Analysis"),  # captured stdout text
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_components,
    outputs=_output_components,
    title="LLM Profiler",
    description="Input parameters to profile your LLM.",
)

# Launch the Gradio interface.
# NOTE(review): the login credentials are hard-coded in source; consider
# loading them from the environment or a secret store.
# For an unauthenticated run, call iface.launch() without `auth`.
_credentials = ("xtrt-llm", "xtrt-llm")
iface.launch(auth=_credentials, share=False)