import io
import logging
from contextlib import redirect_stdout

import gradio as gr

from llm_profiler import *

# Supported model names
model_names = [
    "opt-1.3b", "opt-6.7b", "opt-13b", "opt-66b", "opt-175b",
    "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl",
    "bloom-560m", "bloom-7b", "bloom-175b",
    "llama-7b", "llama-13b", "llama-30b", "llama-65b",
    "llama2-13b", "llama2-70b",
    "internlm-20b",
    "baichuan2-13b",
]

# Supported GPU names
gpu_names = [
    "t4-pcie-15gb",
    "v100-pcie-32gb", "v100-sxm-32gb",
    "br104p",
    "a100-pcie-40gb", "a100-sxm-40gb", "a100-pcie-80gb", "a100-sxm-80gb",
    "910b-64gb",
    "h100-sxm-80gb", "h100-pcie-80gb",
    "a30-pcie-24gb", "a30-sxm-24gb",
    "a40-pcie-48gb",
]


# Logging handler that writes formatted log records into an in-memory
# StringIO buffer, so they can be shown in the Gradio output textbox.
class StringHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.stream = io.StringIO()
        self.setFormatter(logging.Formatter("%(message)s"))

    def emit(self, record):
        self.stream.write(self.format(record) + "\n")

    def get_value(self):
        return self.stream.getvalue()


# Create a logger and attach the StringHandler to capture log messages.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
string_handler = StringHandler()
logger.addHandler(string_handler)


def gradio_interface(
    model_name: str = "llama2-70b",
    gpu_name: str = "t4-pcie-15gb",
    bytes_per_param: int = BYTES_FP16,
    batch_size_per_gpu: int = 2,
    seq_len: int = 300,
    generate_len: int = 40,
    ds_zero: int = 0,
    dp_size: int = 1,
    tp_size: int = 4,
    pp_size: int = 1,
    sp_size: int = 1,
    use_kv_cache: bool = True,
    layernorm_dtype_bytes: int = BYTES_FP16,
    kv_cache_dtype_bytes: int = BYTES_FP16,
    flops_efficiency: float = FLOPS_EFFICIENCY,
    hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
    intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
    inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
    mode: str = "inference",
    print_flag: bool = True,
) -> tuple:
    # Reset the StringIO buffer so each call starts from empty output.
    string_handler.stream.seek(0)
    string_handler.stream.truncate()

    # Redirect stdout into the handler's buffer while profiling, so printed
    # output is captured alongside log messages. Using redirect_stdout as a
    # context manager guarantees sys.stdout is restored even if
    # llm_profile_infer raises.
    with redirect_stdout(string_handler.stream):
        results = llm_profile_infer(
            model_name,
            gpu_name,
            bytes_per_param,
            batch_size_per_gpu,
            seq_len,
            generate_len,
            ds_zero,
            dp_size,
            tp_size,
            pp_size,
            sp_size,
            use_kv_cache,
            layernorm_dtype_bytes,
            kv_cache_dtype_bytes,
            flops_efficiency,
            hbm_memory_efficiency,
            intra_node_memory_efficiency,
            inter_node_memory_efficiency,
            mode,
            print_flag,
        )

    # Fetch the captured log/stdout text and return it with the results.
    log_output = string_handler.get_value()
    return results, log_output
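# A minimal, hedged smoke-test sketch: call gradio_interface() directly,
# bypassing the UI. The parameter values below are illustrative assumptions,
# not validated defaults; any model/GPU pair from the lists above should
# work the same way.
#
# results, log_output = gradio_interface(
#     model_name="llama2-70b",
#     gpu_name="a100-sxm-80gb",
#     tp_size=8,
# )
# print(results)
# print(log_output)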
# Build the Gradio interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Dropdown(choices=model_names, label="Model Name", value="llama2-70b"),
        gr.Dropdown(choices=gpu_names, label="GPU Name", value="a100-sxm-80gb"),
        gr.Number(label="Bytes per Param", value=BYTES_FP16),
        gr.Number(label="Batch Size per GPU", value=2),
        gr.Number(label="Sequence Length", value=300),
        gr.Number(label="Generate Length", value=40),
        gr.Number(label="DS ZeRO Stage", value=0),
        gr.Number(label="DP Size", value=1),
        gr.Number(label="TP Size", value=4),
        gr.Number(label="PP Size", value=1),
        gr.Number(label="SP Size", value=1),
        gr.Checkbox(label="Use KV Cache", value=True),
        gr.Number(label="Layernorm dtype Bytes", value=BYTES_FP16),
        gr.Number(label="KV Cache dtype Bytes", value=BYTES_FP16),
        gr.Number(label="FLOPS Efficiency", value=FLOPS_EFFICIENCY),
        gr.Number(label="HBM Memory Efficiency", value=HBM_MEMORY_EFFICIENCY),
        gr.Number(
            label="Intra Node Memory Efficiency", value=INTRA_NODE_MEMORY_EFFICIENCY
        ),
        gr.Number(
            label="Inter Node Memory Efficiency", value=INTER_NODE_MEMORY_EFFICIENCY
        ),
        gr.Radio(choices=["inference", "other_mode"], label="Mode", value="inference"),
        gr.Checkbox(label="Print Flag", value=True),
    ],
    outputs=[
        gr.Textbox(label="Inference Results"),  # labeled profiling results
        gr.Textbox(label="Detailed Analysis"),  # labeled captured log/stdout output
    ],
    title="LLM Profiler",
    description="Input parameters to profile your LLM.",
)

# Launch the Gradio app behind basic auth; no public share link is created.
iface.launch(auth=("xtrt-llm", "xtrt-llm"), share=False)
# iface.launch()
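# A hedged deployment sketch: server_name and server_port are standard
# launch() parameters in Gradio; the host/port values here are assumptions,
# adjust them for your environment. Binding to 0.0.0.0 exposes the app on
# the local network without the hardcoded credentials above.
#
# iface.launch(server_name="0.0.0.0", server_port=7860)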