Commit · 3d7033f
Parent(s): 16a8bbd
update

Files changed:
- app.py +38 -61
- script.py +14 -0
- src/utils.py +10 -5
app.py CHANGED
@@ -21,33 +21,27 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
 ALL_COLUMNS_MAPPING = {
-    # model
     "Model": "Model 🤗",
     "Arch": "Arch 🏛️",
-    "Size": "
+    "Size": "Params (B) 📏",
     # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
-    "
+    "optimization": "Optimization 🛠️",
     "quantization": "Quantization 🗜️",
-    #
-    "Score": "
-    # throughput measurements
+    # measurements
+    "Score": "Open LLM Score (%) ⬆️",
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
-    # latency measurements
     "forward.latency(s)": "Prefill Latency (s) ⬇️",
     "generate.latency(s)": "E2E Latency (s) ⬇️",
-    # memory measurements
     "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
     "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
     "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
-    # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
 }
 SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
 SORTING_ASCENDING = [False, True]
-
 ALL_COLUMNS_DATATYPES = [
     # open llm
     "markdown",
@@ -70,17 +64,18 @@ ALL_COLUMNS_DATATYPES = [
     "number",
     "number",
 ]
 
-
-
-    # download data
-    hf_hub_download(
-        repo_id="optimum/llm-perf-dataset",
-        filename="open-llm.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
+# download data
+hf_hub_download(
+    repo_id="optimum/llm-perf-dataset",
+    filename="open-llm.csv",
+    local_dir="dataset",
+    repo_type="dataset",
+    token=HF_TOKEN,
+)
+OPEN_LLM = pd.read_csv("dataset/open-llm.csv")
+
+MACHINE_TO_DATAFRAME = {}
+for machine in MACHINE_TO_HARDWARE:
     hf_hub_download(
         repo_id="optimum/llm-perf-dataset",
         filename=f"{machine}/full-report.csv",
@@ -88,11 +83,13 @@ def get_benchmark_df(machine="hf-dgx-01"):
         repo_type="dataset",
         token=HF_TOKEN,
     )
-
-
+    MACHINE_TO_DATAFRAME[machine] = pd.read_csv(f"dataset/{machine}/full-report.csv")
+
 
+def get_benchmark_df(machine="hf-dgx-01"):
     # merge on model
-
+    llm_perf = MACHINE_TO_DATAFRAME[machine].copy()
+    merged_df = OPEN_LLM.merge(llm_perf, left_on="Model", right_on="model")
     # transpose energy consumption
     merged_df["generate.energy_consumption(tokens/kWh)"] = (
         1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
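Note: the two hunks above move the Hub downloads from request time to import time. Every per-machine CSV is fetched and parsed once into MACHINE_TO_DATAFRAME, and get_benchmark_df then works on an in-memory copy. A minimal self-contained sketch of the pattern, assuming the same dataset layout (the machines list and the get_report name are illustrative, not from the commit):

import pandas as pd
from huggingface_hub import hf_hub_download

machines = ["hf-dgx-01"]  # illustrative; the app derives this from MACHINE_TO_HARDWARE

# fetch and parse once, at import time
frames = {}
for machine in machines:
    hf_hub_download(
        repo_id="optimum/llm-perf-dataset",
        filename=f"{machine}/full-report.csv",
        local_dir="dataset",
        repo_type="dataset",
    )
    frames[machine] = pd.read_csv(f"dataset/{machine}/full-report.csv")

def get_report(machine="hf-dgx-01"):
    # return a copy so callers can mutate freely without corrupting the cache
    return frames[machine].copy()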
@@ -102,8 +99,8 @@ def get_benchmark_df(machine="hf-dgx-01"):
         merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
         "generate.energy_consumption(tokens/kWh)",
     ] = pd.NA
-    # add
-    merged_df["
+    # add optimization column
+    merged_df["optimization"] = merged_df[
         ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
     ].apply(
         lambda x: "BetterTransformer"
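Note: the energy transposition inverts kWh/token into tokens/kWh, using fillna(1) as a sentinel and then nulling the rows that compare equal to 1. A minimal sketch on made-up numbers; note the sentinel could collide with a genuine reading of exactly 1, whereas plain division propagates NaN by itself:

import pandas as pd

df = pd.DataFrame({"generate.energy_consumption(kWh/token)": [2e-6, None, 5e-7]})

# 1 / NaN is NaN, so missing measurements stay missing after the inversion
df["generate.energy_consumption(tokens/kWh)"] = (
    1 / df["generate.energy_consumption(kWh/token)"]
)
print(df["generate.energy_consumption(tokens/kWh)"].tolist())
# [500000.0, nan, 2000000.0]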
@@ -135,10 +132,10 @@ def get_benchmark_table(bench_df):
     copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
     copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
     # process quantization
-    copy_df["
-        lambda x: f"{x['
+    copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
+        lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
         if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
-        else x["
+        else x["Open LLM Score (%) ⬆️"],
         axis=1,
     )
     return copy_df
@@ -151,7 +148,7 @@ def get_benchmark_chart(bench_df):
     # plot
     fig = px.scatter(
         copy_df,
-        y="
+        y="Open LLM Score (%) ⬆️",
         x="E2E Latency (s) ⬇️",
         size="Allocated Memory (MB) ⬇️",
         color="Arch 🏛️",
@@ -167,7 +164,7 @@ def get_benchmark_chart(bench_df):
             "yanchor": "top",
         },
         xaxis_title="Per 1000 Tokens Latency (s)",
-        yaxis_title="
+        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
@@ -188,7 +185,7 @@ def filter_query(
     backends,
     datatypes,
     optimizations,
-    quantization_scheme,
+    quantizations,
     score,
     memory,
     machine,
@@ -198,29 +195,9 @@ def filter_query(
         raw_df["Model 🤗"].str.contains(text, case=False)
         & raw_df["Backend 🏭"].isin(backends)
         & raw_df["Dtype 📥"].isin(datatypes)
-        & (
-            pd.concat(
-                [
-                    raw_df["Optimizations 🛠️"].str.contains(optimization, case=False)
-                    for optimization in optimizations
-                ],
-                axis=1,
-            ).any(axis="columns")
-            if len(optimizations) > 0
-            else True
-        )
-        & (
-            pd.concat(
-                [
-                    raw_df["Quantization 🗜️"].str.contains(quantization, case=False)
-                    for quantization in quantization_scheme
-                ],
-                axis=1,
-            ).any(axis="columns")
-            if len(quantization_scheme) > 0
-            else True
-        )
-        & (raw_df["Avg Score (%) ⬆️"] >= score)
+        & raw_df["Optimization 🛠️"].isin(optimizations)
+        & raw_df["Quantization 🗜️"].isin(quantizations)
+        & (raw_df["Open LLM Score (%) ⬆️"] >= score)
         & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
     ]
     filtered_table = get_benchmark_table(filtered_df)
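Note: the hunk above replaces per-option case-insensitive substring matching (pd.concat of str.contains masks, then any) with exact membership via .isin(), which is both simpler and stricter. A minimal sketch of the behavioral difference, on made-up rows:

import pandas as pd

raw_df = pd.DataFrame({"Optimization 🛠️": ["None", "BetterTransformer", "FlashAttentionV2"]})
selected = ["BetterTransformer", "FlashAttentionV2"]

new_mask = raw_df["Optimization 🛠️"].isin(selected)  # exact membership
old_mask = pd.concat(
    [raw_df["Optimization 🛠️"].str.contains(s, case=False) for s in selected],
    axis=1,
).any(axis="columns")  # substring match against any selected option

print(new_mask.tolist())  # [False, True, True]
print(old_mask.tolist())  # [False, True, True] here, but substrings can over-match in general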
@@ -289,7 +266,7 @@ with demo:
         with gr.Row():
             with gr.Column(scale=1):
                 score_slider = gr.Slider(
-                    label="Open LLM Score 📈",
+                    label="Open LLM Score (%) 📈",
                     info="🎚️ Slide to minimum Open LLM score",
                     value=0,
                     elem_id="threshold-slider",
@@ -321,12 +298,12 @@ with demo:
                     elem_id="dtype-checkboxes",
                 )
             with gr.Column(scale=1):
-
+                optimization_checkboxes = gr.CheckboxGroup(
                     label="Optimizations 🛠️",
-                    choices=["None", "BetterTransformer"],
-                    value=["None", "BetterTransformer"],
-                    info="☑️ Select the
-                    elem_id="
+                    choices=["None", "BetterTransformer", "FlashAttentionV2"],
+                    value=["None", "BetterTransformer", "FlashAttentionV2"],
+                    info="☑️ Select the optimization",
+                    elem_id="optimization-checkboxes",
                 )
             with gr.Column(scale=1):
                 quantization_checkboxes = gr.CheckboxGroup(
@@ -348,7 +325,7 @@ with demo:
                 search_bar,
                 backend_checkboxes,
                 datatype_checkboxes,
-
+                optimization_checkboxes,
                 quantization_checkboxes,
                 score_slider,
                 memory_slider,
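Note: a new gr.CheckboxGroup only affects filtering if it is also listed among the event's inputs, which is what the last hunk does for optimization_checkboxes. A minimal sketch of that wiring pattern; the callback and names here are illustrative, not from the commit:

import gradio as gr

def filter_rows(optimizations):
    return f"selected: {', '.join(optimizations) or 'nothing'}"

with gr.Blocks() as demo:
    optimization_checkboxes = gr.CheckboxGroup(
        label="Optimizations 🛠️",
        choices=["None", "BetterTransformer", "FlashAttentionV2"],
        value=["None", "BetterTransformer", "FlashAttentionV2"],
    )
    out = gr.Textbox()
    # forgetting to list the component here would silently leave it inert
    optimization_checkboxes.change(filter_rows, inputs=[optimization_checkboxes], outputs=[out])

demo.launch()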
script.py ADDED

@@ -0,0 +1,14 @@
+from huggingface_hub import hf_hub_download
+import pandas as pd
+
+
+hf_hub_download(
+    repo_id="optimum/llm-perf-dataset",
+    filename="open-llm.csv",
+    local_dir="dataset",
+    repo_type="dataset",
+)
+
+open_llm = pd.read_csv("dataset/open-llm.csv")
+print(open_llm["Arch"].unique())
+print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())
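Note: script.py is a one-off exploration script for the downloaded CSV. The same kind of ad-hoc inspection generalizes to any column; a small hypothetical helper, not part of the commit:

import pandas as pd

def unique_values(csv_path, column, where=None):
    """List the unique values of `column`, optionally filtering rows by equality first."""
    df = pd.read_csv(csv_path)
    for col, val in (where or {}).items():
        df = df[df[col] == val]
    return df[column].unique().tolist()

# the two prints in script.py then become:
# unique_values("dataset/open-llm.csv", "Arch")
# unique_values("dataset/open-llm.csv", "Model", where={"Arch": "rwkv"})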
src/utils.py CHANGED

@@ -18,26 +18,31 @@ def change_tab(query_param):
 
 
 LLM_MODEL_ARCHS = {
-
-    "gpt_bigcode": "GPT-BigCode 🌸",
+    "mixformer-sequential": "Phi φ",
     "RefinedWebModel": "Falcon 🦅",
+    "gpt_bigcode": "StarCoder ⭐",
     "RefinedWeb": "Falcon 🦅",
     "baichuan": "Baichuan 🌊",
+    "mistral": "Mistral Ⓜ️",
+    "codegen": "CodeGen ♾️",
+    "falcon": "Falcon 🦅",
     "bloom": "Bloom 🌸",
     "llama": "LLaMA 🦙",
-
+    "mpt": "MPT 🧱",
+    "Yi": "Yi 人",
+    # suggest something
+    "stablelm_epoch": "StableLM-Epoch",
     "stablelm_alpha": "StableLM-Alpha",
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
-    "codegen": "CodeGen",
     "chatglm": "ChatGLM",
+    "internlm": "InternLM",
     "gpt2": "GPT-2",
     "gptj": "GPT-J",
     "xglm": "XGLM",
     "rwkv": "RWKV",
     "bart": "BART",
     "opt": "OPT",
-    "mpt": "MPT",
 }
 
 
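Note: app.py applies process_model_arch to the Arch column, which is where this mapping is consumed. The body of process_model_arch isn't shown in this commit, so the following is an assumed sketch of the typical lookup-with-fallback pattern:

# assumed sketch; the real process_model_arch body is not part of this commit
LLM_MODEL_ARCHS = {"llama": "LLaMA 🦙", "rwkv": "RWKV"}

def process_model_arch(arch):
    # fall back to the raw arch string so unmapped architectures still render
    return LLM_MODEL_ARCHS.get(arch, arch)

assert process_model_arch("llama") == "LLaMA 🦙"
assert process_model_arch("new_arch") == "new_arch"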