Spaces:
Running
Running
Commit
·
0232cf1
1
Parent(s):
a8a6326
update
Browse files
- app.py +5 -3
- src/bettertransformer.py +21 -21
- src/control_panel.py +31 -23
- src/flashattentionv2.py +22 -22
- src/latency_score_memory.py +12 -12
- src/leaderboard.py +34 -15
- src/llm_perf.py +15 -8
- src/quantization_kernels.py +24 -24
app.py
CHANGED
|
@@ -35,7 +35,6 @@ with demo:
|
|
| 35 |
(
|
| 36 |
filter_button,
|
| 37 |
machine_textbox,
|
| 38 |
-
search_bar,
|
| 39 |
score_slider,
|
| 40 |
memory_slider,
|
| 41 |
backend_checkboxes,
|
|
@@ -48,7 +47,7 @@ with demo:
|
|
| 48 |
llm_perf_df = get_llm_perf_df(machine=machine)
|
| 49 |
####################### LEADERBOARD TAB #######################
|
| 50 |
with gr.TabItem("Leaderboard 🏅", id=0):
|
| 51 |
-
|
| 52 |
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
|
| 53 |
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
|
| 54 |
with gr.TabItem("BetterTransformer π", id=2):
|
|
@@ -63,14 +62,15 @@ with demo:
|
|
| 63 |
filter_button,
|
| 64 |
# inputs
|
| 65 |
machine_textbox,
|
| 66 |
-
search_bar,
|
| 67 |
score_slider,
|
| 68 |
memory_slider,
|
| 69 |
backend_checkboxes,
|
| 70 |
datatype_checkboxes,
|
| 71 |
optimization_checkboxes,
|
| 72 |
quantization_checkboxes,
|
|
|
|
| 73 |
columns_checkboxes,
|
|
|
|
| 74 |
# outputs
|
| 75 |
leaderboard_table,
|
| 76 |
lat_score_mem_plot,
|
|
@@ -85,7 +85,9 @@ with demo:
|
|
| 85 |
create_select_callback(
|
| 86 |
# inputs
|
| 87 |
machine_textbox,
|
|
|
|
| 88 |
columns_checkboxes,
|
|
|
|
| 89 |
# outputs
|
| 90 |
leaderboard_table,
|
| 91 |
)
|
|
|
|
| 35 |
(
|
| 36 |
filter_button,
|
| 37 |
machine_textbox,
|
|
|
|
| 38 |
score_slider,
|
| 39 |
memory_slider,
|
| 40 |
backend_checkboxes,
|
|
|
|
| 47 |
llm_perf_df = get_llm_perf_df(machine=machine)
|
| 48 |
####################### LEADERBOARD TAB #######################
|
| 49 |
with gr.TabItem("Leaderboard 🏅", id=0):
|
| 50 |
+
search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
|
| 51 |
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
|
| 52 |
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
|
| 53 |
with gr.TabItem("BetterTransformer π", id=2):
|
|
|
|
| 62 |
filter_button,
|
| 63 |
# inputs
|
| 64 |
machine_textbox,
|
|
|
|
| 65 |
score_slider,
|
| 66 |
memory_slider,
|
| 67 |
backend_checkboxes,
|
| 68 |
datatype_checkboxes,
|
| 69 |
optimization_checkboxes,
|
| 70 |
quantization_checkboxes,
|
| 71 |
+
# interactive
|
| 72 |
columns_checkboxes,
|
| 73 |
+
search_bar,
|
| 74 |
# outputs
|
| 75 |
leaderboard_table,
|
| 76 |
lat_score_mem_plot,
|
|
|
|
| 85 |
create_select_callback(
|
| 86 |
# inputs
|
| 87 |
machine_textbox,
|
| 88 |
+
# interactive
|
| 89 |
columns_checkboxes,
|
| 90 |
+
search_bar,
|
| 91 |
# outputs
|
| 92 |
leaderboard_table,
|
| 93 |
)
|
src/bettertransformer.py
CHANGED
|
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
| 6 |
BETTERTRANSFORMER_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
| 9 |
-
"Arch ποΈ",
|
| 10 |
"DType π₯",
|
| 11 |
"Backend π",
|
| 12 |
"Params (B)",
|
|
|
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
@@ -18,15 +18,15 @@ BETTERTRANSFORMER_DATA = [
|
|
| 18 |
"Quantization ποΈ",
|
| 19 |
"Optimization π οΈ BetterTransformer",
|
| 20 |
# primary measurements
|
| 21 |
-
"Prefill
|
| 22 |
-
"Prefill
|
| 23 |
-
"Decode
|
| 24 |
-
"Decode
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
# speedups
|
| 28 |
-
"Prefill
|
| 29 |
-
"Decode
|
| 30 |
]
|
| 31 |
|
| 32 |
|
|
@@ -43,15 +43,15 @@ def get_bt_df(llm_perf_df):
|
|
| 43 |
suffixes=["", " BetterTransformer"],
|
| 44 |
)
|
| 45 |
# compute speedups
|
| 46 |
-
bt_df["Prefill
|
| 47 |
-
(bt_df["Prefill
|
| 48 |
).round(2) - 100
|
| 49 |
-
bt_df["Decode
|
| 50 |
-
(bt_df["Decode
|
| 51 |
).round(2) - 100
|
| 52 |
# filter speedups > 1000%
|
| 53 |
-
bt_df = bt_df[bt_df["Prefill
|
| 54 |
-
bt_df = bt_df[bt_df["Decode
|
| 55 |
|
| 56 |
return bt_df
|
| 57 |
|
|
@@ -61,8 +61,8 @@ def get_bt_prefill_fig(llm_perf_df):
|
|
| 61 |
# plot
|
| 62 |
prefill_fig = px.box(
|
| 63 |
bt_df,
|
| 64 |
-
x="
|
| 65 |
-
y="Prefill
|
| 66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 67 |
custom_data=BETTERTRANSFORMER_DATA,
|
| 68 |
color="Quantization ποΈ",
|
|
@@ -77,7 +77,7 @@ def get_bt_prefill_fig(llm_perf_df):
|
|
| 77 |
# add layout
|
| 78 |
prefill_fig.update_layout(
|
| 79 |
title={
|
| 80 |
-
"text": "Prefill
|
| 81 |
"y": 0.95,
|
| 82 |
"x": 0.5,
|
| 83 |
"xanchor": "center",
|
|
@@ -98,8 +98,8 @@ def get_bt_decode_fig(llm_perf_df):
|
|
| 98 |
# plot
|
| 99 |
decode_fig = px.box(
|
| 100 |
bt_df,
|
| 101 |
-
x="
|
| 102 |
-
y="Decode
|
| 103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 104 |
custom_data=BETTERTRANSFORMER_DATA,
|
| 105 |
color="Quantization ποΈ",
|
|
@@ -114,7 +114,7 @@ def get_bt_decode_fig(llm_perf_df):
|
|
| 114 |
# add layout
|
| 115 |
decode_fig.update_layout(
|
| 116 |
title={
|
| 117 |
-
"text": "Decode
|
| 118 |
"y": 0.95,
|
| 119 |
"x": 0.5,
|
| 120 |
"xanchor": "center",
|
|
|
|
| 6 |
BETTERTRANSFORMER_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
|
|
|
| 9 |
"DType π₯",
|
| 10 |
"Backend π",
|
| 11 |
"Params (B)",
|
| 12 |
+
"Architecture ποΈ",
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
|
|
| 18 |
"Quantization ποΈ",
|
| 19 |
"Optimization π οΈ BetterTransformer",
|
| 20 |
# primary measurements
|
| 21 |
+
"Prefill (s)",
|
| 22 |
+
"Prefill (s) BetterTransformer",
|
| 23 |
+
"Decode (tokens/s)",
|
| 24 |
+
"Decode (tokens/s) BetterTransformer",
|
| 25 |
+
"End-to-End (tokens/s)",
|
| 26 |
+
"End-to-End (tokens/s) BetterTransformer",
|
| 27 |
# speedups
|
| 28 |
+
"Prefill Speedup (%)",
|
| 29 |
+
"Decode Speedup (%)",
|
| 30 |
]
|
| 31 |
|
| 32 |
|
|
|
|
| 43 |
suffixes=["", " BetterTransformer"],
|
| 44 |
)
|
| 45 |
# compute speedups
|
| 46 |
+
bt_df["Prefill Speedup (%)"] = (
|
| 47 |
+
(bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
|
| 48 |
).round(2) - 100
|
| 49 |
+
bt_df["Decode Speedup (%)"] = (
|
| 50 |
+
(bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
|
| 51 |
).round(2) - 100
|
| 52 |
# filter speedups > 1000%
|
| 53 |
+
bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
|
| 54 |
+
bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
|
| 55 |
|
| 56 |
return bt_df
|
| 57 |
|
|
|
|
| 61 |
# plot
|
| 62 |
prefill_fig = px.box(
|
| 63 |
bt_df,
|
| 64 |
+
x="Architecture ποΈ",
|
| 65 |
+
y="Prefill Speedup (%)",
|
| 66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 67 |
custom_data=BETTERTRANSFORMER_DATA,
|
| 68 |
color="Quantization ποΈ",
|
|
|
|
| 77 |
# add layout
|
| 78 |
prefill_fig.update_layout(
|
| 79 |
title={
|
| 80 |
+
"text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
|
| 81 |
"y": 0.95,
|
| 82 |
"x": 0.5,
|
| 83 |
"xanchor": "center",
|
|
|
|
| 98 |
# plot
|
| 99 |
decode_fig = px.box(
|
| 100 |
bt_df,
|
| 101 |
+
x="Architecture ποΈ",
|
| 102 |
+
y="Decode Speedup (%)",
|
| 103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 104 |
custom_data=BETTERTRANSFORMER_DATA,
|
| 105 |
color="Quantization ποΈ",
|
|
|
|
| 114 |
# add layout
|
| 115 |
decode_fig.update_layout(
|
| 116 |
title={
|
| 117 |
+
"text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
|
| 118 |
"y": 0.95,
|
| 119 |
"x": 0.5,
|
| 120 |
"xanchor": "center",
|
src/control_panel.py
CHANGED
|
@@ -12,13 +12,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
| 12 |
# controls
|
| 13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
| 14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
| 15 |
-
with gr.Row():
|
| 16 |
-
with gr.Column():
|
| 17 |
-
search_bar = gr.Textbox(
|
| 18 |
-
label="Model π€",
|
| 19 |
-
info="π Search for a model name",
|
| 20 |
-
elem_id="search-bar",
|
| 21 |
-
)
|
| 22 |
with gr.Row():
|
| 23 |
with gr.Column(scale=1, variant="panel"):
|
| 24 |
score_slider = gr.Slider(
|
|
@@ -98,7 +91,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
| 98 |
return (
|
| 99 |
filter_button,
|
| 100 |
machine_textbox,
|
| 101 |
-
search_bar,
|
| 102 |
score_slider,
|
| 103 |
memory_slider,
|
| 104 |
backend_checkboxes,
|
|
@@ -110,27 +102,28 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
| 110 |
|
| 111 |
def filter_fn(
|
| 112 |
machine,
|
| 113 |
-
|
|
|
|
|
|
|
| 114 |
backends,
|
| 115 |
datatypes,
|
| 116 |
optimizations,
|
| 117 |
quantizations,
|
|
|
|
| 118 |
columns,
|
| 119 |
-
|
| 120 |
-
memory,
|
| 121 |
):
|
| 122 |
raw_df = get_llm_perf_df(machine=machine)
|
| 123 |
filtered_df = raw_df[
|
| 124 |
-
raw_df["Model π€"].str.contains(model, case=False)
|
| 125 |
-
|
| 126 |
& raw_df["DType π₯"].isin(datatypes)
|
| 127 |
& raw_df["Optimization π οΈ"].isin(optimizations)
|
| 128 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
| 129 |
& (raw_df["Open LLM Score (%)"] >= score)
|
| 130 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
| 131 |
]
|
| 132 |
-
filtered_leaderboard_df =
|
| 133 |
-
filtered_leaderboard_df = filtered_leaderboard_df[columns]
|
| 134 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
| 135 |
filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
| 136 |
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
|
@@ -154,16 +147,18 @@ def filter_fn(
|
|
| 154 |
def create_control_callback(
|
| 155 |
# button
|
| 156 |
filter_button,
|
| 157 |
-
#
|
| 158 |
machine_textbox,
|
| 159 |
-
|
| 160 |
score_slider,
|
| 161 |
memory_slider,
|
| 162 |
backend_checkboxes,
|
| 163 |
datatype_checkboxes,
|
| 164 |
optimization_checkboxes,
|
| 165 |
quantization_checkboxes,
|
|
|
|
| 166 |
columns_checkboxes,
|
|
|
|
| 167 |
# outputs
|
| 168 |
leaderboard_table,
|
| 169 |
lat_score_mem_plot,
|
|
@@ -177,15 +172,18 @@ def create_control_callback(
|
|
| 177 |
filter_button.click(
|
| 178 |
fn=filter_fn,
|
| 179 |
inputs=[
|
|
|
|
| 180 |
machine_textbox,
|
| 181 |
-
|
|
|
|
|
|
|
| 182 |
backend_checkboxes,
|
| 183 |
datatype_checkboxes,
|
| 184 |
optimization_checkboxes,
|
| 185 |
quantization_checkboxes,
|
|
|
|
| 186 |
columns_checkboxes,
|
| 187 |
-
|
| 188 |
-
memory_slider,
|
| 189 |
],
|
| 190 |
outputs=[
|
| 191 |
leaderboard_table,
|
|
@@ -200,23 +198,33 @@ def create_control_callback(
|
|
| 200 |
)
|
| 201 |
|
| 202 |
|
| 203 |
-
def select_fn(machine, columns):
|
| 204 |
raw_df = get_llm_perf_df(machine=machine)
|
| 205 |
selected_leaderboard_df = get_leaderboard_df(raw_df)
|
| 206 |
selected_leaderboard_df = selected_leaderboard_df[columns]
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
return selected_leaderboard_df
|
| 209 |
|
| 210 |
|
| 211 |
def create_select_callback(
|
| 212 |
-
#
|
| 213 |
machine_textbox,
|
|
|
|
| 214 |
columns_checkboxes,
|
|
|
|
| 215 |
# outputs
|
| 216 |
leaderboard_table,
|
| 217 |
):
|
| 218 |
columns_checkboxes.change(
|
| 219 |
fn=select_fn,
|
| 220 |
-
inputs=[machine_textbox, columns_checkboxes],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
outputs=[leaderboard_table],
|
| 222 |
)
|
|
|
|
| 12 |
# controls
|
| 13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
| 14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
with gr.Row():
|
| 16 |
with gr.Column(scale=1, variant="panel"):
|
| 17 |
score_slider = gr.Slider(
|
|
|
|
| 91 |
return (
|
| 92 |
filter_button,
|
| 93 |
machine_textbox,
|
|
|
|
| 94 |
score_slider,
|
| 95 |
memory_slider,
|
| 96 |
backend_checkboxes,
|
|
|
|
| 102 |
|
| 103 |
def filter_fn(
|
| 104 |
machine,
|
| 105 |
+
# inputs
|
| 106 |
+
score,
|
| 107 |
+
memory,
|
| 108 |
backends,
|
| 109 |
datatypes,
|
| 110 |
optimizations,
|
| 111 |
quantizations,
|
| 112 |
+
# interactive
|
| 113 |
columns,
|
| 114 |
+
search,
|
|
|
|
| 115 |
):
|
| 116 |
raw_df = get_llm_perf_df(machine=machine)
|
| 117 |
filtered_df = raw_df[
|
| 118 |
+
# raw_df["Model π€"].str.contains(model, case=False)
|
| 119 |
+
raw_df["Backend π"].isin(backends)
|
| 120 |
& raw_df["DType π₯"].isin(datatypes)
|
| 121 |
& raw_df["Optimization π οΈ"].isin(optimizations)
|
| 122 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
| 123 |
& (raw_df["Open LLM Score (%)"] >= score)
|
| 124 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
| 125 |
]
|
| 126 |
+
filtered_leaderboard_df = select_fn(machine, columns, search)
|
|
|
|
| 127 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
| 128 |
filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
| 129 |
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
|
|
|
| 147 |
def create_control_callback(
|
| 148 |
# button
|
| 149 |
filter_button,
|
| 150 |
+
# fixed
|
| 151 |
machine_textbox,
|
| 152 |
+
# inputs
|
| 153 |
score_slider,
|
| 154 |
memory_slider,
|
| 155 |
backend_checkboxes,
|
| 156 |
datatype_checkboxes,
|
| 157 |
optimization_checkboxes,
|
| 158 |
quantization_checkboxes,
|
| 159 |
+
# interactive
|
| 160 |
columns_checkboxes,
|
| 161 |
+
search_bar,
|
| 162 |
# outputs
|
| 163 |
leaderboard_table,
|
| 164 |
lat_score_mem_plot,
|
|
|
|
| 172 |
filter_button.click(
|
| 173 |
fn=filter_fn,
|
| 174 |
inputs=[
|
| 175 |
+
# fixed
|
| 176 |
machine_textbox,
|
| 177 |
+
# inputs
|
| 178 |
+
score_slider,
|
| 179 |
+
memory_slider,
|
| 180 |
backend_checkboxes,
|
| 181 |
datatype_checkboxes,
|
| 182 |
optimization_checkboxes,
|
| 183 |
quantization_checkboxes,
|
| 184 |
+
# interactive
|
| 185 |
columns_checkboxes,
|
| 186 |
+
search_bar,
|
|
|
|
| 187 |
],
|
| 188 |
outputs=[
|
| 189 |
leaderboard_table,
|
|
|
|
| 198 |
)
|
| 199 |
|
| 200 |
|
| 201 |
+
def select_fn(machine, columns, search):
|
| 202 |
raw_df = get_llm_perf_df(machine=machine)
|
| 203 |
selected_leaderboard_df = get_leaderboard_df(raw_df)
|
| 204 |
selected_leaderboard_df = selected_leaderboard_df[columns]
|
| 205 |
+
selected_leaderboard_df = selected_leaderboard_df[
|
| 206 |
+
selected_leaderboard_df["Model π€"].str.contains(search, case=False)
|
| 207 |
+
]
|
| 208 |
|
| 209 |
return selected_leaderboard_df
|
| 210 |
|
| 211 |
|
| 212 |
def create_select_callback(
|
| 213 |
+
# fixed
|
| 214 |
machine_textbox,
|
| 215 |
+
# interactive
|
| 216 |
columns_checkboxes,
|
| 217 |
+
search_bar,
|
| 218 |
# outputs
|
| 219 |
leaderboard_table,
|
| 220 |
):
|
| 221 |
columns_checkboxes.change(
|
| 222 |
fn=select_fn,
|
| 223 |
+
inputs=[machine_textbox, columns_checkboxes, search_bar],
|
| 224 |
+
outputs=[leaderboard_table],
|
| 225 |
+
)
|
| 226 |
+
search_bar.change(
|
| 227 |
+
fn=select_fn,
|
| 228 |
+
inputs=[machine_textbox, columns_checkboxes, search_bar],
|
| 229 |
outputs=[leaderboard_table],
|
| 230 |
)
|
src/flashattentionv2.py
CHANGED
|
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
| 6 |
FLASHATTENTIONV2_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
| 9 |
-
"Arch ποΈ",
|
| 10 |
"DType π₯",
|
| 11 |
"Backend π",
|
| 12 |
"Params (B)",
|
|
|
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
@@ -18,15 +18,15 @@ FLASHATTENTIONV2_DATA = [
|
|
| 18 |
"Quantization ποΈ",
|
| 19 |
"Optimization π οΈ FlashAttentionV2",
|
| 20 |
# primary measurements
|
| 21 |
-
"Prefill
|
| 22 |
-
"Prefill
|
| 23 |
-
"Decode
|
| 24 |
-
"Decode
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
# speedups
|
| 28 |
-
"Prefill
|
| 29 |
-
"Decode
|
| 30 |
]
|
| 31 |
|
| 32 |
|
|
@@ -43,15 +43,15 @@ def get_fa2_df(llm_perf_df):
|
|
| 43 |
suffixes=["", " FlashAttentionV2"],
|
| 44 |
)
|
| 45 |
# compute speedups
|
| 46 |
-
fa2_df["Prefill
|
| 47 |
-
|
| 48 |
-
)
|
| 49 |
-
fa2_df["Decode
|
| 50 |
-
(fa2_df["Decode
|
| 51 |
).round(2) - 100
|
| 52 |
# filter speedups > 1000%
|
| 53 |
-
fa2_df = fa2_df[fa2_df["Prefill
|
| 54 |
-
fa2_df = fa2_df[fa2_df["Decode
|
| 55 |
|
| 56 |
return fa2_df
|
| 57 |
|
|
@@ -61,8 +61,8 @@ def get_fa2_decode_fig(llm_perf_df):
|
|
| 61 |
# plot
|
| 62 |
decode_fig = px.box(
|
| 63 |
fa2_df,
|
| 64 |
-
x="
|
| 65 |
-
y="Decode
|
| 66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 67 |
custom_data=FLASHATTENTIONV2_DATA,
|
| 68 |
color="Quantization ποΈ",
|
|
@@ -77,7 +77,7 @@ def get_fa2_decode_fig(llm_perf_df):
|
|
| 77 |
# add layout
|
| 78 |
decode_fig.update_layout(
|
| 79 |
title={
|
| 80 |
-
"text": "Decode
|
| 81 |
"y": 0.95,
|
| 82 |
"x": 0.5,
|
| 83 |
"xanchor": "center",
|
|
@@ -98,8 +98,8 @@ def get_fa2_prefill_fig(llm_perf_df):
|
|
| 98 |
# plot
|
| 99 |
prefill_fig = px.box(
|
| 100 |
fa2_df,
|
| 101 |
-
x="
|
| 102 |
-
y="Prefill
|
| 103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 104 |
custom_data=FLASHATTENTIONV2_DATA,
|
| 105 |
color="Quantization ποΈ",
|
|
@@ -114,7 +114,7 @@ def get_fa2_prefill_fig(llm_perf_df):
|
|
| 114 |
# add layout
|
| 115 |
prefill_fig.update_layout(
|
| 116 |
title={
|
| 117 |
-
"text": "Prefill
|
| 118 |
"y": 0.95,
|
| 119 |
"x": 0.5,
|
| 120 |
"xanchor": "center",
|
|
|
|
| 6 |
FLASHATTENTIONV2_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
|
|
|
| 9 |
"DType π₯",
|
| 10 |
"Backend π",
|
| 11 |
"Params (B)",
|
| 12 |
+
"Architecture ποΈ",
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
|
|
| 18 |
"Quantization ποΈ",
|
| 19 |
"Optimization π οΈ FlashAttentionV2",
|
| 20 |
# primary measurements
|
| 21 |
+
"Prefill (s)",
|
| 22 |
+
"Prefill (s) FlashAttentionV2",
|
| 23 |
+
"Decode (tokens/s)",
|
| 24 |
+
"Decode (tokens/s) FlashAttentionV2",
|
| 25 |
+
"End-to-End (tokens/s)",
|
| 26 |
+
"End-to-End (tokens/s) FlashAttentionV2",
|
| 27 |
# speedups
|
| 28 |
+
"Prefill Speedup (%)",
|
| 29 |
+
"Decode Speedup (%)",
|
| 30 |
]
|
| 31 |
|
| 32 |
|
|
|
|
| 43 |
suffixes=["", " FlashAttentionV2"],
|
| 44 |
)
|
| 45 |
# compute speedups
|
| 46 |
+
fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
|
| 47 |
+
2
|
| 48 |
+
) - 100
|
| 49 |
+
fa2_df["Decode Speedup (%)"] = (
|
| 50 |
+
(fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
|
| 51 |
).round(2) - 100
|
| 52 |
# filter speedups > 1000%
|
| 53 |
+
fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
|
| 54 |
+
fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
|
| 55 |
|
| 56 |
return fa2_df
|
| 57 |
|
|
|
|
| 61 |
# plot
|
| 62 |
decode_fig = px.box(
|
| 63 |
fa2_df,
|
| 64 |
+
x="Architecture ποΈ",
|
| 65 |
+
y="Decode Speedup (%)",
|
| 66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 67 |
custom_data=FLASHATTENTIONV2_DATA,
|
| 68 |
color="Quantization ποΈ",
|
|
|
|
| 77 |
# add layout
|
| 78 |
decode_fig.update_layout(
|
| 79 |
title={
|
| 80 |
+
"text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
|
| 81 |
"y": 0.95,
|
| 82 |
"x": 0.5,
|
| 83 |
"xanchor": "center",
|
|
|
|
| 98 |
# plot
|
| 99 |
prefill_fig = px.box(
|
| 100 |
fa2_df,
|
| 101 |
+
x="Architecture ποΈ",
|
| 102 |
+
y="Prefill Speedup (%)",
|
| 103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 104 |
custom_data=FLASHATTENTIONV2_DATA,
|
| 105 |
color="Quantization ποΈ",
|
|
|
|
| 114 |
# add layout
|
| 115 |
prefill_fig.update_layout(
|
| 116 |
title={
|
| 117 |
+
"text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
|
| 118 |
"y": 0.95,
|
| 119 |
"x": 0.5,
|
| 120 |
"xanchor": "center",
|
src/latency_score_memory.py
CHANGED
|
@@ -4,18 +4,18 @@ import plotly.express as px
|
|
| 4 |
|
| 5 |
SCORE_MEMORY_LATENCY_DATA = [
|
| 6 |
"Model π€",
|
| 7 |
-
"Arch ποΈ",
|
| 8 |
-
"Params (B)",
|
| 9 |
"DType π₯",
|
| 10 |
"Backend π",
|
|
|
|
|
|
|
| 11 |
"Optimization π οΈ",
|
| 12 |
"Quantization ποΈ",
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
-
"Prefill
|
| 15 |
-
"Decode
|
| 16 |
-
"
|
| 17 |
-
"
|
| 18 |
-
# "
|
| 19 |
]
|
| 20 |
|
| 21 |
|
|
@@ -24,10 +24,10 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
| 24 |
# plot
|
| 25 |
fig = px.scatter(
|
| 26 |
copy_df,
|
| 27 |
-
x="
|
| 28 |
y="Open LLM Score (%)",
|
| 29 |
-
size="
|
| 30 |
-
color="
|
| 31 |
custom_data=SCORE_MEMORY_LATENCY_DATA,
|
| 32 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 33 |
)
|
|
@@ -38,7 +38,7 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
| 38 |
)
|
| 39 |
fig.update_layout(
|
| 40 |
title={
|
| 41 |
-
"text": "
|
| 42 |
"y": 0.95,
|
| 43 |
"x": 0.5,
|
| 44 |
"xanchor": "center",
|
|
@@ -56,7 +56,7 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
| 56 |
|
| 57 |
def create_lat_score_mem_plot(llm_perf_df):
|
| 58 |
# descriptive text
|
| 59 |
-
gr.HTML("π Hover over the points π for additional information. ",elem_id="text")
|
| 60 |
# get figure
|
| 61 |
fig = get_lat_score_mem_fig(llm_perf_df)
|
| 62 |
# create plot
|
|
|
|
| 4 |
|
| 5 |
SCORE_MEMORY_LATENCY_DATA = [
|
| 6 |
"Model π€",
|
|
|
|
|
|
|
| 7 |
"DType π₯",
|
| 8 |
"Backend π",
|
| 9 |
+
"Params (B)",
|
| 10 |
+
"Architecture ποΈ",
|
| 11 |
"Optimization π οΈ",
|
| 12 |
"Quantization ποΈ",
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
+
"Prefill (s)",
|
| 15 |
+
"Decode (tokens/s)",
|
| 16 |
+
"Memory (MB)",
|
| 17 |
+
"End-to-End (s)",
|
| 18 |
+
# "End-to-End (tokens/s)",
|
| 19 |
]
|
| 20 |
|
| 21 |
|
|
|
|
| 24 |
# plot
|
| 25 |
fig = px.scatter(
|
| 26 |
copy_df,
|
| 27 |
+
x="End-to-End (s)",
|
| 28 |
y="Open LLM Score (%)",
|
| 29 |
+
size="Memory (MB)",
|
| 30 |
+
color="Architecture ποΈ",
|
| 31 |
custom_data=SCORE_MEMORY_LATENCY_DATA,
|
| 32 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 33 |
)
|
|
|
|
| 38 |
)
|
| 39 |
fig.update_layout(
|
| 40 |
title={
|
| 41 |
+
"text": "vs. Score vs. Memory",
|
| 42 |
"y": 0.95,
|
| 43 |
"x": 0.5,
|
| 44 |
"xanchor": "center",
|
|
|
|
| 56 |
|
| 57 |
def create_lat_score_mem_plot(llm_perf_df):
|
| 58 |
# descriptive text
|
| 59 |
+
gr.HTML("π Hover over the points π for additional information. ", elem_id="text")
|
| 60 |
# get figure
|
| 61 |
fig = get_lat_score_mem_fig(llm_perf_df)
|
| 62 |
# create plot
|
src/leaderboard.py
CHANGED
|
@@ -8,9 +8,9 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
|
|
| 8 |
"Model π€": "markdown",
|
| 9 |
"Experiment π§ͺ": "str",
|
| 10 |
# primary measurements
|
| 11 |
-
"Prefill
|
| 12 |
-
"Decode
|
| 13 |
-
"
|
| 14 |
"Energy (tokens/kWh)": "number",
|
| 15 |
# deployment settings
|
| 16 |
"DType π₯": "str",
|
|
@@ -18,15 +18,25 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
|
|
| 18 |
"Optimization π οΈ": "str",
|
| 19 |
"Quantization ποΈ": "str",
|
| 20 |
# additional measurements
|
| 21 |
-
"
|
| 22 |
"Params (B)": "number",
|
| 23 |
"Open LLM Score (%)": "number",
|
| 24 |
-
"
|
| 25 |
-
"
|
| 26 |
"Reserved Memory (MB)": "number",
|
| 27 |
"Used Memory (MB)": "number",
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def process_model(model_name):
|
| 32 |
link = f"https://huggingface.co/{model_name}"
|
|
@@ -48,20 +58,29 @@ def get_leaderboard_df(llm_perf_df):
|
|
| 48 |
def create_leaderboard_table(llm_perf_df):
|
| 49 |
# get dataframe
|
| 50 |
leaderboard_df = get_leaderboard_df(llm_perf_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# create checkboxes
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
| 59 |
# create table
|
| 60 |
leaderboard_table = gr.components.Dataframe(
|
| 61 |
-
value=leaderboard_df,
|
| 62 |
datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
|
| 63 |
headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
| 64 |
elem_id="leaderboard-table",
|
| 65 |
)
|
| 66 |
|
| 67 |
-
return
|
|
|
|
| 8 |
"Model π€": "markdown",
|
| 9 |
"Experiment π§ͺ": "str",
|
| 10 |
# primary measurements
|
| 11 |
+
"Prefill (s)": "number",
|
| 12 |
+
"Decode (tokens/s)": "number",
|
| 13 |
+
"Memory (MB)": "number",
|
| 14 |
"Energy (tokens/kWh)": "number",
|
| 15 |
# deployment settings
|
| 16 |
"DType π₯": "str",
|
|
|
|
| 18 |
"Optimization π οΈ": "str",
|
| 19 |
"Quantization ποΈ": "str",
|
| 20 |
# additional measurements
|
| 21 |
+
"Architecture ποΈ": "markdown",
|
| 22 |
"Params (B)": "number",
|
| 23 |
"Open LLM Score (%)": "number",
|
| 24 |
+
"End-to-End (s)": "number",
|
| 25 |
+
"End-to-End (tokens/s)": "number",
|
| 26 |
"Reserved Memory (MB)": "number",
|
| 27 |
"Used Memory (MB)": "number",
|
| 28 |
}
|
| 29 |
|
| 30 |
+
PRIMARY_COLUMNS = [
|
| 31 |
+
"Model π€",
|
| 32 |
+
"Experiment π§ͺ",
|
| 33 |
+
"Prefill (s)",
|
| 34 |
+
"Decode (tokens/s)",
|
| 35 |
+
"Memory (MB)",
|
| 36 |
+
"Energy (tokens/kWh)",
|
| 37 |
+
"Open LLM Score (%)",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
|
| 41 |
def process_model(model_name):
|
| 42 |
link = f"https://huggingface.co/{model_name}"
|
|
|
|
| 58 |
def create_leaderboard_table(llm_perf_df):
|
| 59 |
# get dataframe
|
| 60 |
leaderboard_df = get_leaderboard_df(llm_perf_df)
|
| 61 |
+
|
| 62 |
+
# create search bar
|
| 63 |
+
with gr.Row():
|
| 64 |
+
search_bar = gr.Textbox(
|
| 65 |
+
label="Model π€",
|
| 66 |
+
info="π Search for a model name",
|
| 67 |
+
elem_id="search-bar",
|
| 68 |
+
)
|
| 69 |
# create checkboxes
|
| 70 |
+
with gr.Row():
|
| 71 |
+
columns_checkboxes = gr.CheckboxGroup(
|
| 72 |
+
label="Columns π",
|
| 73 |
+
value=PRIMARY_COLUMNS,
|
| 74 |
+
choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
| 75 |
+
info="βοΈ Select the columns to display",
|
| 76 |
+
elem_id="columns-checkboxes",
|
| 77 |
+
)
|
| 78 |
# create table
|
| 79 |
leaderboard_table = gr.components.Dataframe(
|
| 80 |
+
value=leaderboard_df[PRIMARY_COLUMNS],
|
| 81 |
datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
|
| 82 |
headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
|
| 83 |
elem_id="leaderboard-table",
|
| 84 |
)
|
| 85 |
|
| 86 |
+
return search_bar, columns_checkboxes, leaderboard_table
|
src/llm_perf.py
CHANGED
|
@@ -12,9 +12,9 @@ COLUMNS_MAPPING = {
|
|
| 12 |
"Model": "Model π€",
|
| 13 |
"experiment_name": "Experiment π§ͺ",
|
| 14 |
# primary measurements
|
| 15 |
-
"forward.latency(s)": "Prefill
|
| 16 |
-
"decode.throughput(tokens/s)": "Decode
|
| 17 |
-
"generate.max_memory_allocated(MB)": "
|
| 18 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
| 19 |
# deployment settings
|
| 20 |
"backend.name": "Backend π",
|
|
@@ -22,18 +22,18 @@ COLUMNS_MAPPING = {
|
|
| 22 |
"optimization": "Optimization π οΈ",
|
| 23 |
"quantization": "Quantization ποΈ",
|
| 24 |
# additional measurements
|
| 25 |
-
"Arch": "Arch ποΈ",
|
| 26 |
"Size": "Params (B)",
|
|
|
|
| 27 |
"Score": "Open LLM Score (%)",
|
| 28 |
-
"generate.latency(s)": "
|
| 29 |
-
"generate.throughput(tokens/s)": "
|
| 30 |
"generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
| 31 |
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
| 32 |
}
|
| 33 |
SORTING_COLUMNS = [
|
| 34 |
"Open LLM Score (%)",
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
]
|
| 38 |
SORTING_ASCENDING = [False, True, False]
|
| 39 |
|
|
@@ -107,6 +107,13 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
|
|
| 107 |
].apply(lambda x: process_quantization_scheme(x), axis=1)
|
| 108 |
# process experiment name
|
| 109 |
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# add arch
|
| 111 |
llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
|
| 112 |
# filter columns
|
|
|
|
| 12 |
"Model": "Model π€",
|
| 13 |
"experiment_name": "Experiment π§ͺ",
|
| 14 |
# primary measurements
|
| 15 |
+
"forward.latency(s)": "Prefill (s)",
|
| 16 |
+
"decode.throughput(tokens/s)": "Decode (tokens/s)",
|
| 17 |
+
"generate.max_memory_allocated(MB)": "Memory (MB)",
|
| 18 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
| 19 |
# deployment settings
|
| 20 |
"backend.name": "Backend π",
|
|
|
|
| 22 |
"optimization": "Optimization π οΈ",
|
| 23 |
"quantization": "Quantization ποΈ",
|
| 24 |
# additional measurements
|
|
|
|
| 25 |
"Size": "Params (B)",
|
| 26 |
+
"Arch": "Architecture ποΈ",
|
| 27 |
"Score": "Open LLM Score (%)",
|
| 28 |
+
"generate.latency(s)": "End-to-End (s)",
|
| 29 |
+
"generate.throughput(tokens/s)": "End-to-End (tokens/s)",
|
| 30 |
"generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
| 31 |
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
| 32 |
}
|
| 33 |
SORTING_COLUMNS = [
|
| 34 |
"Open LLM Score (%)",
|
| 35 |
+
"Decode (tokens/s)",
|
| 36 |
+
"Prefill (s)",
|
| 37 |
]
|
| 38 |
SORTING_ASCENDING = [False, True, False]
|
| 39 |
|
|
|
|
| 107 |
].apply(lambda x: process_quantization_scheme(x), axis=1)
|
| 108 |
# process experiment name
|
| 109 |
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
|
| 110 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
|
| 111 |
+
lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
|
| 112 |
+
)
|
| 113 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "awq"))
|
| 114 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "gptq"))
|
| 115 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "sdpa"))
|
| 116 |
+
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA2"))
|
| 117 |
# add arch
|
| 118 |
llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
|
| 119 |
# filter columns
|
src/quantization_kernels.py
CHANGED
|
@@ -6,10 +6,10 @@ import plotly.express as px
|
|
| 6 |
QUANT_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
| 9 |
-
"Arch ποΈ",
|
| 10 |
"DType π₯",
|
| 11 |
"Backend π",
|
| 12 |
"Params (B)",
|
|
|
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
@@ -19,13 +19,13 @@ QUANT_DATA = [
|
|
| 19 |
"Optimization π οΈ Custom Kernel",
|
| 20 |
"Quantization ποΈ Custom Kernel",
|
| 21 |
# primary measurements
|
| 22 |
-
"Prefill
|
| 23 |
-
"Prefill
|
| 24 |
-
"Decode
|
| 25 |
-
"Decode
|
| 26 |
# speedups
|
| 27 |
-
"Prefill
|
| 28 |
-
"Decode
|
| 29 |
]
|
| 30 |
|
| 31 |
|
|
@@ -33,10 +33,10 @@ def get_quant_df(llm_perf_df):
|
|
| 33 |
copy_df = llm_perf_df.copy()
|
| 34 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
| 35 |
vanilla_df = copy_df[
|
| 36 |
-
(copy_df["Backend π"] == "pytorch")
|
| 37 |
-
(copy_df["Quantization ποΈ"] == "None")
|
| 38 |
-
(copy_df["Optimization π οΈ"] == "None")
|
| 39 |
-
(copy_df["DType π₯"] == "float16")
|
| 40 |
]
|
| 41 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
| 42 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
|
@@ -70,15 +70,15 @@ def get_quant_df(llm_perf_df):
|
|
| 70 |
# concat the four custom-kernel dataframes row-wise
|
| 71 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
| 72 |
# compute speedups
|
| 73 |
-
quant_df["Prefill
|
| 74 |
-
|
| 75 |
-
)
|
| 76 |
-
quant_df["Decode
|
| 77 |
-
(quant_df["Decode
|
| 78 |
).round(2) - 100
|
| 79 |
# filter speedups > 1000%
|
| 80 |
-
quant_df = quant_df[quant_df["Prefill
|
| 81 |
-
quant_df = quant_df[quant_df["Decode
|
| 82 |
|
| 83 |
return quant_df
|
| 84 |
|
|
@@ -88,8 +88,8 @@ def get_quant_decode_fig(llm_perf_df):
|
|
| 88 |
# plot
|
| 89 |
decode_fig = px.box(
|
| 90 |
quant_df,
|
| 91 |
-
x="
|
| 92 |
-
y="Decode
|
| 93 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 94 |
custom_data=QUANT_DATA,
|
| 95 |
color="Quantization ποΈ Custom Kernel",
|
|
@@ -102,7 +102,7 @@ def get_quant_decode_fig(llm_perf_df):
|
|
| 102 |
# add layout
|
| 103 |
decode_fig.update_layout(
|
| 104 |
title={
|
| 105 |
-
"text": "Decode
|
| 106 |
"y": 0.95,
|
| 107 |
"x": 0.5,
|
| 108 |
"xanchor": "center",
|
|
@@ -123,8 +123,8 @@ def get_quant_prefill_fig(llm_perf_df):
|
|
| 123 |
# plot
|
| 124 |
prefill_fig = px.box(
|
| 125 |
quant_df,
|
| 126 |
-
x="
|
| 127 |
-
y="Prefill
|
| 128 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 129 |
custom_data=QUANT_DATA,
|
| 130 |
color="Quantization ποΈ Custom Kernel",
|
|
@@ -137,7 +137,7 @@ def get_quant_prefill_fig(llm_perf_df):
|
|
| 137 |
# add layout
|
| 138 |
prefill_fig.update_layout(
|
| 139 |
title={
|
| 140 |
-
"text": "Prefill
|
| 141 |
"y": 0.95,
|
| 142 |
"x": 0.5,
|
| 143 |
"xanchor": "center",
|
|
|
|
| 6 |
QUANT_DATA = [
|
| 7 |
# open llm
|
| 8 |
"Model π€",
|
|
|
|
| 9 |
"DType π₯",
|
| 10 |
"Backend π",
|
| 11 |
"Params (B)",
|
| 12 |
+
"Architecture ποΈ",
|
| 13 |
"Open LLM Score (%)",
|
| 14 |
# deployment settings
|
| 15 |
"DType π₯",
|
|
|
|
| 19 |
"Optimization π οΈ Custom Kernel",
|
| 20 |
"Quantization ποΈ Custom Kernel",
|
| 21 |
# primary measurements
|
| 22 |
+
"Prefill (s)",
|
| 23 |
+
"Prefill (s) Custom Kernel",
|
| 24 |
+
"Decode (tokens/s)",
|
| 25 |
+
"Decode (tokens/s) Custom Kernel",
|
| 26 |
# speedups
|
| 27 |
+
"Prefill Speedup (%)",
|
| 28 |
+
"Decode Speedup (%)",
|
| 29 |
]
|
| 30 |
|
| 31 |
|
|
|
|
| 33 |
copy_df = llm_perf_df.copy()
|
| 34 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
| 35 |
vanilla_df = copy_df[
|
| 36 |
+
(copy_df["Backend π"] == "pytorch")
|
| 37 |
+
& (copy_df["Quantization ποΈ"] == "None")
|
| 38 |
+
& (copy_df["Optimization π οΈ"] == "None")
|
| 39 |
+
& (copy_df["DType π₯"] == "float16")
|
| 40 |
]
|
| 41 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
| 42 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
|
|
|
| 70 |
# concat the four custom-kernel dataframes row-wise
|
| 71 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
| 72 |
# compute speedups
|
| 73 |
+
quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
|
| 74 |
+
2
|
| 75 |
+
) - 100
|
| 76 |
+
quant_df["Decode Speedup (%)"] = (
|
| 77 |
+
(quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
|
| 78 |
).round(2) - 100
|
| 79 |
# filter speedups > 1000%
|
| 80 |
+
quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
|
| 81 |
+
quant_df = quant_df[quant_df["Decode Speedup (%)"] < 1000]
|
| 82 |
|
| 83 |
return quant_df
|
| 84 |
|
|
|
|
| 88 |
# plot
|
| 89 |
decode_fig = px.box(
|
| 90 |
quant_df,
|
| 91 |
+
x="Architecture ποΈ",
|
| 92 |
+
y="Decode Speedup (%)",
|
| 93 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 94 |
custom_data=QUANT_DATA,
|
| 95 |
color="Quantization ποΈ Custom Kernel",
|
|
|
|
| 102 |
# add layout
|
| 103 |
decode_fig.update_layout(
|
| 104 |
title={
|
| 105 |
+
"text": "Decode Speedup per Architecture",
|
| 106 |
"y": 0.95,
|
| 107 |
"x": 0.5,
|
| 108 |
"xanchor": "center",
|
|
|
|
| 123 |
# plot
|
| 124 |
prefill_fig = px.box(
|
| 125 |
quant_df,
|
| 126 |
+
x="Architecture ποΈ",
|
| 127 |
+
y="Prefill Speedup (%)",
|
| 128 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 129 |
custom_data=QUANT_DATA,
|
| 130 |
color="Quantization ποΈ Custom Kernel",
|
|
|
|
| 137 |
# add layout
|
| 138 |
prefill_fig.update_layout(
|
| 139 |
title={
|
| 140 |
+
"text": "Prefill Speedup per Architecture",
|
| 141 |
"y": 0.95,
|
| 142 |
"x": 0.5,
|
| 143 |
"xanchor": "center",
|