Spaces:

optimum
/

llm-perf-leaderboard

Running

App Files Community

IlyasMoutawwakil HF Staff committed on Apr 16, 2024

Commit

0232cf1

1 Parent(s): a8a6326

update

Browse files

Files changed (8) hide show

app.py +5 -3
src/bettertransformer.py +21 -21
src/control_panel.py +31 -23
src/flashattentionv2.py +22 -22
src/latency_score_memory.py +12 -12
src/leaderboard.py +34 -15
src/llm_perf.py +15 -8
src/quantization_kernels.py +24 -24

app.py CHANGED Viewed

@@ -35,7 +35,6 @@ with demo:
                 (
                     filter_button,
                     machine_textbox,
-                    search_bar,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
@@ -48,7 +47,7 @@ with demo:
                     llm_perf_df = get_llm_perf_df(machine=machine)
                     ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard 🏅", id=0):
-                        leaderboard_table, columns_checkboxes = create_leaderboard_table(llm_perf_df)
                         lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
                     ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
                     with gr.TabItem("BetterTransformer 📈", id=2):
@@ -63,14 +62,15 @@ with demo:
                     filter_button,
                     # inputs
                     machine_textbox,
-                    search_bar,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
                     datatype_checkboxes,
                     optimization_checkboxes,
                     quantization_checkboxes,
                     columns_checkboxes,
                     # outputs
                     leaderboard_table,
                     lat_score_mem_plot,
@@ -85,7 +85,9 @@ with demo:
                 create_select_callback(
                     # inputs
                     machine_textbox,
                     columns_checkboxes,
                     # outputs
                     leaderboard_table,
                 )

                 (
                     filter_button,
                     machine_textbox,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
                     llm_perf_df = get_llm_perf_df(machine=machine)
                     ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard 🏅", id=0):
+                        search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
                         lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
                     ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
                     with gr.TabItem("BetterTransformer 📈", id=2):
                     filter_button,
                     # inputs
                     machine_textbox,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
                     datatype_checkboxes,
                     optimization_checkboxes,
                     quantization_checkboxes,
+                    # interactive
                     columns_checkboxes,
+                    search_bar,
                     # outputs
                     leaderboard_table,
                     lat_score_mem_plot,
                 create_select_callback(
                     # inputs
                     machine_textbox,
+                    # interactive
                     columns_checkboxes,
+                    search_bar,
                     # outputs
                     leaderboard_table,
                 )

src/bettertransformer.py CHANGED Viewed

@@ -6,10 +6,10 @@ import plotly.express as px
 BETTERTRANSFORMER_DATA = [
     # open llm
     "Model 🤗",
-    "Arch 🏛️",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
@@ -18,15 +18,15 @@ BETTERTRANSFORMER_DATA = [
     "Quantization 🗜️",
     "Optimization 🛠️ BetterTransformer",
     # primary measurements
-    "Prefill Latency (s)",
-    "Prefill Latency (s) BetterTransformer",
-    "Decode Throughput (tokens/s)",
-    "Decode Throughput (tokens/s) BetterTransformer",
-    "E2E Throughput (tokens/s)",
-    "E2E Throughput (tokens/s) BetterTransformer",
     # speedups
-    "Prefill Latency Speedup (%)",
-    "Decode Throughput Speedup (%)",
 ]
@@ -43,15 +43,15 @@ def get_bt_df(llm_perf_df):
         suffixes=["", " BetterTransformer"],
     )
     # compute speedups
-    bt_df["Prefill Latency Speedup (%)"] = (
-        (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
     ).round(2) - 100
-    bt_df["Decode Throughput Speedup (%)"] = (
-        (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
-    bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
-    bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
     return bt_df
@@ -61,8 +61,8 @@ def get_bt_prefill_fig(llm_perf_df):
     # plot
     prefill_fig = px.box(
         bt_df,
-        x="Arch 🏛️",
-        y="Prefill Latency Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=BETTERTRANSFORMER_DATA,
         color="Quantization 🗜️",
@@ -77,7 +77,7 @@ def get_bt_prefill_fig(llm_perf_df):
     # add layout
     prefill_fig.update_layout(
         title={
-            "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
@@ -98,8 +98,8 @@ def get_bt_decode_fig(llm_perf_df):
     # plot
     decode_fig = px.box(
         bt_df,
-        x="Arch 🏛️",
-        y="Decode Throughput Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=BETTERTRANSFORMER_DATA,
         color="Quantization 🗜️",
@@ -114,7 +114,7 @@ def get_bt_decode_fig(llm_perf_df):
     # add layout
     decode_fig.update_layout(
         title={
-            "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",

 BETTERTRANSFORMER_DATA = [
     # open llm
     "Model 🤗",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
+    "Architecture 🏛️",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
     "Quantization 🗜️",
     "Optimization 🛠️ BetterTransformer",
     # primary measurements
+    "Prefill (s)",
+    "Prefill (s) BetterTransformer",
+    "Decode (tokens/s)",
+    "Decode (tokens/s) BetterTransformer",
+    "End-to-End (tokens/s)",
+    "End-to-End (tokens/s) BetterTransformer",
     # speedups
+    "Prefill Speedup (%)",
+    "Decode Speedup (%)",
 ]
         suffixes=["", " BetterTransformer"],
     )
     # compute speedups
+    bt_df["Prefill Speedup (%)"] = (
+        (bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
     ).round(2) - 100
+    bt_df["Decode Speedup (%)"] = (
+        (bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
+    bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
+    bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
     return bt_df
     # plot
     prefill_fig = px.box(
         bt_df,
+        x="Architecture 🏛️",
+        y="Prefill Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=BETTERTRANSFORMER_DATA,
         color="Quantization 🗜️",
     # add layout
     prefill_fig.update_layout(
         title={
+            "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
     # plot
     decode_fig = px.box(
         bt_df,
+        x="Architecture 🏛️",
+        y="Decode Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=BETTERTRANSFORMER_DATA,
         color="Quantization 🗜️",
     # add layout
     decode_fig.update_layout(
         title={
+            "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",

src/control_panel.py CHANGED Viewed

@@ -12,13 +12,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Accordion("Control Panel 🎛️", open=False, elem_id="control-panel"):
-        with gr.Row():
-            with gr.Column():
-                search_bar = gr.Textbox(
-                    label="Model 🤗",
-                    info="🔍 Search for a model name",
-                    elem_id="search-bar",
-                )
         with gr.Row():
             with gr.Column(scale=1, variant="panel"):
                 score_slider = gr.Slider(
@@ -98,7 +91,6 @@ def create_control_panel(machine: str = "hf-dgx-01"):
     return (
         filter_button,
         machine_textbox,
-        search_bar,
         score_slider,
         memory_slider,
         backend_checkboxes,
@@ -110,27 +102,28 @@ def create_control_panel(machine: str = "hf-dgx-01"):
 def filter_fn(
     machine,
-    model,
     backends,
     datatypes,
     optimizations,
     quantizations,
     columns,
-    score,
-    memory,
 ):
     raw_df = get_llm_perf_df(machine=machine)
     filtered_df = raw_df[
-        raw_df["Model 🤗"].str.contains(model, case=False)
-        & raw_df["Backend 🏭"].isin(backends)
         & raw_df["DType 📥"].isin(datatypes)
         & raw_df["Optimization 🛠️"].isin(optimizations)
         & raw_df["Quantization 🗜️"].isin(quantizations)
         & (raw_df["Open LLM Score (%)"] >= score)
         & (raw_df["Allocated Memory (MB)"] <= memory)
     ]
-    filtered_leaderboard_df = get_leaderboard_df(filtered_df)
-    filtered_leaderboard_df = filtered_leaderboard_df[columns]
     filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
     filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
     filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
@@ -154,16 +147,18 @@ def filter_fn(
 def create_control_callback(
     # button
     filter_button,
-    # inputs
     machine_textbox,
-    search_bar,
     score_slider,
     memory_slider,
     backend_checkboxes,
     datatype_checkboxes,
     optimization_checkboxes,
     quantization_checkboxes,
     columns_checkboxes,
     # outputs
     leaderboard_table,
     lat_score_mem_plot,
@@ -177,15 +172,18 @@ def create_control_callback(
     filter_button.click(
         fn=filter_fn,
         inputs=[
             machine_textbox,
-            search_bar,
             backend_checkboxes,
             datatype_checkboxes,
             optimization_checkboxes,
             quantization_checkboxes,
             columns_checkboxes,
-            score_slider,
-            memory_slider,
         ],
         outputs=[
             leaderboard_table,
@@ -200,23 +198,33 @@ def create_control_callback(
     )
-def select_fn(machine, columns):
     raw_df = get_llm_perf_df(machine=machine)
     selected_leaderboard_df = get_leaderboard_df(raw_df)
     selected_leaderboard_df = selected_leaderboard_df[columns]
     return selected_leaderboard_df
 def create_select_callback(
-    # inputs
     machine_textbox,
     columns_checkboxes,
     # outputs
     leaderboard_table,
 ):
     columns_checkboxes.change(
         fn=select_fn,
-        inputs=[machine_textbox, columns_checkboxes],
         outputs=[leaderboard_table],
     )

     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Accordion("Control Panel 🎛️", open=False, elem_id="control-panel"):
         with gr.Row():
             with gr.Column(scale=1, variant="panel"):
                 score_slider = gr.Slider(
     return (
         filter_button,
         machine_textbox,
         score_slider,
         memory_slider,
         backend_checkboxes,
 def filter_fn(
     machine,
+    # inputs
+    score,
+    memory,
     backends,
     datatypes,
     optimizations,
     quantizations,
+    # interactive
     columns,
+    search,
 ):
     raw_df = get_llm_perf_df(machine=machine)
     filtered_df = raw_df[
+        # raw_df["Model 🤗"].str.contains(model, case=False)
+        raw_df["Backend 🏭"].isin(backends)
         & raw_df["DType 📥"].isin(datatypes)
         & raw_df["Optimization 🛠️"].isin(optimizations)
         & raw_df["Quantization 🗜️"].isin(quantizations)
         & (raw_df["Open LLM Score (%)"] >= score)
         & (raw_df["Allocated Memory (MB)"] <= memory)
     ]
+    filtered_leaderboard_df = select_fn(machine, columns, search)
     filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
     filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
     filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
 def create_control_callback(
     # button
     filter_button,
+    # fixed
     machine_textbox,
+    # inputs
     score_slider,
     memory_slider,
     backend_checkboxes,
     datatype_checkboxes,
     optimization_checkboxes,
     quantization_checkboxes,
+    # interactive
     columns_checkboxes,
+    search_bar,
     # outputs
     leaderboard_table,
     lat_score_mem_plot,
     filter_button.click(
         fn=filter_fn,
         inputs=[
+            # fixed
             machine_textbox,
+            # inputs
+            score_slider,
+            memory_slider,
             backend_checkboxes,
             datatype_checkboxes,
             optimization_checkboxes,
             quantization_checkboxes,
+            # interactive
             columns_checkboxes,
+            search_bar,
         ],
         outputs=[
             leaderboard_table,
     )
+def select_fn(machine, columns, search):
     raw_df = get_llm_perf_df(machine=machine)
     selected_leaderboard_df = get_leaderboard_df(raw_df)
     selected_leaderboard_df = selected_leaderboard_df[columns]
+    selected_leaderboard_df = selected_leaderboard_df[
+        selected_leaderboard_df["Model 🤗"].str.contains(search, case=False)
+    ]
     return selected_leaderboard_df
 def create_select_callback(
+    # fixed
     machine_textbox,
+    # interactive
     columns_checkboxes,
+    search_bar,
     # outputs
     leaderboard_table,
 ):
     columns_checkboxes.change(
         fn=select_fn,
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
+        outputs=[leaderboard_table],
+    )
+    search_bar.change(
+        fn=select_fn,
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
         outputs=[leaderboard_table],
     )

src/flashattentionv2.py CHANGED Viewed

@@ -6,10 +6,10 @@ import plotly.express as px
 FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
-    "Arch 🏛️",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
@@ -18,15 +18,15 @@ FLASHATTENTIONV2_DATA = [
     "Quantization 🗜️",
     "Optimization 🛠️ FlashAttentionV2",
     # primary measurements
-    "Prefill Latency (s)",
-    "Prefill Latency (s) FlashAttentionV2",
-    "Decode Throughput (tokens/s)",
-    "Decode Throughput (tokens/s) FlashAttentionV2",
-    "E2E Throughput (tokens/s)",
-    "E2E Throughput (tokens/s) FlashAttentionV2",
     # speedups
-    "Prefill Latency Speedup (%)",
-    "Decode Throughput Speedup (%)",
 ]
@@ -43,15 +43,15 @@ def get_fa2_df(llm_perf_df):
         suffixes=["", " FlashAttentionV2"],
     )
     # compute speedups
-    fa2_df["Prefill Latency Speedup (%)"] = (
-        (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
-    ).round(2) - 100
-    fa2_df["Decode Throughput Speedup (%)"] = (
-        (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
-    fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
-    fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]
     return fa2_df
@@ -61,8 +61,8 @@ def get_fa2_decode_fig(llm_perf_df):
     # plot
     decode_fig = px.box(
         fa2_df,
-        x="Arch 🏛️",
-        y="Decode Throughput Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=FLASHATTENTIONV2_DATA,
         color="Quantization 🗜️",
@@ -77,7 +77,7 @@ def get_fa2_decode_fig(llm_perf_df):
     # add layout
     decode_fig.update_layout(
         title={
-            "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
@@ -98,8 +98,8 @@ def get_fa2_prefill_fig(llm_perf_df):
     # plot
     prefill_fig = px.box(
         fa2_df,
-        x="Arch 🏛️",
-        y="Prefill Latency Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=FLASHATTENTIONV2_DATA,
         color="Quantization 🗜️",
@@ -114,7 +114,7 @@ def get_fa2_prefill_fig(llm_perf_df):
     # add layout
     prefill_fig.update_layout(
         title={
-            "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",

 FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
+    "Architecture 🏛️",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
     "Quantization 🗜️",
     "Optimization 🛠️ FlashAttentionV2",
     # primary measurements
+    "Prefill (s)",
+    "Prefill (s) FlashAttentionV2",
+    "Decode (tokens/s)",
+    "Decode (tokens/s) FlashAttentionV2",
+    "End-to-End (tokens/s)",
+    "End-to-End (tokens/s) FlashAttentionV2",
     # speedups
+    "Prefill Speedup (%)",
+    "Decode Speedup (%)",
 ]
         suffixes=["", " FlashAttentionV2"],
     )
     # compute speedups
+    fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
+        2
+    ) - 100
+    fa2_df["Decode Speedup (%)"] = (
+        (fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
+    fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
+    fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
     return fa2_df
     # plot
     decode_fig = px.box(
         fa2_df,
+        x="Architecture 🏛️",
+        y="Decode Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=FLASHATTENTIONV2_DATA,
         color="Quantization 🗜️",
     # add layout
     decode_fig.update_layout(
         title={
+            "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
     # plot
     prefill_fig = px.box(
         fa2_df,
+        x="Architecture 🏛️",
+        y="Prefill Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=FLASHATTENTIONV2_DATA,
         color="Quantization 🗜️",
     # add layout
     prefill_fig.update_layout(
         title={
+            "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",

src/latency_score_memory.py CHANGED Viewed

@@ -4,18 +4,18 @@ import plotly.express as px
 SCORE_MEMORY_LATENCY_DATA = [
     "Model 🤗",
-    "Arch 🏛️",
-    "Params (B)",
     "DType 📥",
     "Backend 🏭",
     "Optimization 🛠️",
     "Quantization 🗜️",
     "Open LLM Score (%)",
-    "Prefill Latency (s)",
-    "Decode Throughput (tokens/s)",
-    "Allocated Memory (MB)",
-    "E2E Latency (s)",
-    # "E2E Throughput (tokens/s)",
 ]
@@ -24,10 +24,10 @@ def get_lat_score_mem_fig(llm_perf_df):
     # plot
     fig = px.scatter(
         copy_df,
-        x="E2E Latency (s)",
         y="Open LLM Score (%)",
-        size="Allocated Memory (MB)",
-        color="Arch 🏛️",
         custom_data=SCORE_MEMORY_LATENCY_DATA,
         color_discrete_sequence=px.colors.qualitative.Light24,
     )
@@ -38,7 +38,7 @@ def get_lat_score_mem_fig(llm_perf_df):
     )
     fig.update_layout(
         title={
-            "text": "Latency vs. Score vs. Memory",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
@@ -56,7 +56,7 @@ def get_lat_score_mem_fig(llm_perf_df):
 def create_lat_score_mem_plot(llm_perf_df):
     # descriptive text
-    gr.HTML("👆 Hover over the points 👆 for additional information. ",elem_id="text")
     # get figure
     fig = get_lat_score_mem_fig(llm_perf_df)
     # create plot

 SCORE_MEMORY_LATENCY_DATA = [
     "Model 🤗",
     "DType 📥",
     "Backend 🏭",
+    "Params (B)",
+    "Architecture 🏛️",
     "Optimization 🛠️",
     "Quantization 🗜️",
     "Open LLM Score (%)",
+    "Prefill (s)",
+    "Decode (tokens/s)",
+    "Memory (MB)",
+    "End-to-End (s)",
+    # "End-to-End (tokens/s)",
 ]
     # plot
     fig = px.scatter(
         copy_df,
+        x="End-to-End (s)",
         y="Open LLM Score (%)",
+        size="Memory (MB)",
+        color="Architecture 🏛️",
         custom_data=SCORE_MEMORY_LATENCY_DATA,
         color_discrete_sequence=px.colors.qualitative.Light24,
     )
     )
     fig.update_layout(
         title={
+            "text": "vs. Score vs. Memory",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
 def create_lat_score_mem_plot(llm_perf_df):
     # descriptive text
+    gr.HTML("👆 Hover over the points 👆 for additional information. ", elem_id="text")
     # get figure
     fig = get_lat_score_mem_fig(llm_perf_df)
     # create plot

src/leaderboard.py CHANGED Viewed

@@ -8,9 +8,9 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
     "Model 🤗": "markdown",
     "Experiment 🧪": "str",
     # primary measurements
-    "Prefill Latency (s)": "number",
-    "Decode Throughput (tokens/s)": "number",
-    "Allocated Memory (MB)": "number",
     "Energy (tokens/kWh)": "number",
     # deployment settings
     "DType 📥": "str",
@@ -18,15 +18,25 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
     "Optimization 🛠️": "str",
     "Quantization 🗜️": "str",
     # additional measurements
-    "Arch 🏛️": "markdown",
     "Params (B)": "number",
     "Open LLM Score (%)": "number",
-    "E2E Latency (s)": "number",
-    "E2E Throughput (tokens/s)": "number",
     "Reserved Memory (MB)": "number",
     "Used Memory (MB)": "number",
 }
 def process_model(model_name):
     link = f"https://huggingface.co/{model_name}"
@@ -48,20 +58,29 @@ def get_leaderboard_df(llm_perf_df):
 def create_leaderboard_table(llm_perf_df):
     # get dataframe
     leaderboard_df = get_leaderboard_df(llm_perf_df)
     # create checkboxes
-    columns_checkboxes = gr.CheckboxGroup(
-        label="Columns 📊",
-        choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
-        value=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
-        info="☑️ Select the columns to display",
-        elem_id="columns-checkboxes",
-    )
     # create table
     leaderboard_table = gr.components.Dataframe(
-        value=leaderboard_df,
         datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
         headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
         elem_id="leaderboard-table",
     )
-    return leaderboard_table, columns_checkboxes

     "Model 🤗": "markdown",
     "Experiment 🧪": "str",
     # primary measurements
+    "Prefill (s)": "number",
+    "Decode (tokens/s)": "number",
+    "Memory (MB)": "number",
     "Energy (tokens/kWh)": "number",
     # deployment settings
     "DType 📥": "str",
     "Optimization 🛠️": "str",
     "Quantization 🗜️": "str",
     # additional measurements
+    "Architecture 🏛️": "markdown",
     "Params (B)": "number",
     "Open LLM Score (%)": "number",
+    "End-to-End (s)": "number",
+    "End-to-End (tokens/s)": "number",
     "Reserved Memory (MB)": "number",
     "Used Memory (MB)": "number",
 }
+PRIMARY_COLUMNS = [
+    "Model 🤗",
+    "Experiment 🧪",
+    "Prefill (s)",
+    "Decode (tokens/s)",
+    "Memory (MB)",
+    "Energy (tokens/kWh)",
+    "Open LLM Score (%)",
+]
 def process_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 def create_leaderboard_table(llm_perf_df):
     # get dataframe
     leaderboard_df = get_leaderboard_df(llm_perf_df)
+    # create search bar
+    with gr.Row():
+        search_bar = gr.Textbox(
+            label="Model 🤗",
+            info="🔍 Search for a model name",
+            elem_id="search-bar",
+        )
     # create checkboxes
+    with gr.Row():
+        columns_checkboxes = gr.CheckboxGroup(
+            label="Columns 📊",
+            value=PRIMARY_COLUMNS,
+            choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+            info="☑️ Select the columns to display",
+            elem_id="columns-checkboxes",
+        )
     # create table
     leaderboard_table = gr.components.Dataframe(
+        value=leaderboard_df[PRIMARY_COLUMNS],
         datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
         headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
         elem_id="leaderboard-table",
     )
+    return search_bar, columns_checkboxes, leaderboard_table

src/llm_perf.py CHANGED Viewed

@@ -12,9 +12,9 @@ COLUMNS_MAPPING = {
     "Model": "Model 🤗",
     "experiment_name": "Experiment 🧪",
     # primary measurements
-    "forward.latency(s)": "Prefill Latency (s)",
-    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
-    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
     # deployment settings
     "backend.name": "Backend 🏭",
@@ -22,18 +22,18 @@ COLUMNS_MAPPING = {
     "optimization": "Optimization 🛠️",
     "quantization": "Quantization 🗜️",
     # additional measurements
-    "Arch": "Arch 🏛️",
     "Size": "Params (B)",
     "Score": "Open LLM Score (%)",
-    "generate.latency(s)": "E2E Latency (s)",
-    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
     "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
     "generate.max_memory_used(MB)": "Used Memory (MB)",
 }
 SORTING_COLUMNS = [
     "Open LLM Score (%)",
-    "Prefill Latency (s)",
-    "Decode Throughput (tokens/s)",
 ]
 SORTING_ASCENDING = [False, True, False]
@@ -107,6 +107,13 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
     ].apply(lambda x: process_quantization_scheme(x), axis=1)
     # process experiment name
     llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
     # add arch
     llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # filter columns

     "Model": "Model 🤗",
     "experiment_name": "Experiment 🧪",
     # primary measurements
+    "forward.latency(s)": "Prefill (s)",
+    "decode.throughput(tokens/s)": "Decode (tokens/s)",
+    "generate.max_memory_allocated(MB)": "Memory (MB)",
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
     # deployment settings
     "backend.name": "Backend 🏭",
     "optimization": "Optimization 🛠️",
     "quantization": "Quantization 🗜️",
     # additional measurements
     "Size": "Params (B)",
+    "Arch": "Architecture 🏛️",
     "Score": "Open LLM Score (%)",
+    "generate.latency(s)": "End-to-End (s)",
+    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
     "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
     "generate.max_memory_used(MB)": "Used Memory (MB)",
 }
 SORTING_COLUMNS = [
     "Open LLM Score (%)",
+    "Decode (tokens/s)",
+    "Prefill (s)",
 ]
 SORTING_ASCENDING = [False, True, False]
     ].apply(lambda x: process_quantization_scheme(x), axis=1)
     # process experiment name
     llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
+        lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
+    )
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "awq"))
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "gptq"))
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "sdpa"))
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA2"))
     # add arch
     llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # filter columns

src/quantization_kernels.py CHANGED Viewed

@@ -6,10 +6,10 @@ import plotly.express as px
 QUANT_DATA = [
     # open llm
     "Model 🤗",
-    "Arch 🏛️",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
@@ -19,13 +19,13 @@ QUANT_DATA = [
     "Optimization 🛠️ Custom Kernel",
     "Quantization 🗜️ Custom Kernel",
     # primary measurements
-    "Prefill Latency (s)",
-    "Prefill Latency (s) Custom Kernel",
-    "Decode Throughput (tokens/s)",
-    "Decode Throughput (tokens/s) Custom Kernel",
     # speedups
-    "Prefill Latency Speedup (%)",
-    "Decode Throughput Speedup (%)",
 ]
@@ -33,10 +33,10 @@ def get_quant_df(llm_perf_df):
     copy_df = llm_perf_df.copy()
     # separate vanilla GPTQ experiments from Custom Kernel experiments
     vanilla_df = copy_df[
-        (copy_df["Backend 🏭"] == "pytorch") &
-        (copy_df["Quantization 🗜️"] == "None") &
-        (copy_df["Optimization 🛠️"] == "None") &
-        (copy_df["DType 📥"] == "float16")
     ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
@@ -70,15 +70,15 @@ def get_quant_df(llm_perf_df):
     # concat the four dataframes row-wise
     quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
-    quant_df["Prefill Latency Speedup (%)"] = (
-        (quant_df["Prefill Latency (s)"] / quant_df["Prefill Latency (s) Custom Kernel"]) * 100
-    ).round(2) - 100
-    quant_df["Decode Throughput Speedup (%)"] = (
-        (quant_df["Decode Throughput (tokens/s) Custom Kernel"] / quant_df["Decode Throughput (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
-    quant_df = quant_df[quant_df["Prefill Latency Speedup (%)"] < 1000]
-    quant_df = quant_df[quant_df["Decode Throughput Speedup (%)"] < 1000]
     return quant_df
@@ -88,8 +88,8 @@ def get_quant_decode_fig(llm_perf_df):
     # plot
     decode_fig = px.box(
         quant_df,
-        x="Arch 🏛️",
-        y="Decode Throughput Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=QUANT_DATA,
         color="Quantization 🗜️ Custom Kernel",
@@ -102,7 +102,7 @@ def get_quant_decode_fig(llm_perf_df):
     # add layout
     decode_fig.update_layout(
         title={
-            "text": "Decode Throughput Speedup per Architecture",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
@@ -123,8 +123,8 @@ def get_quant_prefill_fig(llm_perf_df):
     # plot
     prefill_fig = px.box(
         quant_df,
-        x="Arch 🏛️",
-        y="Prefill Latency Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=QUANT_DATA,
         color="Quantization 🗜️ Custom Kernel",
@@ -137,7 +137,7 @@ def get_quant_prefill_fig(llm_perf_df):
     # add layout
     prefill_fig.update_layout(
         title={
-            "text": "Prefill Latency Speedup per Architecture",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",

 QUANT_DATA = [
     # open llm
     "Model 🤗",
     "DType 📥",
     "Backend 🏭",
     "Params (B)",
+    "Architecture 🏛️",
     "Open LLM Score (%)",
     # deployment settings
     "DType 📥",
     "Optimization 🛠️ Custom Kernel",
     "Quantization 🗜️ Custom Kernel",
     # primary measurements
+    "Prefill (s)",
+    "Prefill (s) Custom Kernel",
+    "Decode (tokens/s)",
+    "Decode (tokens/s) Custom Kernel",
     # speedups
+    "Prefill Speedup (%)",
+    "Decode Speedup (%)",
 ]
     copy_df = llm_perf_df.copy()
     # separate vanilla GPTQ experiments from Custom Kernel experiments
     vanilla_df = copy_df[
+        (copy_df["Backend 🏭"] == "pytorch")
+        & (copy_df["Quantization 🗜️"] == "None")
+        & (copy_df["Optimization 🛠️"] == "None")
+        & (copy_df["DType 📥"] == "float16")
     ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     # concat the four dataframes row-wise
     quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
+    quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
+        2
+    ) - 100
+    quant_df["Decode Speedup (%)"] = (
+        (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
+    quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
+    quant_df = quant_df[quant_df["Decode Speedup (%)"] < 1000]
     return quant_df
     # plot
     decode_fig = px.box(
         quant_df,
+        x="Architecture 🏛️",
+        y="Decode Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=QUANT_DATA,
         color="Quantization 🗜️ Custom Kernel",
     # add layout
     decode_fig.update_layout(
         title={
+            "text": "Decode Speedup per Architecture",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",
     # plot
     prefill_fig = px.box(
         quant_df,
+        x="Architecture 🏛️",
+        y="Prefill Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
         custom_data=QUANT_DATA,
         color="Quantization 🗜️ Custom Kernel",
     # add layout
     prefill_fig.update_layout(
         title={
+            "text": "Prefill Speedup per Architecture",
             "y": 0.95,
             "x": 0.5,
             "xanchor": "center",