llm-perf-leaderboard

Sleeping

App Files Files Community

BenchmarkBot committed on Jul 7, 2023

Commit

8e8c463

1 Parent(s): 67cbded

added plot

Browse files

Files changed (3) hide show

app.py +106 -39
src/assets/text_content.py +1 -1
src/utils.py +2 -1

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import gradio as gr
 import pandas as pd
@@ -16,9 +17,9 @@ COLUMNS_MAPPING = {
     "model": "Model 🤗",
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Datatype 📥",
-    "average": "Average H4 Score ⬆️",
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
 }
 COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
@@ -33,16 +34,14 @@ def get_benchmark_df(benchmark):
     # load
     bench_df = pd.read_csv(
-        f"./llm-perf-dataset/reports/{benchmark}/inference_report.csv")
     scores_df = pd.read_csv(
-        f"./llm-perf-dataset/reports/average_scores.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
-    bench_df["average"] = bench_df["average"].apply(
-        make_clickable_score)
     # preprocess
     bench_df["model"] = bench_df["model"].apply(make_clickable_model)
     # filter
     bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
     # rename
@@ -53,55 +52,98 @@ def get_benchmark_df(benchmark):
     return bench_df
-def submit_query(text, backends, datatypes, threshold, raw_df):
-    # extract the average score (float) from the clickable score (clickable markdown)
-    raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
-        extract_score_from_clickable)
-    filtered_df = raw_df[
-        raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
-        raw_df["Backend 🏭"].isin(backends) &
-        raw_df["Datatype 📥"].isin(datatypes) &
-        (raw_df["Average H4 Score ⬆️"] >= threshold)
-    ]
-    filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
-        make_clickable_score)
-    return filtered_df
-# Define demo interface
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    # controls
     with gr.Row():
         search_bar = gr.Textbox(
             label="Model 🤗",
-            info="Search for a model name",
             elem_id="search-bar",
         )
         backend_checkboxes = gr.CheckboxGroup(
             label="Backends 🏭",
             choices=["pytorch", "onnxruntime"],
             value=["pytorch", "onnxruntime"],
-            info="Select the backends",
             elem_id="backend-checkboxes",
         )
         datatype_checkboxes = gr.CheckboxGroup(
             label="Datatypes 📥",
             choices=["float32", "float16"],
             value=["float32", "float16"],
-            info="Select the load datatypes",
             elem_id="datatype-checkboxes",
         )
-    with gr.Row():
         threshold_slider = gr.Slider(
             label="Average H4 Score 📈",
-            info="Filter by minimum average H4 score",
             value=0.0,
             elem_id="threshold-slider",
         )
@@ -109,16 +151,14 @@ with demo:
     with gr.Row():
         submit_button = gr.Button(
             value="Submit 🚀",
-            info="Submit the filters",
             elem_id="submit-button",
         )
     # leaderboard tabs
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
             gr.HTML(SINGLE_A100_TEXT)
-            single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
             # Original leaderboard table
             single_A100_leaderboard = gr.components.Dataframe(
                 value=single_A100_df,
@@ -135,15 +175,15 @@ with demo:
                 visible=False,
             )
-        # Callbacks
-        submit_button.click(
-            submit_query,
-            [
-                search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
-                single_A100_for_search
-            ],
-            [single_A100_leaderboard]
-        )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -153,6 +193,33 @@ with demo:
                 elem_id="citation-button",
             ).style(show_copy_button=True)
 # Restart space every hour
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600,

+import plotly.express as px
 import os
 import gradio as gr
 import pandas as pd
     "model": "Model 🤗",
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Datatype 📥",
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
+    "h4_score": "H4 Score ⬆️",
 }
 # One display datatype per COLUMNS_MAPPING entry, in the same order as the
 # mapping: model (markdown link), backend, datatype, peak memory, throughput,
 # and finally the H4 score, which is rendered as a clickable markdown link.
 COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "markdown"]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
     # load
     bench_df = pd.read_csv(
+        f"./llm-perf-dataset/reports/{benchmark}.csv")
     scores_df = pd.read_csv(
+        f"./llm-perf-dataset/reports/additional_data.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
     # preprocess
     bench_df["model"] = bench_df["model"].apply(make_clickable_model)
+    bench_df["h4_score"] = bench_df["h4_score"].apply(make_clickable_score)
     # filter
     bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
     # rename
     return bench_df
+# Dataframes
+single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
+def get_benchmark_plot(benchmark):
+    if llm_perf_dataset_repo:
+        llm_perf_dataset_repo.git_pull()
+    # load
+    bench_df = pd.read_csv(
+        f"./llm-perf-dataset/reports/{benchmark}.csv")
+    scores_df = pd.read_csv(
+        f"./llm-perf-dataset/reports/additional_data.csv")
+    bench_df = bench_df.merge(scores_df, on="model", how="left")
+    fig = px.scatter(
+        bench_df, x="h4_score", y="generate.latency(s)",
+        color='model_type', symbol='backend.name', size='forward.peak_memory(MB)',
+        custom_data=['model', 'backend.name', 'backend.torch_dtype',
+                     'forward.peak_memory(MB)', 'generate.throughput(tokens/s)'],
+    )
+    fig.update_traces(
+        title={
+            'text': "Model Score vs. Latency vs. Memory",
+            'y': 0.95, 'x': 0.5,
+            'xanchor': 'center',
+            'yanchor': 'top'
+        },
+        xaxis_title="Average H4 Score",
+        yaxis_title="Latency per 1000 Tokens (s)",
+        legend_title="Model Type",
+        legend=dict(
+            orientation="h",
+            yanchor="middle",
+            xanchor="center",
+            y=-0.15,
+            x=0.5
+        ),
+        hovertemplate="<br>".join([
+            "Model: %{customdata[0]}",
+            "Backend: %{customdata[1]}",
+            "Datatype: %{customdata[2]}",
+            "Peak Memory (MB): %{customdata[3]}",
+            "Throughput (tokens/s): %{customdata[4]}",
+            "Latency per 1000 Tokens (s): %{y}",
+            "Average H4 Score: %{x}"
+        ])
+    )
+    return fig
+# Plots
+single_A100_plot = get_benchmark_plot(benchmark="1xA100-80GB")
+# Demo interface
 demo = gr.Blocks(css=custom_css)
 with demo:
+    # leaderboard title
     gr.HTML(TITLE)
+    # introduction text
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # control panel title
+    gr.HTML("<h2>Control Panel 🎛️</h2>")
+    # control panel interface
     with gr.Row():
         search_bar = gr.Textbox(
             label="Model 🤗",
+            info="🔍 Search for a model name",
             elem_id="search-bar",
         )
         backend_checkboxes = gr.CheckboxGroup(
             label="Backends 🏭",
             choices=["pytorch", "onnxruntime"],
             value=["pytorch", "onnxruntime"],
+            info="☑️ Select the backends",
             elem_id="backend-checkboxes",
         )
         datatype_checkboxes = gr.CheckboxGroup(
             label="Datatypes 📥",
             choices=["float32", "float16"],
             value=["float32", "float16"],
+            info="☑️ Select the load datatypes",
             elem_id="datatype-checkboxes",
         )
         threshold_slider = gr.Slider(
             label="Average H4 Score 📈",
+            info="Filter by minimum average H4 score",
             value=0.0,
             elem_id="threshold-slider",
         )
     with gr.Row():
         submit_button = gr.Button(
             value="Submit 🚀",
             elem_id="submit-button",
         )
     # leaderboard tabs
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🖥️ A100-80GB Leaderboard 🏆", id=0):
             gr.HTML(SINGLE_A100_TEXT)
             # Original leaderboard table
             single_A100_leaderboard = gr.components.Dataframe(
                 value=single_A100_df,
                 visible=False,
             )
+        with gr.TabItem("🖥️ A100-80GB Plot 📈", id=1):
+            # Original leaderboard plot
+            gr.HTML(SINGLE_A100_TEXT)
+            single_A100_plotly = gr.components.Plot(
+                value=single_A100_plot,
+                elem_id="1xA100-plot",
+                show_label=False,
+            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
                 elem_id="citation-button",
             ).style(show_copy_button=True)
+def submit_query(text, backends, datatypes, threshold, raw_df):
+    raw_df["H4 Score ⬆️"] = raw_df["H4 Score ⬆️"].apply(
+        extract_score_from_clickable)
+    filtered_df = raw_df[
+        raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
+        raw_df["Backend 🏭"].isin(backends) &
+        raw_df["Datatype 📥"].isin(datatypes) &
+        (raw_df["H4 Score ⬆️"] >= threshold)
+    ]
+    filtered_df["H4 Score ⬆️"] = filtered_df["H4 Score ⬆️"].apply(
+        make_clickable_score)
+    return filtered_df
+# Callbacks
+submit_button.click(
+    submit_query,
+    [
+        search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
+        single_A100_for_search
+    ],
+    [single_A100_leaderboard]
+)
 # Restart space every hour
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600,

src/assets/text_content.py CHANGED Viewed

@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware+backend+optimization
 - Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
 """
-SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
 <ul>
     <li>Singleton Batch (1)</li>
     <li>Thousand Tokens (1000)</li>

 - Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
 """
+SINGLE_A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
 <ul>
     <li>Singleton Batch (1)</li>
     <li>Thousand Tokens (1000)</li>

src/utils.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from huggingface_hub import HfApi, Repository
@@ -68,4 +69,4 @@ def make_clickable_score(score):
 def extract_score_from_clickable(clickable_score) -> float:
-    return float(clickable_score.split(">")[1].split("<")[0])

+import re
 from huggingface_hub import HfApi, Repository
 def extract_score_from_clickable(clickable_score) -> float:
+    # Return the last "digits.digits" number found in the string as a float.
+    # NOTE(review): raises IndexError when the string contains no decimal
+    # number (e.g. an integer-only or missing score) -- confirm that
+    # make_clickable_score always emits a decimal value.
+    return float(re.findall(r"\d+\.\d+", clickable_score)[-1])