Lj Miranda
committed on
Add plots in the leaderboard (#5)
Browse files- app.py +29 -5
- requirements.txt +2 -1
- src/plots.py +132 -0
- src/schema.py +1 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import HfApi
|
|
11 |
from src import about
|
12 |
from src.display.css_html_js import custom_css
|
13 |
from src.schema import AutoEvalColumn, EvalResult, fields
|
|
|
14 |
|
15 |
# 1. Initialization
|
16 |
_hf_token = os.environ.get("HF_TOKEN")
|
@@ -95,7 +96,7 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
|
95 |
)
|
96 |
|
97 |
|
98 |
-
def
|
99 |
df, _ = get_results(source=REPO_RESULTS, aggregate=False)
|
100 |
df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
|
101 |
|
@@ -149,11 +150,14 @@ def download_results():
|
|
149 |
df_agg = df_agg.rename(
|
150 |
columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
|
151 |
)
|
152 |
-
|
153 |
-
# Combine the full and aggregated results
|
154 |
df_merge = df.merge(df_agg, on="Model")
|
|
|
|
|
|
|
|
|
|
|
155 |
filepath = "filbench_results.csv"
|
156 |
-
|
157 |
return filepath
|
158 |
|
159 |
|
@@ -175,7 +179,27 @@ with demo:
|
|
175 |
):
|
176 |
leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
|
177 |
|
178 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
180 |
|
181 |
with gr.Row():
|
|
|
11 |
from src import about
|
12 |
from src.display.css_html_js import custom_css
|
13 |
from src.schema import AutoEvalColumn, EvalResult, fields
|
14 |
+
from src.plots import plot_parameter_efficiency, plot_cost_efficiency
|
15 |
|
16 |
# 1. Initialization
|
17 |
_hf_token = os.environ.get("HF_TOKEN")
|
|
|
96 |
)
|
97 |
|
98 |
|
99 |
+
def get_clean_df() -> pd.DataFrame:
|
100 |
df, _ = get_results(source=REPO_RESULTS, aggregate=False)
|
101 |
df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
|
102 |
|
|
|
150 |
df_agg = df_agg.rename(
|
151 |
columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
|
152 |
)
|
|
|
|
|
153 |
df_merge = df.merge(df_agg, on="Model")
|
154 |
+
return df_merge
|
155 |
+
|
156 |
+
|
157 |
+
def download_results():
    """Serialize the merged (full + aggregated) leaderboard results to CSV.

    Returns:
        str: Path of the written CSV file, handed to Gradio's download widget.
    """
    filepath = "filbench_results.csv"
    # Fetch the combined results frame and persist it without the index column.
    get_clean_df().to_csv(filepath, index=False)
    return filepath
|
162 |
|
163 |
|
|
|
179 |
):
|
180 |
leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
|
181 |
|
182 |
+
with gr.TabItem("📊 Analysis", id=2):
|
183 |
+
df = get_clean_df()
|
184 |
+
with gr.Row():
|
185 |
+
with gr.Column():
|
186 |
+
gr.Markdown("## Parameter-Efficiency Plot")
|
187 |
+
plot_parameter_efficiency(df)
|
188 |
+
gr.Markdown(
|
189 |
+
"Model performance on FilBench with respect to their parameter size. "
|
190 |
+
"For mixture-of-experts models, we plot their full parameter count. "
|
191 |
+
"In general, we find that model size and performance are positively correlated."
|
192 |
+
)
|
193 |
+
with gr.Column():
|
194 |
+
gr.Markdown("## Cost-Efficiency Plot")
|
195 |
+
plot_cost_efficiency(df)
|
196 |
+
gr.Markdown(
|
197 |
+
"Model performance on FilBench with respect to their per-token output cost ($/1M tokens). "
|
198 |
+
"We use the token-pricing as published in [OpenRouter](https://openrouter.ai/models). "
|
199 |
+
"For models not in OpenRouter, we either exclude them from the chart or use the cost of the base model it was finetuned from."
|
200 |
+
)
|
201 |
+
|
202 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
203 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
204 |
|
205 |
with gr.Row():
|
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ sentencepiece
|
|
16 |
tokenizers>=0.15.0
|
17 |
tqdm
|
18 |
transformers
|
19 |
-
pytz
|
|
|
|
16 |
tokenizers>=0.15.0
|
17 |
tqdm
|
18 |
transformers
|
19 |
+
pytz
|
20 |
+
plotly
|
src/plots.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
|
6 |
+
def plot_parameter_efficiency(df) -> gr.Plot:
    """Scatter plot of FilBench score against model parameter count.

    Rows whose parameter count is unknown (encoded as -1 in the registry)
    are dropped before plotting. Returns a ``gr.Plot`` wrapping the figure.
    """
    plot_cols = ["Model", "Average", "# Parameters", "Multilingual"]
    data = df[plot_cols]
    # -1 is the sentinel for "parameter count unknown"; exclude those models.
    data = data[data["# Parameters"] != -1]

    fig = px.scatter(
        data,
        x="# Parameters",
        y="Average",
        color="Multilingual",
        hover_name="Model",
        hover_data={"Average": ":.1f", "# Parameters": ":.0f"},
        labels={
            "Average": "FilBench Score",
            "# Parameters": "Number of Parameters (B)",
        },
        # Fixed 700x500 canvas keeps the chart roughly square.
        width=700,
        height=500,
    )

    font_opts = dict(
        title_font_size=20,
        legend_title_font_size=16,
        legend_font_size=14,
        xaxis_title_font_size=16,
        yaxis_title_font_size=16,
        xaxis_tickfont_size=14,
        yaxis_tickfont_size=14,
    )
    grid_opts = dict(
        plot_bgcolor="white",
        xaxis_showgrid=True,
        yaxis_showgrid=True,
        xaxis_gridcolor="lightgray",
        yaxis_gridcolor="lightgray",
    )
    fig.update_layout(
        legend_title_text="Model Type",
        autosize=False,
        # Scores are percentages, so fix the y-axis to the full 0-100 range.
        yaxis_range=[0, 100],
        # Pin the legend inside the plot area, top-left corner.
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        **font_opts,
        **grid_opts,
    )

    # Uniform marker size with a dark outline for contrast on white.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )

    return gr.Plot(fig, container=False)
|
55 |
+
|
56 |
+
|
57 |
+
def plot_cost_efficiency(df) -> gr.Plot:
    """Scatter plot of FilBench score against per-token output cost.

    Prices ($/1M output tokens) come from the hard-coded ``MODEL_PRICES``
    table (sourced from OpenRouter). The price table is left-merged with the
    results frame, so only priced models are plotted; priced models absent
    from ``df`` yield NaN scores, which Plotly silently omits from the chart.
    The x-axis is log-scaled. Returns a ``gr.Plot`` wrapping the figure.
    """
    # $/1M output tokens, as published on OpenRouter (or the base model's
    # price for finetunes not listed there).
    MODEL_PRICES = {
        "gpt-4o-2024-08-06": 10,
        "gpt-4o-mini": 0.6,
        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": 0.6,
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.3,
        "meta-llama/Llama-3.1-70B-Instruct": 0.28,
        "meta-llama/Llama-3.1-8B-Instruct": 0.03,
        "Qwen/Qwen2.5-72B-Instruct": 0.39,
        "Qwen/Qwen2.5-7B-Instruct": 0.1,
        "google/gemma-3-27b-it": 0.2,
        "google/gemma-2-27b-it": 0.3,
        "google/gemma-2-9b-it": 0.06,
        "mistralai/Ministral-8B-Instruct-2410": 0.1,
        "mistralai/Mixtral-8x22B-Instruct-v0.1": 1.2,
        "aisingapore/Llama-SEA-LION-v3-70B-IT": 0.28,
        "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": 0.06,
        "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": 0.03,
    }

    df = df[["Model", "Average", "# Parameters", "Multilingual"]]

    # Reshape the price dict into a two-column frame: Model, Price-per-token.
    price_df = (
        pd.DataFrame([MODEL_PRICES])
        .T.reset_index()
        .rename(columns={"index": "Model", 0: "Price-per-token"})
    )
    # Left merge keeps exactly the priced models (unpriced ones are excluded).
    df = price_df.merge(df, on="Model", how="left")
    fig = px.scatter(
        df,
        x="Price-per-token",
        y="Average",
        color="Multilingual",
        hover_name="Model",
        hover_data={"Price-per-token": ":.1f", "# Parameters": ":.0f"},
        labels={
            "Average": "FilBench Score",
            "Price-per-token": "Price-per-token ($/1M output tokens), log scale",
        },
        # Fixed 700x500 canvas keeps the chart roughly square.
        width=700,
        height=500,
        log_x=True,
    )

    # Customize layout
    fig.update_layout(
        # Font sizes
        title_font_size=20,
        legend_title_font_size=16,
        legend_title_text="Model Type",
        legend_font_size=14,
        xaxis_title_font_size=16,
        yaxis_title_font_size=16,
        xaxis_tickfont_size=14,
        yaxis_tickfont_size=14,
        # Square aspect ratio
        autosize=False,
        # Scores are percentages, so fix the y-axis to the full 0-100 range.
        yaxis_range=[0, 100],
        plot_bgcolor="white",
        xaxis_showgrid=True,
        yaxis_showgrid=True,
        xaxis_gridcolor="lightgray",
        yaxis_gridcolor="lightgray",
        # Legend pinned inside the plot area, top-left corner.
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    # Marker size and style: uniform size with a dark outline for contrast.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )

    return gr.Plot(fig, container=False)
|
src/schema.py
CHANGED
@@ -41,6 +41,7 @@ class ModelSUT:
|
|
41 |
model_registry = {
|
42 |
# fmt: off
|
43 |
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
|
|
44 |
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
45 |
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
46 |
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
|
|
41 |
model_registry = {
|
42 |
# fmt: off
|
43 |
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
44 |
+
"gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
45 |
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
46 |
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
47 |
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|