Lj Miranda
committed on
Add plots in the leaderboard (#5)
Browse files- app.py +29 -5
- requirements.txt +2 -1
- src/plots.py +132 -0
- src/schema.py +1 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import HfApi
|
|
11 |
from src import about
|
12 |
from src.display.css_html_js import custom_css
|
13 |
from src.schema import AutoEvalColumn, EvalResult, fields
|
|
|
14 |
|
15 |
# 1. Initialization
|
16 |
_hf_token = os.environ.get("HF_TOKEN")
|
@@ -95,7 +96,7 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
|
95 |
)
|
96 |
|
97 |
|
98 |
-
def
|
99 |
df, _ = get_results(source=REPO_RESULTS, aggregate=False)
|
100 |
df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
|
101 |
|
@@ -149,11 +150,14 @@ def download_results():
|
|
149 |
df_agg = df_agg.rename(
|
150 |
columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
|
151 |
)
|
152 |
-
|
153 |
-
# Combine the full and aggregated results
|
154 |
df_merge = df.merge(df_agg, on="Model")
|
|
|
|
|
|
|
|
|
|
|
155 |
filepath = "filbench_results.csv"
|
156 |
-
|
157 |
return filepath
|
158 |
|
159 |
|
@@ -175,7 +179,27 @@ with demo:
|
|
175 |
):
|
176 |
leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
|
177 |
|
178 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
180 |
|
181 |
with gr.Row():
|
|
|
11 |
from src import about
|
12 |
from src.display.css_html_js import custom_css
|
13 |
from src.schema import AutoEvalColumn, EvalResult, fields
|
14 |
+
from src.plots import plot_parameter_efficiency, plot_cost_efficiency
|
15 |
|
16 |
# 1. Initialization
|
17 |
_hf_token = os.environ.get("HF_TOKEN")
|
|
|
96 |
)
|
97 |
|
98 |
|
99 |
+
def get_clean_df() -> pd.DataFrame:
|
100 |
df, _ = get_results(source=REPO_RESULTS, aggregate=False)
|
101 |
df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
|
102 |
|
|
|
150 |
df_agg = df_agg.rename(
|
151 |
columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
|
152 |
)
|
|
|
|
|
153 |
df_merge = df.merge(df_agg, on="Model")
|
154 |
+
return df_merge
|
155 |
+
|
156 |
+
|
157 |
+
def download_results():
    """Serialize the merged (full + aggregated) leaderboard results to CSV.

    Returns:
        str: Path of the written CSV file, handed to Gradio's download widget.
    """
    filepath = "filbench_results.csv"
    # Fetch the combined results frame and persist it without the index column.
    get_clean_df().to_csv(filepath, index=False)
    return filepath
|
162 |
|
163 |
|
|
|
179 |
):
|
180 |
leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
|
181 |
|
182 |
+
with gr.TabItem("📊 Analysis", id=2):
|
183 |
+
df = get_clean_df()
|
184 |
+
with gr.Row():
|
185 |
+
with gr.Column():
|
186 |
+
gr.Markdown("## Parameter-Efficiency Plot")
|
187 |
+
plot_parameter_efficiency(df)
|
188 |
+
gr.Markdown(
|
189 |
+
"Model performance on FilBench with respect to their parameter size. "
|
190 |
+
"For mixture-of-experts models, we plot their full parameter count. "
|
191 |
+
"In general, we find that model size and performance are positively correlated."
|
192 |
+
)
|
193 |
+
with gr.Column():
|
194 |
+
gr.Markdown("## Cost-Efficiency Plot")
|
195 |
+
plot_cost_efficiency(df)
|
196 |
+
gr.Markdown(
|
197 |
+
"Model performance on FilBench with respect to their per-token output cost ($/1M tokens). "
|
198 |
+
"We use the token-pricing as published in [OpenRouter](https://openrouter.ai/models). "
|
199 |
+
"For models not in OpenRouter, we either exclude them from the chart or use the cost of the base model it was finetuned from."
|
200 |
+
)
|
201 |
+
|
202 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
203 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
204 |
|
205 |
with gr.Row():
|
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ sentencepiece
|
|
16 |
tokenizers>=0.15.0
|
17 |
tqdm
|
18 |
transformers
|
19 |
-
pytz
|
|
|
|
16 |
tokenizers>=0.15.0
|
17 |
tqdm
|
18 |
transformers
|
19 |
+
pytz
|
20 |
+
plotly
|
src/plots.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
|
6 |
+
def plot_parameter_efficiency(df) -> gr.Plot:
    """Scatter plot of FilBench score against model parameter count.

    Rows whose parameter count is unknown (encoded as -1 in the registry)
    are dropped before plotting. Returns a ``gr.Plot`` wrapping the figure.
    """
    plot_cols = ["Model", "Average", "# Parameters", "Multilingual"]
    data = df[plot_cols]
    # -1 is the sentinel for "parameter count unknown"; exclude those models.
    data = data[data["# Parameters"] != -1]

    fig = px.scatter(
        data,
        x="# Parameters",
        y="Average",
        color="Multilingual",
        hover_name="Model",
        hover_data={"Average": ":.1f", "# Parameters": ":.0f"},
        labels={
            "Average": "FilBench Score",
            "# Parameters": "Number of Parameters (B)",
        },
        # Fixed 700x500 canvas keeps the chart roughly square.
        width=700,
        height=500,
    )

    font_opts = dict(
        title_font_size=20,
        legend_title_font_size=16,
        legend_font_size=14,
        xaxis_title_font_size=16,
        yaxis_title_font_size=16,
        xaxis_tickfont_size=14,
        yaxis_tickfont_size=14,
    )
    grid_opts = dict(
        plot_bgcolor="white",
        xaxis_showgrid=True,
        yaxis_showgrid=True,
        xaxis_gridcolor="lightgray",
        yaxis_gridcolor="lightgray",
    )
    fig.update_layout(
        legend_title_text="Model Type",
        autosize=False,
        # Scores are percentages, so fix the y-axis to the full 0-100 range.
        yaxis_range=[0, 100],
        # Pin the legend inside the plot area, top-left corner.
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        **font_opts,
        **grid_opts,
    )

    # Uniform marker size with a dark outline for contrast on white.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )

    return gr.Plot(fig, container=False)
|
55 |
+
|
56 |
+
|
57 |
+
def plot_cost_efficiency(df) -> gr.Plot:
    """Scatter plot of FilBench score against per-token output cost.

    Prices ($/1M output tokens) come from the hard-coded ``MODEL_PRICES``
    table (sourced from OpenRouter). The price table is left-merged with the
    results frame, so only priced models are plotted; priced models absent
    from ``df`` yield NaN scores, which Plotly silently omits from the chart.
    The x-axis is log-scaled. Returns a ``gr.Plot`` wrapping the figure.
    """
    # $/1M output tokens, as published on OpenRouter (or the base model's
    # price for finetunes not listed there).
    MODEL_PRICES = {
        "gpt-4o-2024-08-06": 10,
        "gpt-4o-mini": 0.6,
        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": 0.6,
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.3,
        "meta-llama/Llama-3.1-70B-Instruct": 0.28,
        "meta-llama/Llama-3.1-8B-Instruct": 0.03,
        "Qwen/Qwen2.5-72B-Instruct": 0.39,
        "Qwen/Qwen2.5-7B-Instruct": 0.1,
        "google/gemma-3-27b-it": 0.2,
        "google/gemma-2-27b-it": 0.3,
        "google/gemma-2-9b-it": 0.06,
        "mistralai/Ministral-8B-Instruct-2410": 0.1,
        "mistralai/Mixtral-8x22B-Instruct-v0.1": 1.2,
        "aisingapore/Llama-SEA-LION-v3-70B-IT": 0.28,
        "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": 0.06,
        "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": 0.03,
    }

    df = df[["Model", "Average", "# Parameters", "Multilingual"]]

    # Reshape the price dict into a two-column frame: Model, Price-per-token.
    price_df = (
        pd.DataFrame([MODEL_PRICES])
        .T.reset_index()
        .rename(columns={"index": "Model", 0: "Price-per-token"})
    )
    # Left merge keeps exactly the priced models (unpriced ones are excluded).
    df = price_df.merge(df, on="Model", how="left")
    fig = px.scatter(
        df,
        x="Price-per-token",
        y="Average",
        color="Multilingual",
        hover_name="Model",
        hover_data={"Price-per-token": ":.1f", "# Parameters": ":.0f"},
        labels={
            "Average": "FilBench Score",
            "Price-per-token": "Price-per-token ($/1M output tokens), log scale",
        },
        # Fixed 700x500 canvas keeps the chart roughly square.
        width=700,
        height=500,
        log_x=True,
    )

    # Customize layout
    fig.update_layout(
        # Font sizes
        title_font_size=20,
        legend_title_font_size=16,
        legend_title_text="Model Type",
        legend_font_size=14,
        xaxis_title_font_size=16,
        yaxis_title_font_size=16,
        xaxis_tickfont_size=14,
        yaxis_tickfont_size=14,
        # Square aspect ratio
        autosize=False,
        # Scores are percentages, so fix the y-axis to the full 0-100 range.
        yaxis_range=[0, 100],
        plot_bgcolor="white",
        xaxis_showgrid=True,
        yaxis_showgrid=True,
        xaxis_gridcolor="lightgray",
        yaxis_gridcolor="lightgray",
        # Legend pinned inside the plot area, top-left corner.
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    )

    # Marker size and style: uniform size with a dark outline for contrast.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )

    return gr.Plot(fig, container=False)
|
src/schema.py
CHANGED
@@ -41,6 +41,7 @@ class ModelSUT:
|
|
41 |
model_registry = {
|
42 |
# fmt: off
|
43 |
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
|
|
44 |
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
45 |
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
46 |
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
|
|
41 |
model_registry = {
|
42 |
# fmt: off
|
43 |
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
44 |
+
"gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
|
45 |
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
46 |
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|
47 |
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
|