Lj Miranda committed
Commit e2c374a · unverified · 1 Parent(s): 380beab

Add plots in the leaderboard (#5)

Files changed (4):
  1. app.py +29 -5
  2. requirements.txt +2 -1
  3. src/plots.py +132 -0
  4. src/schema.py +1 -0
app.py CHANGED

```diff
@@ -11,6 +11,7 @@ from huggingface_hub import HfApi
 from src import about
 from src.display.css_html_js import custom_css
 from src.schema import AutoEvalColumn, EvalResult, fields
+from src.plots import plot_parameter_efficiency, plot_cost_efficiency
 
 # 1. Initialization
 _hf_token = os.environ.get("HF_TOKEN")
@@ -95,7 +96,7 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
     )
 
 
-def download_results():
+def get_clean_df() -> pd.DataFrame:
     df, _ = get_results(source=REPO_RESULTS, aggregate=False)
     df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
 
@@ -149,11 +150,14 @@ def download_results():
     df_agg = df_agg.rename(
         columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
     )
-
-    # Combine the full and aggregated results
     df_merge = df.merge(df_agg, on="Model")
+    return df_merge
+
+
+def download_results():
+    df = get_clean_df()
     filepath = "filbench_results.csv"
-    df_merge.to_csv(filepath, index=False)
+    df.to_csv(filepath, index=False)
     return filepath
 
 
@@ -175,7 +179,27 @@ with demo:
     ):
         leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
 
-    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+    with gr.TabItem("📊 Analysis", id=2):
+        df = get_clean_df()
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Parameter-Efficiency Plot")
+                plot_parameter_efficiency(df)
+                gr.Markdown(
+                    "Model performance on FilBench with respect to parameter size. "
+                    "For mixture-of-experts models, we plot their full parameter count. "
+                    "In general, we find that model size and performance are positively correlated."
+                )
+            with gr.Column():
+                gr.Markdown("## Cost-Efficiency Plot")
+                plot_cost_efficiency(df)
+                gr.Markdown(
+                    "Model performance on FilBench with respect to per-token output cost ($/1M tokens). "
+                    "We use the token pricing as published in [OpenRouter](https://openrouter.ai/models). "
+                    "For models not in OpenRouter, we either exclude them from the chart or use the cost of the base model they were finetuned from."
+                )
+
+    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
         gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
```
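The refactor above splits the old `download_results` into two steps: `get_clean_df` prefixes every non-key column of the aggregated frame with `agg_` before merging on `Model`, which keeps the two result frames from colliding on shared column names. A minimal sketch of that rename-then-merge pattern on made-up data (the column values below are illustrative, not the leaderboard's real schema):

```python
import pandas as pd

# Toy stand-ins for get_results(aggregate=False) and get_results(aggregate=True)
df = pd.DataFrame({"Model": ["m1", "m2"], "Average": [71.0, 64.5]})
df_agg = pd.DataFrame({"Model": ["m1", "m2"], "Average": [68.2, 61.9]})

# Prefix every non-key column so the merge cannot produce duplicate names
df_agg = df_agg.rename(
    columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
)
df_merge = df.merge(df_agg, on="Model")
print(df_merge.columns.tolist())  # ['Model', 'Average', 'agg_Average']
```

Without the rename, pandas would disambiguate the clashing columns as `Average_x`/`Average_y`, which is what the `agg_` prefix avoids.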
requirements.txt CHANGED

```diff
@@ -16,4 +16,5 @@ sentencepiece
 tokenizers>=0.15.0
 tqdm
 transformers
-pytz
+pytz
+plotly
```
src/plots.py ADDED

```diff
@@ -0,0 +1,132 @@
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+
+
+def plot_parameter_efficiency(df) -> gr.Plot:
+    df = df[["Model", "Average", "# Parameters", "Multilingual"]]
+    df = df[df["# Parameters"] != -1]
+    fig = px.scatter(
+        df,
+        x="# Parameters",
+        y="Average",
+        color="Multilingual",
+        hover_name="Model",
+        hover_data={"Average": ":.1f", "# Parameters": ":.0f"},
+        labels={
+            "Average": "FilBench Score",
+            "# Parameters": "Number of Parameters (B)",
+        },
+        width=700,
+        height=500,  # Makes it square
+    )
+
+    # Customize layout
+    fig.update_layout(
+        # Font sizes
+        title_font_size=20,
+        legend_title_font_size=16,
+        legend_title_text="Model Type",
+        legend_font_size=14,
+        xaxis_title_font_size=16,
+        yaxis_title_font_size=16,
+        xaxis_tickfont_size=14,
+        yaxis_tickfont_size=14,
+        # Square aspect ratio
+        autosize=False,
+        # Axis limits and grid
+        yaxis_range=[0, 100],
+        plot_bgcolor="white",
+        xaxis_showgrid=True,
+        yaxis_showgrid=True,
+        xaxis_gridcolor="lightgray",
+        yaxis_gridcolor="lightgray",
+        # Legend position
+        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
+    )
+
+    # Marker size and style
+    fig.update_traces(
+        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
+        selector=dict(mode="markers"),
+    )
+
+    return gr.Plot(fig, container=False)
+
+
+def plot_cost_efficiency(df) -> gr.Plot:
+    MODEL_PRICES = {
+        "gpt-4o-2024-08-06": 10,
+        "gpt-4o-mini": 0.6,
+        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": 0.6,
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.3,
+        "meta-llama/Llama-3.1-70B-Instruct": 0.28,
+        "meta-llama/Llama-3.1-8B-Instruct": 0.03,
+        "Qwen/Qwen2.5-72B-Instruct": 0.39,
+        "Qwen/Qwen2.5-7B-Instruct": 0.1,
+        "google/gemma-3-27b-it": 0.2,
+        "google/gemma-2-27b-it": 0.3,
+        "google/gemma-2-9b-it": 0.06,
+        "mistralai/Ministral-8B-Instruct-2410": 0.1,
+        "mistralai/Mixtral-8x22B-Instruct-v0.1": 1.2,
+        "aisingapore/Llama-SEA-LION-v3-70B-IT": 0.28,
+        "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": 0.06,
+        "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": 0.03,
+    }
+
+    df = df[["Model", "Average", "# Parameters", "Multilingual"]]
+
+    price_df = (
+        pd.DataFrame([MODEL_PRICES])
+        .T.reset_index()
+        .rename(columns={"index": "Model", 0: "Price-per-token"})
+    )
+    df = price_df.merge(df, on="Model", how="left")
+    # df = df[df["# Parameters"] <= 399]
+    fig = px.scatter(
+        df,
+        x="Price-per-token",
+        y="Average",
+        color="Multilingual",
+        hover_name="Model",
+        hover_data={"Price-per-token": ":.1f", "# Parameters": ":.0f"},
+        labels={
+            "Average": "FilBench Score",
+            "Price-per-token": "Price-per-token ($/1M output tokens), log scale",
+        },
+        width=700,
+        height=500,  # Makes it square
+        log_x=True,
+    )
+
+    # Customize layout
+    fig.update_layout(
+        # Font sizes
+        title_font_size=20,
+        legend_title_font_size=16,
+        legend_title_text="Model Type",
+        legend_font_size=14,
+        xaxis_title_font_size=16,
+        yaxis_title_font_size=16,
+        xaxis_tickfont_size=14,
+        yaxis_tickfont_size=14,
+        # Square aspect ratio
+        autosize=False,
+        # Axis limits and grid
+        yaxis_range=[0, 100],
+        plot_bgcolor="white",
+        xaxis_showgrid=True,
+        yaxis_showgrid=True,
+        xaxis_gridcolor="lightgray",
+        yaxis_gridcolor="lightgray",
+        # Legend position
+        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
+    )
+
+    # Marker size and style
+    fig.update_traces(
+        marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
+        selector=dict(mode="markers"),
+    )
+
+    return gr.Plot(fig, container=False)
```
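Both helpers expect a leaderboard DataFrame with at least the columns `Model`, `Average`, `# Parameters`, and `Multilingual`, and each returns a `gr.Plot`, so they render inline when called inside a Blocks context. A minimal sketch of exercising them outside the Space; the sample rows are made up, and the real values come from `get_clean_df()` in app.py:

```python
import gradio as gr
import pandas as pd

from src.plots import plot_cost_efficiency, plot_parameter_efficiency

# Hypothetical sample rows mirroring the expected leaderboard schema
df = pd.DataFrame(
    {
        "Model": ["meta-llama/Llama-3.1-8B-Instruct", "Qwen/Qwen2.5-72B-Instruct"],
        "Average": [55.3, 72.1],
        "# Parameters": [8, 72],
        "Multilingual": ["Multilingual", "Multilingual"],
    }
)

with gr.Blocks() as demo:
    # Each call builds a Plotly figure and registers it as a gr.Plot component
    plot_parameter_efficiency(df)
    plot_cost_efficiency(df)

demo.launch()
```

Note that `plot_cost_efficiency` merges with its hard-coded `MODEL_PRICES` table as the left frame, so only models with a known price end up in the cost chart.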
src/schema.py CHANGED

```diff
@@ -41,6 +41,7 @@ class ModelSUT:
 model_registry = {
     # fmt: off
     "gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
+    "gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
     "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
     "aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
     "aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
```