kaizuberbuehler committed on
Commit
03738e4
·
1 Parent(s): 9ac5371

Update data of ARC-AGI and Simple Bench; Add Codeforces and PlanBench

Browse files
app.py CHANGED
@@ -159,81 +159,12 @@ def create_size_for_performance_plot(category_to_display: str,
159
  gr.Dropdown(choices=list(elo_ratings_for_category.keys()), value=model_to_compare, interactive=True))
160
 
161
 
162
- def create_arc_agi_plot() -> go.Figure:
163
- arc_agi_leaderboard = []
164
- with open("arc_agi_leaderboard.jsonl", 'r') as file:
165
- for line in file:
166
- arc_agi_leaderboard.append(json.loads(line))
167
-
168
- models = []
169
- with open("models.jsonl", 'r') as file:
170
- for line in file:
171
- models.append(json.loads(line))
172
-
173
- data = []
174
- for entry in arc_agi_leaderboard:
175
- model_name = entry['model']
176
- score = entry['score']
177
- model_info = next((m for m in models if m['Name'] == model_name), None)
178
- if model_info:
179
- release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
180
- data.append({'model': model_name, 'score': score, 'release_date': release_date})
181
- else:
182
- print(f"[WARNING] Model '{model_name}' not found in models.jsonl")
183
-
184
- data.sort(key=lambda x: x['release_date'])
185
-
186
- x_dates = [d['release_date'] for d in data]
187
- y_scores = []
188
- max_score = 0
189
- for entry in data:
190
- if entry['score'] > max_score:
191
- max_score = entry['score']
192
- y_scores.append(max_score)
193
-
194
- fig = go.Figure()
195
-
196
- fig.add_trace(go.Scatter(
197
- x=x_dates,
198
- y=y_scores,
199
- mode='lines',
200
- line=dict(shape='hv', width=2),
201
- name='ARC-AGI Score'
202
- ))
203
-
204
- for i, entry in enumerate(data):
205
- if i == 0 or y_scores[i] > y_scores[i - 1]:
206
- fig.add_trace(go.Scatter(
207
- x=[entry['release_date']],
208
- y=[entry['score']],
209
- mode='markers+text',
210
- marker=dict(size=10),
211
- text=[entry['model']],
212
- textposition="top center",
213
- name=entry['model']
214
- ))
215
-
216
- fig.update_layout(
217
- title='ARC-AGI Score Progression Over Time',
218
- xaxis_title='Release Date',
219
- yaxis_title='ARC-AGI Score',
220
- hovermode='x unified',
221
- xaxis=dict(
222
- range=[date(2024, 5, 13), date(2024, 9, 17)],
223
- type='date'
224
- ),
225
- yaxis=dict(
226
- range=[0, 100]
227
- ),
228
- height=800
229
- )
230
-
231
- return fig
232
-
233
-
234
- def create_simple_bench_plot() -> go.Figure:
235
  simple_bench_leaderboard = []
236
- with open("simple_bench_leaderboard.jsonl", 'r') as file:
237
  for line in file:
238
  simple_bench_leaderboard.append(json.loads(line))
239
 
@@ -270,7 +201,7 @@ def create_simple_bench_plot() -> go.Figure:
270
  y=y_scores,
271
  mode='lines',
272
  line=dict(shape='hv', width=2),
273
- name='Simple Bench Score'
274
  ))
275
 
276
  for i, entry in enumerate(data):
@@ -286,16 +217,16 @@ def create_simple_bench_plot() -> go.Figure:
286
  ))
287
 
288
  fig.update_layout(
289
- title='Simple Bench Score Progression Over Time',
290
  xaxis_title='Release Date',
291
- yaxis_title='Simple Bench Score',
292
  hovermode='x unified',
293
  xaxis=dict(
294
- range=[date(2023, 6, 13), date(2024, 8, 14)],
295
  type='date'
296
  ),
297
  yaxis=dict(
298
- range=[0, 100]
299
  ),
300
  height=800
301
  )
@@ -333,22 +264,28 @@ with gr.Blocks() as demo:
333
  with gr.Tab("API Cost for Specific Performance Level", interactive=False):
334
  api_cost_for_performance_plot: gr.Plot = gr.Plot()
335
  with gr.Tab("System Performance Over Time"):
336
- with gr.Tab("ARC-AGI") as arc_agi_tab:
337
  arc_agi_plot: gr.Plot = gr.Plot()
338
  with gr.Tab("Simple Bench") as simple_bench_tab:
339
  simple_bench_plot: gr.Plot = gr.Plot()
 
 
 
 
 
 
 
 
340
  with gr.Tab("BigCodeBench", interactive=False):
341
  bigcodebench_plot: gr.Plot = gr.Plot()
342
- with gr.Tab("Codeforces", interactive=False):
343
- codeforces_plot: gr.Plot = gr.Plot()
344
  with gr.Tab("GAIA", interactive=False):
345
  gaia_plot: gr.Plot = gr.Plot()
346
  with gr.Tab("GPQA", interactive=False):
347
  gpqa_plot: gr.Plot = gr.Plot()
348
  with gr.Tab("HumanEval", interactive=False):
349
  humaneval_plot: gr.Plot = gr.Plot()
350
- with gr.Tab("LMSYS", interactive=False):
351
- lmsys_plot: gr.Plot = gr.Plot()
352
  with gr.Tab("MATH", interactive=False):
353
  math_plot: gr.Plot = gr.Plot()
354
  with gr.Tab("OpenCompass", interactive=False):
@@ -374,8 +311,23 @@ with gr.Blocks() as demo:
374
  outputs=[size_for_performance_plot,
375
  size_for_performance_category_dropdown,
376
  size_for_performance_comparison_model_dropdown])
377
- arc_agi_tab.select(fn=create_arc_agi_plot, outputs=arc_agi_plot)
378
- simple_bench_tab.select(fn=create_simple_bench_plot, outputs=simple_bench_plot)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
 
381
  if __name__ == "__main__":
 
159
  gr.Dropdown(choices=list(elo_ratings_for_category.keys()), value=model_to_compare, interactive=True))
160
 
161
 
162
+ def create_simple_plot(data_path: str,
163
+ name: str,
164
+ start_date: datetime, end_date: datetime,
165
+ min_value: int = 0, max_value: int = 100) -> go.Figure:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  simple_bench_leaderboard = []
167
+ with open(data_path, 'r') as file:
168
  for line in file:
169
  simple_bench_leaderboard.append(json.loads(line))
170
 
 
201
  y=y_scores,
202
  mode='lines',
203
  line=dict(shape='hv', width=2),
204
+ name='Best Score to Date'
205
  ))
206
 
207
  for i, entry in enumerate(data):
 
217
  ))
218
 
219
  fig.update_layout(
220
+ title=f'{name} Over Time',
221
  xaxis_title='Release Date',
222
+ yaxis_title=name,
223
  hovermode='x unified',
224
  xaxis=dict(
225
+ range=[start_date, end_date],
226
  type='date'
227
  ),
228
  yaxis=dict(
229
+ range=[min_value, max_value]
230
  ),
231
  height=800
232
  )
 
264
  with gr.Tab("API Cost for Specific Performance Level", interactive=False):
265
  api_cost_for_performance_plot: gr.Plot = gr.Plot()
266
  with gr.Tab("System Performance Over Time"):
267
+ with gr.Tab("ARC-AGI-Pub") as arc_agi_tab:
268
  arc_agi_plot: gr.Plot = gr.Plot()
269
  with gr.Tab("Simple Bench") as simple_bench_tab:
270
  simple_bench_plot: gr.Plot = gr.Plot()
271
+ with gr.Tab("PlanBench") as planbench_tab:
272
+ planbench_plot: gr.Plot = gr.Plot()
273
+ planbench_markdown: gr.Markdown = gr.Markdown(
274
+ value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
275
+ )
276
+ with gr.Tab("Codeforces") as codeforces_tab:
277
+ with gr.Tab("General-Purpose Systems"):
278
+ codeforces_plot: gr.Plot = gr.Plot()
279
  with gr.Tab("BigCodeBench", interactive=False):
280
  bigcodebench_plot: gr.Plot = gr.Plot()
 
 
281
  with gr.Tab("GAIA", interactive=False):
282
  gaia_plot: gr.Plot = gr.Plot()
283
  with gr.Tab("GPQA", interactive=False):
284
  gpqa_plot: gr.Plot = gr.Plot()
285
  with gr.Tab("HumanEval", interactive=False):
286
  humaneval_plot: gr.Plot = gr.Plot()
287
+ with gr.Tab("Chatbot Arena", interactive=False):
288
+ chatbot_arena_plot: gr.Plot = gr.Plot()
289
  with gr.Tab("MATH", interactive=False):
290
  math_plot: gr.Plot = gr.Plot()
291
  with gr.Tab("OpenCompass", interactive=False):
 
311
  outputs=[size_for_performance_plot,
312
  size_for_performance_category_dropdown,
313
  size_for_performance_comparison_model_dropdown])
314
+ arc_agi_tab.select(fn=create_simple_plot,
315
+ inputs=[gr.State("arc_agi_leaderboard.jsonl"), gr.State("ARC-AGI-Pub (Public Eval) Score"),
316
+ gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20))],
317
+ outputs=arc_agi_plot)
318
+ simple_bench_tab.select(fn=create_simple_plot,
319
+ inputs=[gr.State("simple_bench_leaderboard.jsonl"), gr.State("Simple Bench Score"),
320
+ gr.State(date(2023, 6, 13)), gr.State(date(2024, 8, 14))],
321
+ outputs=simple_bench_plot)
322
+ codeforces_tab.select(fn=create_simple_plot,
323
+ inputs=[gr.State("codeforces_leaderboard.jsonl"), gr.State("Codeforces (Elo Rating)"),
324
+ gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20)),
325
+ gr.State(800), gr.State(3000)],
326
+ outputs=codeforces_plot)
327
+ planbench_tab.select(fn=create_simple_plot,
328
+ inputs=[gr.State("planbench_leaderboard.jsonl"), gr.State("PlanBench (Mystery Blocksworld, 0-shot) Score"),
329
+ gr.State(date(2023, 3, 14)), gr.State(date(2024, 9, 23))],
330
+ outputs=planbench_plot)
331
 
332
 
333
  if __name__ == "__main__":
arc_agi_leaderboard.jsonl CHANGED
@@ -1,3 +1,4 @@
 
1
  {"model": "o1-preview-2024-09-12", "score": 21}
2
  {"model": "claude-3-5-sonnet-20240620", "score": 21}
3
  {"model": "o1-mini-2024-09-12", "score": 13}
 
1
+ {"model": "o3", "score": 82.8}
2
  {"model": "o1-preview-2024-09-12", "score": 21}
3
  {"model": "claude-3-5-sonnet-20240620", "score": 21}
4
  {"model": "o1-mini-2024-09-12", "score": 13}
codeforces_leaderboard.jsonl ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"model": "o3", "score": 2727}
2
+ {"model": "o3-mini", "score": 2073}
3
+ {"model": "o1", "score": 1673}
4
+ {"model": "o1-mini", "score": 1650}
5
+ {"model": "o1-preview", "score": 1258}
6
+ {"model": "gpt-4o", "score": 808}
models.jsonl CHANGED
@@ -1,5 +1,10 @@
 
 
 
1
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
2
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
3
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
4
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -11,6 +16,7 @@
11
  {"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
12
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
13
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
14
  {"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
15
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
16
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
@@ -19,6 +25,7 @@
19
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
20
  {"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
21
  {"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
 
22
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
23
  {"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
24
  {"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -48,6 +55,8 @@
48
  {"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
49
  {"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
50
  {"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
 
 
51
  {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
52
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
53
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -143,4 +152,5 @@
143
  {"Name": "fastchat-t5-3b", "Release Date": "2023-04-27", "Total Parameters": 3, "Active Parameters": 3, "API Cost": 0}
144
  {"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
145
  {"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
146
- {"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
 
 
1
+ {"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
+ {"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
+ {"Name": "o1", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
4
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
+ {"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
6
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
7
+ {"Name": "o1-mini", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
8
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
9
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
10
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
16
  {"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
17
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
18
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
19
+ {"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
20
  {"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
21
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
22
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
 
25
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
26
  {"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
27
  {"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
28
+ {"Name": "llama-3.1-405b", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
29
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
30
  {"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
31
  {"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
55
  {"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
56
  {"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
57
  {"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
58
+ {"Name": "gpt-4", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
59
+ {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
60
  {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
61
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
62
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
152
  {"Name": "fastchat-t5-3b", "Release Date": "2023-04-27", "Total Parameters": 3, "Active Parameters": 3, "API Cost": 0}
153
  {"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
154
  {"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
155
+ {"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
156
+ {"Name": "gpt-3.5", "Release Date": "2022-11-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
planbench_leaderboard.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 52.8}
2
+ {"model": "llama-3.1-405b", "score": 0.8}
3
+ {"model": "gpt-4", "score": 0.16}