Konstantin Chernyshev committed on
Commit
c933ce0
·
1 Parent(s): 7c8ff05

fix: add charts

Browse files
Files changed (3) hide show
  1. app.py +18 -1
  2. data/mu_math_eval_results.json +10 -10
  3. src/populate.py +40 -5
app.py CHANGED
@@ -186,9 +186,25 @@ with demo:
186
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
187
  with gr.TabItem("๐Ÿ† U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
188
  leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
 
 
 
 
 
 
 
 
189
 
190
  with gr.TabItem("๐Ÿ… ฮผ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
191
  leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
 
 
 
 
 
 
 
 
192
 
193
  with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
194
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -205,4 +221,5 @@ with demo:
205
  scheduler = BackgroundScheduler()
206
  scheduler.add_job(restart_space, "interval", seconds=60 * 60)
207
  scheduler.start()
208
- demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
 
 
186
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
187
  with gr.TabItem("๐Ÿ† U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
188
  leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
189
+ gr.ScatterPlot(
190
+ value=LEADERBOARD_U_MATH_DF,
191
+ title="U-MATH: Text vs Visual Accuracy",
192
+ x=U_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
193
+ y=U_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
194
+ color=U_MATH_COLUMNS_DICT["model_family"].pretty_name,
195
+ tooltip=[U_MATH_COLUMNS_DICT["full_model_name"].pretty_name, U_MATH_COLUMNS_DICT["u_math_acc"].pretty_name],
196
+ )
197
 
198
  with gr.TabItem("๐Ÿ… ฮผ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
199
  leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
200
+ gr.ScatterPlot(
201
+ value=LEADERBOARD_MU_MATH_DF,
202
+ title="μ-MATH: True Positive Rate (Recall) vs True Negative Rate (Specificity)",
203
+ x=MU_MATH_COLUMNS_DICT["mu_math_tpr"].pretty_name,
204
+ y=MU_MATH_COLUMNS_DICT["mu_math_tnr"].pretty_name,
205
+ color=MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
206
+ tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
207
+ )
208
 
209
  with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
210
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
221
  scheduler = BackgroundScheduler()
222
  scheduler.add_job(restart_space, "interval", seconds=60 * 60)
223
  scheduler.start()
224
+ # demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
225
+ demo.queue(default_concurrency_limit=40).launch()
data/mu_math_eval_results.json CHANGED
@@ -2,19 +2,19 @@
2
  {
3
  "model_name": "mistralai/Ministral-8B-Instruct-2410",
4
  "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
5
- "mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628],
6
- "GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637],
7
- "Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466],
8
- "Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769],
9
- "Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574]
10
  },
11
  {
12
  "model_name": "meta-llama/Llama-3.3-70B-Instruct",
13
  "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
14
- "mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682],
15
- "GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681],
16
- "Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508],
17
- "Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832],
18
- "Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627]
19
  }
20
  ]
 
2
  {
3
  "model_name": "mistralai/Ministral-8B-Instruct-2410",
4
  "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
5
+ "mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628, 0.574],
6
+ "GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637, 0.574],
7
+ "Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466, 0.574],
8
+ "Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769, 0.574],
9
+ "Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574, 0.574]
10
  },
11
  {
12
  "model_name": "meta-llama/Llama-3.3-70B-Instruct",
13
  "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
14
+ "mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682, 0.574],
15
+ "GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681, 0.574],
16
+ "Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508, 0.574],
17
+ "Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832, 0.574],
18
+ "Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627, 0.574]
19
  }
20
  ]
src/populate.py CHANGED
@@ -7,6 +7,9 @@ from huggingface_hub import model_info
7
  from transformers import AutoConfig
8
 
9
 
 
 
 
10
  def is_model_on_hub(
11
  model_name: str, revision: str, token: str = None, trust_remote_code=False
12
  ) -> tuple[bool, str | None, str | None]:
@@ -48,6 +51,22 @@ def model_type_to_symbol(model_type: str) -> str:
48
 
49
  def get_hf_data_by_model_name(model_name: str) -> dict:
50
  """Get model data from Hugging Face API by model name"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
52
  if not still_on_hub and '/' in model_name:
53
  print(f"Model {model_name} is not on the hub, try unsloth/...")
@@ -69,6 +88,8 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
69
  print("SafeTensors not found in", model_name, e)
70
  if 'Pixtral-12B' in model_name:
71
  num_params = 12
 
 
72
  pass
73
  print("num_params", model_name, num_params)
74
 
@@ -93,6 +114,7 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
93
  "model_size": num_params if num_params else None,
94
  "model_url": model_url,
95
  "model_license": model_license,
 
96
  }
97
 
98
 
@@ -109,11 +131,14 @@ class Field:
109
  MODEL_COLUMNS_DICT = {
110
  "model_type_symbol": Field("T", "str", never_hidden=True),
111
  "model_size_symbol": Field("S", "str", never_hidden=True),
 
112
  "model_name": Field("Model Name", "markdown", never_hidden=True),
113
  "model_type": Field("Type", "str", displayed_by_default=False),
114
  "model_size": Field("#Params (B)", "number", displayed_by_default=False),
 
115
  "model_architecture": Field("Architecture", "str", displayed_by_default=False),
116
  "model_license": Field("License", "markdown", displayed_by_default=False),
 
117
  }
118
 
119
  U_MATH_COLUMNS_DICT = {
@@ -233,8 +258,11 @@ def get_u_math_leaderboard_df() -> pd.DataFrame:
233
  df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
234
  df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
235
  df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
 
236
  df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
 
237
  df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
 
238
  df["model_name"] = df["model_name"].apply(
239
  lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
240
  )
@@ -253,12 +281,16 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
253
 
254
  # Calculate columns with prefixes f1, tpr, tnr, ppv, npv
255
  for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
256
- df[col + "_f1"] = df[col].apply(lambda x: x[0])
257
- df[col + "_tpr"] = df[col].apply(lambda x: x[1])
258
- df[col + "_tnr"] = df[col].apply(lambda x: x[2])
259
- df[col + "_ppv"] = df[col].apply(lambda x: x[3])
260
- df[col + "_npv"] = df[col].apply(lambda x: x[4])
 
 
261
  del df[col]
 
 
262
 
263
  # # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
264
  # for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
@@ -281,8 +313,11 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
281
  df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
282
  df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
283
  df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
 
284
  df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
 
285
  df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
 
286
  df["model_name"] = df["model_name"].apply(
287
  lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
288
  )
 
7
  from transformers import AutoConfig
8
 
9
 
10
+ UNKNOWN_MODEL_SHOW_SIZE = 150
11
+
12
+
13
  def is_model_on_hub(
14
  model_name: str, revision: str, token: str = None, trust_remote_code=False
15
  ) -> tuple[bool, str | None, str | None]:
 
51
 
52
  def get_hf_data_by_model_name(model_name: str) -> dict:
53
  """Get model data from Hugging Face API by model name"""
54
+ model_family = "Unknown"
55
+ if 'mistral' in model_name.lower() or 'numina' in model_name.lower():
56
+ model_family = "Mistral"
57
+ elif 'meta-llama' in model_name.lower():
58
+ model_family = "LLaMA"
59
+ elif 'claude' in model_name.lower():
60
+ model_family = "Claude"
61
+ elif 'qwen' in model_name.lower() or 'athene' in model_name.lower() or 'qwq' in model_name.lower() or 'qvq' in model_name.lower():
62
+ model_family = "Qwen"
63
+ elif 'gpt' in model_name.lower() or 'o1' in model_name.lower():
64
+ model_family = "GPT"
65
+ elif 'gemini' in model_name.lower():
66
+ model_family = "Gemini"
67
+ elif 'deepseek' in model_name.lower():
68
+ model_family = "DeepSeek"
69
+
70
  still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
71
  if not still_on_hub and '/' in model_name:
72
  print(f"Model {model_name} is not on the hub, try unsloth/...")
 
88
  print("SafeTensors not found in", model_name, e)
89
  if 'Pixtral-12B' in model_name:
90
  num_params = 12
91
+ elif 'Pixtral-Large-Instruct-2411' in model_name:
92
+ num_params = 123.3
93
  pass
94
  print("num_params", model_name, num_params)
95
 
 
114
  "model_size": num_params if num_params else None,
115
  "model_url": model_url,
116
  "model_license": model_license,
117
+ "model_family": model_family,
118
  }
119
 
120
 
 
131
  MODEL_COLUMNS_DICT = {
132
  "model_type_symbol": Field("T", "str", never_hidden=True),
133
  "model_size_symbol": Field("S", "str", never_hidden=True),
134
+ "full_model_name": Field("Full Model Name", "markdown", fully_hidden=True),
135
  "model_name": Field("Model Name", "markdown", never_hidden=True),
136
  "model_type": Field("Type", "str", displayed_by_default=False),
137
  "model_size": Field("#Params (B)", "number", displayed_by_default=False),
138
+ "model_size_including_unknown": Field("#Params inc. Proprietary (B)", "number", fully_hidden=True),
139
  "model_architecture": Field("Architecture", "str", displayed_by_default=False),
140
  "model_license": Field("License", "markdown", displayed_by_default=False),
141
+ "model_family": Field("Family", "str", displayed_by_default=False),
142
  }
143
 
144
  U_MATH_COLUMNS_DICT = {
 
258
  df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
259
  df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
260
  df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
261
+ df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
262
  df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
263
+ df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
264
  df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
265
+ df["full_model_name"] = df["model_name"]
266
  df["model_name"] = df["model_name"].apply(
267
  lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
268
  )
 
281
 
282
  # Calculate columns with prefixes f1, tpr, tnr, ppv, npv
283
  for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
284
+ df[col + "_acc"] = df[col].apply(lambda x: x[0])
285
+ df[col + "_f1"] = df[col].apply(lambda x: x[1])
286
+ df[col + "_mcc"] = df[col].apply(lambda x: x[2])
287
+ df[col + "_tpr"] = df[col].apply(lambda x: x[3])
288
+ df[col + "_tnr"] = df[col].apply(lambda x: x[4])
289
+ df[col + "_ppv"] = df[col].apply(lambda x: x[5])
290
+ df[col + "_npv"] = df[col].apply(lambda x: x[6])
291
  del df[col]
292
+ del df[col + "_acc"]
293
+ del df[col + "_mcc"]
294
 
295
  # # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
296
  # for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
 
313
  df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
314
  df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
315
  df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
316
+ df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
317
  df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
318
+ df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
319
  df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
320
+ df["full_model_name"] = df["model_name"]
321
  df["model_name"] = df["model_name"].apply(
322
  lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
323
  )