Spaces:
Running
Running
Konstantin Chernyshev
committed on
Commit
·
c933ce0
1
Parent(s):
7c8ff05
fix: add charts
Browse files- app.py +18 -1
- data/mu_math_eval_results.json +10 -10
- src/populate.py +40 -5
app.py
CHANGED
@@ -186,9 +186,25 @@ with demo:
|
|
186 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
187 |
with gr.TabItem("๐ U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
|
188 |
leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
with gr.TabItem("μ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
|
191 |
leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
with gr.TabItem("๐ About", elem_id="about-tab-table", id=2):
|
194 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
@@ -205,4 +221,5 @@ with demo:
|
|
205 |
scheduler = BackgroundScheduler()
|
206 |
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
|
207 |
scheduler.start()
|
208 |
-
demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
|
|
|
|
186 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
187 |
with gr.TabItem("๐ U-MATH", elem_id="u-math-benchmark-tab-table", id=0):
|
188 |
leaderboard_umath = init_leaderboard(LEADERBOARD_U_MATH_DF, U_MATH_COLUMNS_DICT)
|
189 |
+
gr.ScatterPlot(
|
190 |
+
value=LEADERBOARD_U_MATH_DF,
|
191 |
+
title="U-MATH: Text vs Visual Accuracy",
|
192 |
+
x=U_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
|
193 |
+
y=U_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
|
194 |
+
color=U_MATH_COLUMNS_DICT["model_family"].pretty_name,
|
195 |
+
tooltip=[U_MATH_COLUMNS_DICT["full_model_name"].pretty_name, U_MATH_COLUMNS_DICT["u_math_acc"].pretty_name],
|
196 |
+
)
|
197 |
|
198 |
with gr.TabItem("μ-MATH (Meta-Benchmark)", elem_id="mu-math-benchmark-tab-table", id=1):
|
199 |
leaderboard_mumath = init_leaderboard(LEADERBOARD_MU_MATH_DF, MU_MATH_COLUMNS_DICT)
|
200 |
+
gr.ScatterPlot(
|
201 |
+
value=LEADERBOARD_MU_MATH_DF,
|
202 |
+
title="μ-MATH: True Positive Rate (Recall) vs True Negative Rate (Specificity)",
|
203 |
+
x=MU_MATH_COLUMNS_DICT["mu_math_tpr"].pretty_name,
|
204 |
+
y=MU_MATH_COLUMNS_DICT["mu_math_tnr"].pretty_name,
|
205 |
+
color=MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
|
206 |
+
tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
|
207 |
+
)
|
208 |
|
209 |
with gr.TabItem("๐ About", elem_id="about-tab-table", id=2):
|
210 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
221 |
scheduler = BackgroundScheduler()
|
222 |
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
|
223 |
scheduler.start()
|
224 |
+
# demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
|
225 |
+
demo.queue(default_concurrency_limit=40).launch()
|
data/mu_math_eval_results.json
CHANGED
@@ -2,19 +2,19 @@
|
|
2 |
{
|
3 |
"model_name": "mistralai/Ministral-8B-Instruct-2410",
|
4 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
5 |
-
"mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628],
|
6 |
-
"GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637],
|
7 |
-
"Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466],
|
8 |
-
"Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769],
|
9 |
-
"Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574]
|
10 |
},
|
11 |
{
|
12 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
13 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
14 |
-
"mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682],
|
15 |
-
"GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681],
|
16 |
-
"Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508],
|
17 |
-
"Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832],
|
18 |
-
"Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627]
|
19 |
}
|
20 |
]
|
|
|
2 |
{
|
3 |
"model_name": "mistralai/Ministral-8B-Instruct-2410",
|
4 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
5 |
+
"mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628, 0.574],
|
6 |
+
"GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637, 0.574],
|
7 |
+
"Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466, 0.574],
|
8 |
+
"Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769, 0.574],
|
9 |
+
"Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574, 0.574]
|
10 |
},
|
11 |
{
|
12 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
13 |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
|
14 |
+
"mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682, 0.574],
|
15 |
+
"GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681, 0.574],
|
16 |
+
"Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508, 0.574],
|
17 |
+
"Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832, 0.574],
|
18 |
+
"Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627, 0.574]
|
19 |
}
|
20 |
]
|
src/populate.py
CHANGED
@@ -7,6 +7,9 @@ from huggingface_hub import model_info
|
|
7 |
from transformers import AutoConfig
|
8 |
|
9 |
|
|
|
|
|
|
|
10 |
def is_model_on_hub(
|
11 |
model_name: str, revision: str, token: str = None, trust_remote_code=False
|
12 |
) -> tuple[bool, str | None, str | None]:
|
@@ -48,6 +51,22 @@ def model_type_to_symbol(model_type: str) -> str:
|
|
48 |
|
49 |
def get_hf_data_by_model_name(model_name: str) -> dict:
|
50 |
"""Get model data from Hugging Face API by model name"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
|
52 |
if not still_on_hub and '/' in model_name:
|
53 |
print(f"Model {model_name} is not on the hub, try unsloth/...")
|
@@ -69,6 +88,8 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
|
|
69 |
print("SafeTensors not found in", model_name, e)
|
70 |
if 'Pixtral-12B' in model_name:
|
71 |
num_params = 12
|
|
|
|
|
72 |
pass
|
73 |
print("num_params", model_name, num_params)
|
74 |
|
@@ -93,6 +114,7 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
|
|
93 |
"model_size": num_params if num_params else None,
|
94 |
"model_url": model_url,
|
95 |
"model_license": model_license,
|
|
|
96 |
}
|
97 |
|
98 |
|
@@ -109,11 +131,14 @@ class Field:
|
|
109 |
MODEL_COLUMNS_DICT = {
|
110 |
"model_type_symbol": Field("T", "str", never_hidden=True),
|
111 |
"model_size_symbol": Field("S", "str", never_hidden=True),
|
|
|
112 |
"model_name": Field("Model Name", "markdown", never_hidden=True),
|
113 |
"model_type": Field("Type", "str", displayed_by_default=False),
|
114 |
"model_size": Field("#Params (B)", "number", displayed_by_default=False),
|
|
|
115 |
"model_architecture": Field("Architecture", "str", displayed_by_default=False),
|
116 |
"model_license": Field("License", "markdown", displayed_by_default=False),
|
|
|
117 |
}
|
118 |
|
119 |
U_MATH_COLUMNS_DICT = {
|
@@ -233,8 +258,11 @@ def get_u_math_leaderboard_df() -> pd.DataFrame:
|
|
233 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
234 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
235 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
|
|
236 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
|
|
237 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
|
|
238 |
df["model_name"] = df["model_name"].apply(
|
239 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
240 |
)
|
@@ -253,12 +281,16 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
|
|
253 |
|
254 |
# Calculate columns with prefixes f1, tpr, tnr, ppv, npv
|
255 |
for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
256 |
-
df[col + "
|
257 |
-
df[col + "
|
258 |
-
df[col + "
|
259 |
-
df[col + "
|
260 |
-
df[col + "
|
|
|
|
|
261 |
del df[col]
|
|
|
|
|
262 |
|
263 |
# # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
|
264 |
# for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
@@ -281,8 +313,11 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
|
|
281 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
282 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
283 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
|
|
284 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
|
|
285 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
|
|
286 |
df["model_name"] = df["model_name"].apply(
|
287 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
288 |
)
|
|
|
7 |
from transformers import AutoConfig
|
8 |
|
9 |
|
10 |
+
UNKNOWN_MODEL_SHOW_SIZE = 150
|
11 |
+
|
12 |
+
|
13 |
def is_model_on_hub(
|
14 |
model_name: str, revision: str, token: str = None, trust_remote_code=False
|
15 |
) -> tuple[bool, str | None, str | None]:
|
|
|
51 |
|
52 |
def get_hf_data_by_model_name(model_name: str) -> dict:
|
53 |
"""Get model data from Hugging Face API by model name"""
|
54 |
+
model_family = "Unknown"
|
55 |
+
if 'mistral' in model_name.lower() or 'numina' in model_name.lower():
|
56 |
+
model_family = "Mistral"
|
57 |
+
elif 'meta-llama' in model_name.lower():
|
58 |
+
model_family = "LLaMA"
|
59 |
+
elif 'claude' in model_name.lower():
|
60 |
+
model_family = "Claude"
|
61 |
+
elif 'qwen' in model_name.lower() or 'athene' in model_name.lower() or 'qwq' in model_name.lower() or 'qvq' in model_name.lower():
|
62 |
+
model_family = "Qwen"
|
63 |
+
elif 'gpt' in model_name.lower() or 'o1' in model_name.lower():
|
64 |
+
model_family = "GPT"
|
65 |
+
elif 'gemini' in model_name.lower():
|
66 |
+
model_family = "Gemini"
|
67 |
+
elif 'deepseek' in model_name.lower():
|
68 |
+
model_family = "DeepSeek"
|
69 |
+
|
70 |
still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
|
71 |
if not still_on_hub and '/' in model_name:
|
72 |
print(f"Model {model_name} is not on the hub, try unsloth/...")
|
|
|
88 |
print("SafeTensors not found in", model_name, e)
|
89 |
if 'Pixtral-12B' in model_name:
|
90 |
num_params = 12
|
91 |
+
elif 'Pixtral-Large-Instruct-2411' in model_name:
|
92 |
+
num_params = 123.3
|
93 |
pass
|
94 |
print("num_params", model_name, num_params)
|
95 |
|
|
|
114 |
"model_size": num_params if num_params else None,
|
115 |
"model_url": model_url,
|
116 |
"model_license": model_license,
|
117 |
+
"model_family": model_family,
|
118 |
}
|
119 |
|
120 |
|
|
|
131 |
MODEL_COLUMNS_DICT = {
|
132 |
"model_type_symbol": Field("T", "str", never_hidden=True),
|
133 |
"model_size_symbol": Field("S", "str", never_hidden=True),
|
134 |
+
"full_model_name": Field("Full Model Name", "markdown", fully_hidden=True),
|
135 |
"model_name": Field("Model Name", "markdown", never_hidden=True),
|
136 |
"model_type": Field("Type", "str", displayed_by_default=False),
|
137 |
"model_size": Field("#Params (B)", "number", displayed_by_default=False),
|
138 |
+
"model_size_including_unknown": Field("#Params inc. Proprietary (B)", "number", fully_hidden=True),
|
139 |
"model_architecture": Field("Architecture", "str", displayed_by_default=False),
|
140 |
"model_license": Field("License", "markdown", displayed_by_default=False),
|
141 |
+
"model_family": Field("Family", "str", displayed_by_default=False),
|
142 |
}
|
143 |
|
144 |
U_MATH_COLUMNS_DICT = {
|
|
|
258 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
259 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
260 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
261 |
+
df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
|
262 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
263 |
+
df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
|
264 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
265 |
+
df["full_model_name"] = df["model_name"]
|
266 |
df["model_name"] = df["model_name"].apply(
|
267 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
268 |
)
|
|
|
281 |
|
282 |
# Calculate columns with prefixes f1, tpr, tnr, ppv, npv
|
283 |
for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
284 |
+
df[col + "_acc"] = df[col].apply(lambda x: x[0])
|
285 |
+
df[col + "_f1"] = df[col].apply(lambda x: x[1])
|
286 |
+
df[col + "_mcc"] = df[col].apply(lambda x: x[2])
|
287 |
+
df[col + "_tpr"] = df[col].apply(lambda x: x[3])
|
288 |
+
df[col + "_tnr"] = df[col].apply(lambda x: x[4])
|
289 |
+
df[col + "_ppv"] = df[col].apply(lambda x: x[5])
|
290 |
+
df[col + "_npv"] = df[col].apply(lambda x: x[6])
|
291 |
del df[col]
|
292 |
+
del df[col + "_acc"]
|
293 |
+
del df[col + "_mcc"]
|
294 |
|
295 |
# # flatten list [x, y, z] in columns as ["_f1", "_precision", "_recall"] suffixes for columns
|
296 |
# for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
|
|
|
313 |
df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
|
314 |
df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
|
315 |
df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
|
316 |
+
df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
|
317 |
df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
|
318 |
+
df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
|
319 |
df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
|
320 |
+
df["full_model_name"] = df["model_name"]
|
321 |
df["model_name"] = df["model_name"].apply(
|
322 |
lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
|
323 |
)
|