Spaces:
Running
Running
Konstantin Chernyshev
commited on
Commit
·
51510cb
1
Parent(s):
c6356a2
chore: add more u-math models
Browse files- data/u_math_eval_results.json +313 -40
data/u_math_eval_results.json
CHANGED
@@ -78,7 +78,7 @@
|
|
78 |
]
|
79 |
},
|
80 |
{
|
81 |
-
"model_name": "
|
82 |
"judge_model_name": "gpt-4o-2024-08-06",
|
83 |
"u_math": [
|
84 |
17.0,
|
@@ -311,6 +311,45 @@
|
|
311 |
0.0
|
312 |
]
|
313 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
{
|
315 |
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
|
316 |
"judge_model_name": "gpt-4o-2024-08-06",
|
@@ -467,6 +506,45 @@
|
|
467 |
0.25
|
468 |
]
|
469 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
470 |
{
|
471 |
"model_name": "Qwen/Qwen2-VL-72B-Instruct",
|
472 |
"judge_model_name": "gpt-4o-2024-08-06",
|
@@ -1013,6 +1091,45 @@
|
|
1013 |
0.0
|
1014 |
]
|
1015 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1016 |
{
|
1017 |
"model_name": "Nexusflow/Athene-V2-Chat",
|
1018 |
"judge_model_name": "gpt-4o-2024-08-06",
|
@@ -1091,45 +1208,6 @@
|
|
1091 |
0.5
|
1092 |
]
|
1093 |
},
|
1094 |
-
{
|
1095 |
-
"model_name": "Qwen/QVQ-72B-Preview",
|
1096 |
-
"judge_model_name": "gpt-4o-2024-08-06",
|
1097 |
-
"u_math": [
|
1098 |
-
50.5455,
|
1099 |
-
59.3333,
|
1100 |
-
11.0
|
1101 |
-
],
|
1102 |
-
"algebra": [
|
1103 |
-
0.7833,
|
1104 |
-
0.9267,
|
1105 |
-
0.0667
|
1106 |
-
],
|
1107 |
-
"differential_calc": [
|
1108 |
-
0.3182,
|
1109 |
-
0.4467,
|
1110 |
-
0.0429
|
1111 |
-
],
|
1112 |
-
"integral_calc": [
|
1113 |
-
0.1731,
|
1114 |
-
0.1933,
|
1115 |
-
0.1207
|
1116 |
-
],
|
1117 |
-
"multivariable_calculus": [
|
1118 |
-
0.4888,
|
1119 |
-
0.5333,
|
1120 |
-
0.25
|
1121 |
-
],
|
1122 |
-
"precalculus_review": [
|
1123 |
-
0.8688,
|
1124 |
-
0.9133,
|
1125 |
-
0.2
|
1126 |
-
],
|
1127 |
-
"sequences_series": [
|
1128 |
-
0.539,
|
1129 |
-
0.5467,
|
1130 |
-
0.25
|
1131 |
-
]
|
1132 |
-
},
|
1133 |
{
|
1134 |
"model_name": "google/gemini-1.5-flash",
|
1135 |
"judge_model_name": "gpt-4o-2024-08-06",
|
@@ -1169,6 +1247,84 @@
|
|
1169 |
0.5
|
1170 |
]
|
1171 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1172 |
{
|
1173 |
"model_name": "google/gemini-1.5-pro",
|
1174 |
"judge_model_name": "gpt-4o-2024-08-06",
|
@@ -1246,5 +1402,122 @@
|
|
1246 |
0.7867,
|
1247 |
0.5
|
1248 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1249 |
}
|
1250 |
]
|
|
|
78 |
]
|
79 |
},
|
80 |
{
|
81 |
+
"model_name": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
82 |
"judge_model_name": "gpt-4o-2024-08-06",
|
83 |
"u_math": [
|
84 |
17.0,
|
|
|
311 |
0.0
|
312 |
]
|
313 |
},
|
314 |
+
{
|
315 |
+
"model_name": "liquid/lfm-7b",
|
316 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
317 |
+
"u_math": [
|
318 |
+
21.0909,
|
319 |
+
24.5556,
|
320 |
+
5.5
|
321 |
+
],
|
322 |
+
"algebra": [
|
323 |
+
0.5722,
|
324 |
+
0.68,
|
325 |
+
0.0333
|
326 |
+
],
|
327 |
+
"differential_calc": [
|
328 |
+
0.0636,
|
329 |
+
0.04,
|
330 |
+
0.1143
|
331 |
+
],
|
332 |
+
"integral_calc": [
|
333 |
+
0.0096,
|
334 |
+
0.0067,
|
335 |
+
0.0172
|
336 |
+
],
|
337 |
+
"multivariable_calculus": [
|
338 |
+
0.0674,
|
339 |
+
0.0733,
|
340 |
+
0.0357
|
341 |
+
],
|
342 |
+
"precalculus_review": [
|
343 |
+
0.525,
|
344 |
+
0.56,
|
345 |
+
0.0
|
346 |
+
],
|
347 |
+
"sequences_series": [
|
348 |
+
0.1104,
|
349 |
+
0.1133,
|
350 |
+
0.0
|
351 |
+
]
|
352 |
+
},
|
353 |
{
|
354 |
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
|
355 |
"judge_model_name": "gpt-4o-2024-08-06",
|
|
|
506 |
0.25
|
507 |
]
|
508 |
},
|
509 |
+
{
|
510 |
+
"model_name": "mistralai/Mistral-Small-24B-Instruct-2501",
|
511 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
512 |
+
"u_math": [
|
513 |
+
29.4545,
|
514 |
+
35.0,
|
515 |
+
4.5
|
516 |
+
],
|
517 |
+
"algebra": [
|
518 |
+
0.6889,
|
519 |
+
0.82,
|
520 |
+
0.0333
|
521 |
+
],
|
522 |
+
"differential_calc": [
|
523 |
+
0.0545,
|
524 |
+
0.06,
|
525 |
+
0.0429
|
526 |
+
],
|
527 |
+
"integral_calc": [
|
528 |
+
0.0577,
|
529 |
+
0.06,
|
530 |
+
0.0517
|
531 |
+
],
|
532 |
+
"multivariable_calculus": [
|
533 |
+
0.1404,
|
534 |
+
0.16,
|
535 |
+
0.0357
|
536 |
+
],
|
537 |
+
"precalculus_review": [
|
538 |
+
0.6562,
|
539 |
+
0.6933,
|
540 |
+
0.1
|
541 |
+
],
|
542 |
+
"sequences_series": [
|
543 |
+
0.2987,
|
544 |
+
0.3067,
|
545 |
+
0.0
|
546 |
+
]
|
547 |
+
},
|
548 |
{
|
549 |
"model_name": "Qwen/Qwen2-VL-72B-Instruct",
|
550 |
"judge_model_name": "gpt-4o-2024-08-06",
|
|
|
1091 |
0.0
|
1092 |
]
|
1093 |
},
|
1094 |
+
{
|
1095 |
+
"model_name": "microsoft/phi-4",
|
1096 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1097 |
+
"u_math": [
|
1098 |
+
44.0909,
|
1099 |
+
51.3333,
|
1100 |
+
11.5
|
1101 |
+
],
|
1102 |
+
"algebra": [
|
1103 |
+
0.7556,
|
1104 |
+
0.9067,
|
1105 |
+
0.0
|
1106 |
+
],
|
1107 |
+
"differential_calc": [
|
1108 |
+
0.25,
|
1109 |
+
0.3267,
|
1110 |
+
0.0857
|
1111 |
+
],
|
1112 |
+
"integral_calc": [
|
1113 |
+
0.1202,
|
1114 |
+
0.0933,
|
1115 |
+
0.1897
|
1116 |
+
],
|
1117 |
+
"multivariable_calculus": [
|
1118 |
+
0.4045,
|
1119 |
+
0.46,
|
1120 |
+
0.1071
|
1121 |
+
],
|
1122 |
+
"precalculus_review": [
|
1123 |
+
0.8438,
|
1124 |
+
0.8867,
|
1125 |
+
0.2
|
1126 |
+
],
|
1127 |
+
"sequences_series": [
|
1128 |
+
0.4026,
|
1129 |
+
0.4067,
|
1130 |
+
0.25
|
1131 |
+
]
|
1132 |
+
},
|
1133 |
{
|
1134 |
"model_name": "Nexusflow/Athene-V2-Chat",
|
1135 |
"judge_model_name": "gpt-4o-2024-08-06",
|
|
|
1208 |
0.5
|
1209 |
]
|
1210 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1211 |
{
|
1212 |
"model_name": "google/gemini-1.5-flash",
|
1213 |
"judge_model_name": "gpt-4o-2024-08-06",
|
|
|
1247 |
0.5
|
1248 |
]
|
1249 |
},
|
1250 |
+
{
|
1251 |
+
"model_name": "deepseek-ai/DeepSeek-V3",
|
1252 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1253 |
+
"u_math": [
|
1254 |
+
51.9091,
|
1255 |
+
60.4444,
|
1256 |
+
13.5
|
1257 |
+
],
|
1258 |
+
"algebra": [
|
1259 |
+
0.8222,
|
1260 |
+
0.98,
|
1261 |
+
0.0333
|
1262 |
+
],
|
1263 |
+
"differential_calc": [
|
1264 |
+
0.2591,
|
1265 |
+
0.3533,
|
1266 |
+
0.0571
|
1267 |
+
],
|
1268 |
+
"integral_calc": [
|
1269 |
+
0.2163,
|
1270 |
+
0.2067,
|
1271 |
+
0.2414
|
1272 |
+
],
|
1273 |
+
"multivariable_calculus": [
|
1274 |
+
0.5112,
|
1275 |
+
0.5733,
|
1276 |
+
0.1786
|
1277 |
+
],
|
1278 |
+
"precalculus_review": [
|
1279 |
+
0.85,
|
1280 |
+
0.9,
|
1281 |
+
0.1
|
1282 |
+
],
|
1283 |
+
"sequences_series": [
|
1284 |
+
0.6104,
|
1285 |
+
0.6133,
|
1286 |
+
0.5
|
1287 |
+
]
|
1288 |
+
},
|
1289 |
+
{
|
1290 |
+
"model_name": "Qwen/QVQ-72b-Preview",
|
1291 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1292 |
+
"u_math": [
|
1293 |
+
55.0909,
|
1294 |
+
59.3333,
|
1295 |
+
36.0
|
1296 |
+
],
|
1297 |
+
"algebra": [
|
1298 |
+
0.8278,
|
1299 |
+
0.9267,
|
1300 |
+
0.3333
|
1301 |
+
],
|
1302 |
+
"differential_calc": [
|
1303 |
+
0.4136,
|
1304 |
+
0.4467,
|
1305 |
+
0.3429
|
1306 |
+
],
|
1307 |
+
"integral_calc": [
|
1308 |
+
0.25,
|
1309 |
+
0.1933,
|
1310 |
+
0.3966
|
1311 |
+
],
|
1312 |
+
"multivariable_calculus": [
|
1313 |
+
0.5169,
|
1314 |
+
0.5333,
|
1315 |
+
0.4286
|
1316 |
+
],
|
1317 |
+
"precalculus_review": [
|
1318 |
+
0.875,
|
1319 |
+
0.9133,
|
1320 |
+
0.3
|
1321 |
+
],
|
1322 |
+
"sequences_series": [
|
1323 |
+
0.5325,
|
1324 |
+
0.5467,
|
1325 |
+
0.0
|
1326 |
+
]
|
1327 |
+
},
|
1328 |
{
|
1329 |
"model_name": "google/gemini-1.5-pro",
|
1330 |
"judge_model_name": "gpt-4o-2024-08-06",
|
|
|
1402 |
0.7867,
|
1403 |
0.5
|
1404 |
]
|
1405 |
+
},
|
1406 |
+
{
|
1407 |
+
"model_name": "deepseek-ai/DeepSeek-R1",
|
1408 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1409 |
+
"u_math": [
|
1410 |
+
63.2727,
|
1411 |
+
73.6667,
|
1412 |
+
16.5
|
1413 |
+
],
|
1414 |
+
"algebra": [
|
1415 |
+
0.8,
|
1416 |
+
0.96,
|
1417 |
+
0.0
|
1418 |
+
],
|
1419 |
+
"differential_calc": [
|
1420 |
+
0.4818,
|
1421 |
+
0.6667,
|
1422 |
+
0.0857
|
1423 |
+
],
|
1424 |
+
"integral_calc": [
|
1425 |
+
0.3606,
|
1426 |
+
0.4,
|
1427 |
+
0.2586
|
1428 |
+
],
|
1429 |
+
"multivariable_calculus": [
|
1430 |
+
0.6124,
|
1431 |
+
0.6667,
|
1432 |
+
0.3214
|
1433 |
+
],
|
1434 |
+
"precalculus_review": [
|
1435 |
+
0.9188,
|
1436 |
+
0.9733,
|
1437 |
+
0.1
|
1438 |
+
],
|
1439 |
+
"sequences_series": [
|
1440 |
+
0.7468,
|
1441 |
+
0.7533,
|
1442 |
+
0.5
|
1443 |
+
]
|
1444 |
+
},
|
1445 |
+
{
|
1446 |
+
"model_name": "o1-mini",
|
1447 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1448 |
+
"u_math": [
|
1449 |
+
63.4545,
|
1450 |
+
73.3333,
|
1451 |
+
19.0
|
1452 |
+
],
|
1453 |
+
"algebra": [
|
1454 |
+
0.8,
|
1455 |
+
0.96,
|
1456 |
+
0.0
|
1457 |
+
],
|
1458 |
+
"differential_calc": [
|
1459 |
+
0.4864,
|
1460 |
+
0.66,
|
1461 |
+
0.1143
|
1462 |
+
],
|
1463 |
+
"integral_calc": [
|
1464 |
+
0.3798,
|
1465 |
+
0.42,
|
1466 |
+
0.2759
|
1467 |
+
],
|
1468 |
+
"multivariable_calculus": [
|
1469 |
+
0.6404,
|
1470 |
+
0.6867,
|
1471 |
+
0.3929
|
1472 |
+
],
|
1473 |
+
"precalculus_review": [
|
1474 |
+
0.8875,
|
1475 |
+
0.9333,
|
1476 |
+
0.2
|
1477 |
+
],
|
1478 |
+
"sequences_series": [
|
1479 |
+
0.7273,
|
1480 |
+
0.74,
|
1481 |
+
0.25
|
1482 |
+
]
|
1483 |
+
},
|
1484 |
+
{
|
1485 |
+
"model_name": "o1",
|
1486 |
+
"judge_model_name": "gpt-4o-2024-08-06",
|
1487 |
+
"u_math": [
|
1488 |
+
70.7031,
|
1489 |
+
77.2059,
|
1490 |
+
45.1923
|
1491 |
+
],
|
1492 |
+
"algebra": [
|
1493 |
+
0.3556,
|
1494 |
+
0.3867,
|
1495 |
+
0.2
|
1496 |
+
],
|
1497 |
+
"differential_calc": [
|
1498 |
+
0.2818,
|
1499 |
+
0.3133,
|
1500 |
+
0.2143
|
1501 |
+
],
|
1502 |
+
"integral_calc": [
|
1503 |
+
0.2692,
|
1504 |
+
0.2733,
|
1505 |
+
0.2586
|
1506 |
+
],
|
1507 |
+
"multivariable_calculus": [
|
1508 |
+
0.3202,
|
1509 |
+
0.32,
|
1510 |
+
0.3214
|
1511 |
+
],
|
1512 |
+
"precalculus_review": [
|
1513 |
+
0.4,
|
1514 |
+
0.4133,
|
1515 |
+
0.2
|
1516 |
+
],
|
1517 |
+
"sequences_series": [
|
1518 |
+
0.3831,
|
1519 |
+
0.3933,
|
1520 |
+
0.0
|
1521 |
+
]
|
1522 |
}
|
1523 |
]
|