Konstantin Chernyshev commited on
Commit
51510cb
·
1 Parent(s): c6356a2

chore: add more u-math models

Browse files
Files changed (1) hide show
  1. data/u_math_eval_results.json +313 -40
data/u_math_eval_results.json CHANGED
@@ -78,7 +78,7 @@
78
  ]
79
  },
80
  {
81
- "model_name": "meta-llama/Llama-3.2-11B-Vision-Instruct",
82
  "judge_model_name": "gpt-4o-2024-08-06",
83
  "u_math": [
84
  17.0,
@@ -311,6 +311,45 @@
311
  0.0
312
  ]
313
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  {
315
  "model_name": "meta-llama/Llama-3.1-8B-Instruct",
316
  "judge_model_name": "gpt-4o-2024-08-06",
@@ -467,6 +506,45 @@
467
  0.25
468
  ]
469
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  {
471
  "model_name": "Qwen/Qwen2-VL-72B-Instruct",
472
  "judge_model_name": "gpt-4o-2024-08-06",
@@ -1013,6 +1091,45 @@
1013
  0.0
1014
  ]
1015
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1016
  {
1017
  "model_name": "Nexusflow/Athene-V2-Chat",
1018
  "judge_model_name": "gpt-4o-2024-08-06",
@@ -1091,45 +1208,6 @@
1091
  0.5
1092
  ]
1093
  },
1094
- {
1095
- "model_name": "Qwen/QVQ-72B-Preview",
1096
- "judge_model_name": "gpt-4o-2024-08-06",
1097
- "u_math": [
1098
- 50.5455,
1099
- 59.3333,
1100
- 11.0
1101
- ],
1102
- "algebra": [
1103
- 0.7833,
1104
- 0.9267,
1105
- 0.0667
1106
- ],
1107
- "differential_calc": [
1108
- 0.3182,
1109
- 0.4467,
1110
- 0.0429
1111
- ],
1112
- "integral_calc": [
1113
- 0.1731,
1114
- 0.1933,
1115
- 0.1207
1116
- ],
1117
- "multivariable_calculus": [
1118
- 0.4888,
1119
- 0.5333,
1120
- 0.25
1121
- ],
1122
- "precalculus_review": [
1123
- 0.8688,
1124
- 0.9133,
1125
- 0.2
1126
- ],
1127
- "sequences_series": [
1128
- 0.539,
1129
- 0.5467,
1130
- 0.25
1131
- ]
1132
- },
1133
  {
1134
  "model_name": "google/gemini-1.5-flash",
1135
  "judge_model_name": "gpt-4o-2024-08-06",
@@ -1169,6 +1247,84 @@
1169
  0.5
1170
  ]
1171
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1172
  {
1173
  "model_name": "google/gemini-1.5-pro",
1174
  "judge_model_name": "gpt-4o-2024-08-06",
@@ -1246,5 +1402,122 @@
1246
  0.7867,
1247
  0.5
1248
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  }
1250
  ]
 
78
  ]
79
  },
80
  {
81
+ "model_name": "unsloth/Llama-3.2-11B-Vision-Instruct",
82
  "judge_model_name": "gpt-4o-2024-08-06",
83
  "u_math": [
84
  17.0,
 
311
  0.0
312
  ]
313
  },
314
+ {
315
+ "model_name": "liquid/lfm-7b",
316
+ "judge_model_name": "gpt-4o-2024-08-06",
317
+ "u_math": [
318
+ 21.0909,
319
+ 24.5556,
320
+ 5.5
321
+ ],
322
+ "algebra": [
323
+ 0.5722,
324
+ 0.68,
325
+ 0.0333
326
+ ],
327
+ "differential_calc": [
328
+ 0.0636,
329
+ 0.04,
330
+ 0.1143
331
+ ],
332
+ "integral_calc": [
333
+ 0.0096,
334
+ 0.0067,
335
+ 0.0172
336
+ ],
337
+ "multivariable_calculus": [
338
+ 0.0674,
339
+ 0.0733,
340
+ 0.0357
341
+ ],
342
+ "precalculus_review": [
343
+ 0.525,
344
+ 0.56,
345
+ 0.0
346
+ ],
347
+ "sequences_series": [
348
+ 0.1104,
349
+ 0.1133,
350
+ 0.0
351
+ ]
352
+ },
353
  {
354
  "model_name": "meta-llama/Llama-3.1-8B-Instruct",
355
  "judge_model_name": "gpt-4o-2024-08-06",
 
506
  0.25
507
  ]
508
  },
509
+ {
510
+ "model_name": "mistralai/Mistral-Small-24B-Instruct-2501",
511
+ "judge_model_name": "gpt-4o-2024-08-06",
512
+ "u_math": [
513
+ 29.4545,
514
+ 35.0,
515
+ 4.5
516
+ ],
517
+ "algebra": [
518
+ 0.6889,
519
+ 0.82,
520
+ 0.0333
521
+ ],
522
+ "differential_calc": [
523
+ 0.0545,
524
+ 0.06,
525
+ 0.0429
526
+ ],
527
+ "integral_calc": [
528
+ 0.0577,
529
+ 0.06,
530
+ 0.0517
531
+ ],
532
+ "multivariable_calculus": [
533
+ 0.1404,
534
+ 0.16,
535
+ 0.0357
536
+ ],
537
+ "precalculus_review": [
538
+ 0.6562,
539
+ 0.6933,
540
+ 0.1
541
+ ],
542
+ "sequences_series": [
543
+ 0.2987,
544
+ 0.3067,
545
+ 0.0
546
+ ]
547
+ },
548
  {
549
  "model_name": "Qwen/Qwen2-VL-72B-Instruct",
550
  "judge_model_name": "gpt-4o-2024-08-06",
 
1091
  0.0
1092
  ]
1093
  },
1094
+ {
1095
+ "model_name": "microsoft/phi-4",
1096
+ "judge_model_name": "gpt-4o-2024-08-06",
1097
+ "u_math": [
1098
+ 44.0909,
1099
+ 51.3333,
1100
+ 11.5
1101
+ ],
1102
+ "algebra": [
1103
+ 0.7556,
1104
+ 0.9067,
1105
+ 0.0
1106
+ ],
1107
+ "differential_calc": [
1108
+ 0.25,
1109
+ 0.3267,
1110
+ 0.0857
1111
+ ],
1112
+ "integral_calc": [
1113
+ 0.1202,
1114
+ 0.0933,
1115
+ 0.1897
1116
+ ],
1117
+ "multivariable_calculus": [
1118
+ 0.4045,
1119
+ 0.46,
1120
+ 0.1071
1121
+ ],
1122
+ "precalculus_review": [
1123
+ 0.8438,
1124
+ 0.8867,
1125
+ 0.2
1126
+ ],
1127
+ "sequences_series": [
1128
+ 0.4026,
1129
+ 0.4067,
1130
+ 0.25
1131
+ ]
1132
+ },
1133
  {
1134
  "model_name": "Nexusflow/Athene-V2-Chat",
1135
  "judge_model_name": "gpt-4o-2024-08-06",
 
1208
  0.5
1209
  ]
1210
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1211
  {
1212
  "model_name": "google/gemini-1.5-flash",
1213
  "judge_model_name": "gpt-4o-2024-08-06",
 
1247
  0.5
1248
  ]
1249
  },
1250
+ {
1251
+ "model_name": "deepseek-ai/DeepSeek-V3",
1252
+ "judge_model_name": "gpt-4o-2024-08-06",
1253
+ "u_math": [
1254
+ 51.9091,
1255
+ 60.4444,
1256
+ 13.5
1257
+ ],
1258
+ "algebra": [
1259
+ 0.8222,
1260
+ 0.98,
1261
+ 0.0333
1262
+ ],
1263
+ "differential_calc": [
1264
+ 0.2591,
1265
+ 0.3533,
1266
+ 0.0571
1267
+ ],
1268
+ "integral_calc": [
1269
+ 0.2163,
1270
+ 0.2067,
1271
+ 0.2414
1272
+ ],
1273
+ "multivariable_calculus": [
1274
+ 0.5112,
1275
+ 0.5733,
1276
+ 0.1786
1277
+ ],
1278
+ "precalculus_review": [
1279
+ 0.85,
1280
+ 0.9,
1281
+ 0.1
1282
+ ],
1283
+ "sequences_series": [
1284
+ 0.6104,
1285
+ 0.6133,
1286
+ 0.5
1287
+ ]
1288
+ },
1289
+ {
1290
+ "model_name": "Qwen/QVQ-72b-Preview",
1291
+ "judge_model_name": "gpt-4o-2024-08-06",
1292
+ "u_math": [
1293
+ 55.0909,
1294
+ 59.3333,
1295
+ 36.0
1296
+ ],
1297
+ "algebra": [
1298
+ 0.8278,
1299
+ 0.9267,
1300
+ 0.3333
1301
+ ],
1302
+ "differential_calc": [
1303
+ 0.4136,
1304
+ 0.4467,
1305
+ 0.3429
1306
+ ],
1307
+ "integral_calc": [
1308
+ 0.25,
1309
+ 0.1933,
1310
+ 0.3966
1311
+ ],
1312
+ "multivariable_calculus": [
1313
+ 0.5169,
1314
+ 0.5333,
1315
+ 0.4286
1316
+ ],
1317
+ "precalculus_review": [
1318
+ 0.875,
1319
+ 0.9133,
1320
+ 0.3
1321
+ ],
1322
+ "sequences_series": [
1323
+ 0.5325,
1324
+ 0.5467,
1325
+ 0.0
1326
+ ]
1327
+ },
1328
  {
1329
  "model_name": "google/gemini-1.5-pro",
1330
  "judge_model_name": "gpt-4o-2024-08-06",
 
1402
  0.7867,
1403
  0.5
1404
  ]
1405
+ },
1406
+ {
1407
+ "model_name": "deepseek-ai/DeepSeek-R1",
1408
+ "judge_model_name": "gpt-4o-2024-08-06",
1409
+ "u_math": [
1410
+ 63.2727,
1411
+ 73.6667,
1412
+ 16.5
1413
+ ],
1414
+ "algebra": [
1415
+ 0.8,
1416
+ 0.96,
1417
+ 0.0
1418
+ ],
1419
+ "differential_calc": [
1420
+ 0.4818,
1421
+ 0.6667,
1422
+ 0.0857
1423
+ ],
1424
+ "integral_calc": [
1425
+ 0.3606,
1426
+ 0.4,
1427
+ 0.2586
1428
+ ],
1429
+ "multivariable_calculus": [
1430
+ 0.6124,
1431
+ 0.6667,
1432
+ 0.3214
1433
+ ],
1434
+ "precalculus_review": [
1435
+ 0.9188,
1436
+ 0.9733,
1437
+ 0.1
1438
+ ],
1439
+ "sequences_series": [
1440
+ 0.7468,
1441
+ 0.7533,
1442
+ 0.5
1443
+ ]
1444
+ },
1445
+ {
1446
+ "model_name": "o1-mini",
1447
+ "judge_model_name": "gpt-4o-2024-08-06",
1448
+ "u_math": [
1449
+ 63.4545,
1450
+ 73.3333,
1451
+ 19.0
1452
+ ],
1453
+ "algebra": [
1454
+ 0.8,
1455
+ 0.96,
1456
+ 0.0
1457
+ ],
1458
+ "differential_calc": [
1459
+ 0.4864,
1460
+ 0.66,
1461
+ 0.1143
1462
+ ],
1463
+ "integral_calc": [
1464
+ 0.3798,
1465
+ 0.42,
1466
+ 0.2759
1467
+ ],
1468
+ "multivariable_calculus": [
1469
+ 0.6404,
1470
+ 0.6867,
1471
+ 0.3929
1472
+ ],
1473
+ "precalculus_review": [
1474
+ 0.8875,
1475
+ 0.9333,
1476
+ 0.2
1477
+ ],
1478
+ "sequences_series": [
1479
+ 0.7273,
1480
+ 0.74,
1481
+ 0.25
1482
+ ]
1483
+ },
1484
+ {
1485
+ "model_name": "o1",
1486
+ "judge_model_name": "gpt-4o-2024-08-06",
1487
+ "u_math": [
1488
+ 70.7031,
1489
+ 77.2059,
1490
+ 45.1923
1491
+ ],
1492
+ "algebra": [
1493
+ 0.3556,
1494
+ 0.3867,
1495
+ 0.2
1496
+ ],
1497
+ "differential_calc": [
1498
+ 0.2818,
1499
+ 0.3133,
1500
+ 0.2143
1501
+ ],
1502
+ "integral_calc": [
1503
+ 0.2692,
1504
+ 0.2733,
1505
+ 0.2586
1506
+ ],
1507
+ "multivariable_calculus": [
1508
+ 0.3202,
1509
+ 0.32,
1510
+ 0.3214
1511
+ ],
1512
+ "precalculus_review": [
1513
+ 0.4,
1514
+ 0.4133,
1515
+ 0.2
1516
+ ],
1517
+ "sequences_series": [
1518
+ 0.3831,
1519
+ 0.3933,
1520
+ 0.0
1521
+ ]
1522
  }
1523
  ]