cogwheelhead commited on
Commit
1589444
·
verified ·
1 Parent(s): 51510cb

data: commit mu-math numbers

Browse files
Files changed (1) hide show
  1. data/mu_math_eval_results.json +883 -19
data/mu_math_eval_results.json CHANGED
@@ -1,20 +1,884 @@
1
  [
2
- {
3
- "model_name": "mistralai/Ministral-8B-Instruct-2410",
4
- "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
5
- "mu_math": [0.664, 0.33, 0.651, 0.68, 0.701, 0.628, 0.574],
6
- "GPT-4o": [0.664, 0.332, 0.621, 0.71, 0.696, 0.637, 0.574],
7
- "Gemini-1.5-Pro": [0.672, 0.279, 0.709, 0.585, 0.798, 0.466, 0.574],
8
- "Llama-3.1-70B-Instruct": [0.675, 0.317, 0.619, 0.707, 0.541, 0.769, 0.574],
9
- "Qwen2.5-72B-Instruct": [0.646, 0.295, 0.626, 0.672, 0.719, 0.574, 0.574]
10
- },
11
- {
12
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
13
- "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
14
- "mu_math": [0.741, 0.496, 0.666, 0.827, 0.816, 0.682, 0.574],
15
- "GPT-4o": [0.731, 0.475, 0.636, 0.832, 0.802, 0.681, 0.574],
16
- "Gemini-1.5-Pro": [0.705, 0.394, 0.693, 0.732, 0.856, 0.508, 0.574],
17
- "Llama-3.1-70B-Instruct": [0.823, 0.605, 0.67, 0.908, 0.802, 0.832, 0.574],
18
- "Qwen2.5-72B-Instruct": [0.705, 0.421, 0.658, 0.767, 0.791, 0.627, 0.574]
19
- }
20
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  [
2
+ {
3
+ "model_name": "google/gemini-1.5-flash",
4
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
5
+ "mu_math": [
6
+ 0.749,
7
+ 0.748,
8
+ 0.763,
9
+ 0.633,
10
+ 0.883,
11
+ 0.862,
12
+ 0.676
13
+ ],
14
+ "GPT-4o": [
15
+ 0.705,
16
+ 0.701,
17
+ 0.716,
18
+ 0.579,
19
+ 0.84,
20
+ 0.794,
21
+ 0.651
22
+ ],
23
+ "Gemini-1.5-Pro": [
24
+ 0.749,
25
+ 0.739,
26
+ 0.772,
27
+ 0.677,
28
+ 0.915,
29
+ 0.948,
30
+ 0.551
31
+ ],
32
+ "Llama-3.1-70B-Instruct": [
33
+ 0.83,
34
+ 0.806,
35
+ 0.811,
36
+ 0.67,
37
+ 0.92,
38
+ 0.823,
39
+ 0.833
40
+ ],
41
+ "Qwen2.5-72B-Instruct": [
42
+ 0.712,
43
+ 0.712,
44
+ 0.731,
45
+ 0.606,
46
+ 0.853,
47
+ 0.847,
48
+ 0.619
49
+ ]
50
+ },
51
+ {
52
+ "model_name": "google/gemini-1.5-pro",
53
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
54
+ "mu_math": [
55
+ 0.807,
56
+ 0.807,
57
+ 0.809,
58
+ 0.775,
59
+ 0.845,
60
+ 0.852,
61
+ 0.764
62
+ ],
63
+ "GPT-4o": [
64
+ 0.782,
65
+ 0.782,
66
+ 0.783,
67
+ 0.764,
68
+ 0.802,
69
+ 0.805,
70
+ 0.761
71
+ ],
72
+ "Gemini-1.5-Pro": [
73
+ 0.815,
74
+ 0.795,
75
+ 0.802,
76
+ 0.81,
77
+ 0.829,
78
+ 0.916,
79
+ 0.654
80
+ ],
81
+ "Llama-3.1-70B-Instruct": [
82
+ 0.852,
83
+ 0.836,
84
+ 0.837,
85
+ 0.753,
86
+ 0.908,
87
+ 0.82,
88
+ 0.868
89
+ ],
90
+ "Qwen2.5-72B-Instruct": [
91
+ 0.779,
92
+ 0.777,
93
+ 0.78,
94
+ 0.755,
95
+ 0.81,
96
+ 0.842,
97
+ 0.712
98
+ ]
99
+ },
100
+ {
101
+ "model_name": "gpt-4o-mini-2024-07-18",
102
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
103
+ "mu_math": [
104
+ 0.725,
105
+ 0.723,
106
+ 0.743,
107
+ 0.59,
108
+ 0.881,
109
+ 0.851,
110
+ 0.651
111
+ ],
112
+ "GPT-4o": [
113
+ 0.708,
114
+ 0.704,
115
+ 0.722,
116
+ 0.564,
117
+ 0.863,
118
+ 0.814,
119
+ 0.649
120
+ ],
121
+ "Gemini-1.5-Pro": [
122
+ 0.705,
123
+ 0.696,
124
+ 0.733,
125
+ 0.63,
126
+ 0.878,
127
+ 0.922,
128
+ 0.507
129
+ ],
130
+ "Llama-3.1-70B-Instruct": [
131
+ 0.793,
132
+ 0.762,
133
+ 0.768,
134
+ 0.598,
135
+ 0.902,
136
+ 0.773,
137
+ 0.801
138
+ ],
139
+ "Qwen2.5-72B-Instruct": [
140
+ 0.694,
141
+ 0.693,
142
+ 0.721,
143
+ 0.561,
144
+ 0.871,
145
+ 0.853,
146
+ 0.598
147
+ ]
148
+ },
149
+ {
150
+ "model_name": "gpt-4o-2024-08-06",
151
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
152
+ "mu_math": [
153
+ 0.774,
154
+ 0.774,
155
+ 0.781,
156
+ 0.701,
157
+ 0.859,
158
+ 0.851,
159
+ 0.713
160
+ ],
161
+ "GPT-4o": [
162
+ 0.775,
163
+ 0.775,
164
+ 0.778,
165
+ 0.721,
166
+ 0.832,
167
+ 0.821,
168
+ 0.736
169
+ ],
170
+ "Gemini-1.5-Pro": [
171
+ 0.742,
172
+ 0.726,
173
+ 0.746,
174
+ 0.704,
175
+ 0.829,
176
+ 0.905,
177
+ 0.548
178
+ ],
179
+ "Llama-3.1-70B-Instruct": [
180
+ 0.838,
181
+ 0.818,
182
+ 0.82,
183
+ 0.711,
184
+ 0.908,
185
+ 0.812,
186
+ 0.849
187
+ ],
188
+ "Qwen2.5-72B-Instruct": [
189
+ 0.742,
190
+ 0.742,
191
+ 0.752,
192
+ 0.671,
193
+ 0.836,
194
+ 0.846,
195
+ 0.655
196
+ ]
197
+ },
198
+ {
199
+ "model_name": "meta-llama/Llama-3.1-8B-Instruct",
200
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
201
+ "mu_math": [
202
+ 0.52,
203
+ 0.52,
204
+ 0.523,
205
+ 0.487,
206
+ 0.559,
207
+ 0.56,
208
+ 0.485
209
+ ],
210
+ "GPT-4o": [
211
+ 0.513,
212
+ 0.512,
213
+ 0.515,
214
+ 0.464,
215
+ 0.565,
216
+ 0.533,
217
+ 0.497
218
+ ],
219
+ "Gemini-1.5-Pro": [
220
+ 0.572,
221
+ 0.555,
222
+ 0.579,
223
+ 0.55,
224
+ 0.622,
225
+ 0.77,
226
+ 0.375
227
+ ],
228
+ "Llama-3.1-70B-Instruct": [
229
+ 0.509,
230
+ 0.492,
231
+ 0.497,
232
+ 0.454,
233
+ 0.54,
234
+ 0.355,
235
+ 0.639
236
+ ],
237
+ "Qwen2.5-72B-Instruct": [
238
+ 0.487,
239
+ 0.487,
240
+ 0.493,
241
+ 0.452,
242
+ 0.534,
243
+ 0.565,
244
+ 0.422
245
+ ]
246
+ },
247
+ {
248
+ "model_name": "meta-llama/Llama-3.1-70B-Instruct",
249
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
250
+ "mu_math": [
251
+ 0.612,
252
+ 0.61,
253
+ 0.61,
254
+ 0.625,
255
+ 0.596,
256
+ 0.641,
257
+ 0.579
258
+ ],
259
+ "GPT-4o": [
260
+ 0.694,
261
+ 0.694,
262
+ 0.694,
263
+ 0.671,
264
+ 0.718,
265
+ 0.718,
266
+ 0.671
267
+ ],
268
+ "Gemini-1.5-Pro": [
269
+ 0.613,
270
+ 0.588,
271
+ 0.603,
272
+ 0.614,
273
+ 0.61,
274
+ 0.784,
275
+ 0.407
276
+ ],
277
+ "Llama-3.1-70B-Instruct": [
278
+ 0.576,
279
+ 0.57,
280
+ 0.586,
281
+ 0.639,
282
+ 0.54,
283
+ 0.437,
284
+ 0.729
285
+ ],
286
+ "Qwen2.5-72B-Instruct": [
287
+ 0.565,
288
+ 0.56,
289
+ 0.56,
290
+ 0.587,
291
+ 0.534,
292
+ 0.628,
293
+ 0.492
294
+ ]
295
+ },
296
+ {
297
+ "model_name": "Qwen/Qwen2.5-7B-Instruct",
298
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
299
+ "mu_math": [
300
+ 0.699,
301
+ 0.693,
302
+ 0.697,
303
+ 0.787,
304
+ 0.598,
305
+ 0.693,
306
+ 0.708
307
+ ],
308
+ "GPT-4o": [
309
+ 0.69,
310
+ 0.683,
311
+ 0.693,
312
+ 0.814,
313
+ 0.557,
314
+ 0.663,
315
+ 0.737
316
+ ],
317
+ "Gemini-1.5-Pro": [
318
+ 0.734,
319
+ 0.691,
320
+ 0.692,
321
+ 0.794,
322
+ 0.598,
323
+ 0.82,
324
+ 0.557
325
+ ],
326
+ "Llama-3.1-70B-Instruct": [
327
+ 0.731,
328
+ 0.723,
329
+ 0.733,
330
+ 0.784,
331
+ 0.701,
332
+ 0.594,
333
+ 0.853
334
+ ],
335
+ "Qwen2.5-72B-Instruct": [
336
+ 0.642,
337
+ 0.624,
338
+ 0.628,
339
+ 0.755,
340
+ 0.491,
341
+ 0.665,
342
+ 0.6
343
+ ]
344
+ },
345
+ {
346
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
347
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
348
+ "mu_math": [
349
+ 0.757,
350
+ 0.756,
351
+ 0.756,
352
+ 0.771,
353
+ 0.742,
354
+ 0.775,
355
+ 0.737
356
+ ],
357
+ "GPT-4o": [
358
+ 0.738,
359
+ 0.737,
360
+ 0.738,
361
+ 0.764,
362
+ 0.71,
363
+ 0.738,
364
+ 0.738
365
+ ],
366
+ "Gemini-1.5-Pro": [
367
+ 0.771,
368
+ 0.742,
369
+ 0.745,
370
+ 0.794,
371
+ 0.72,
372
+ 0.867,
373
+ 0.602
374
+ ],
375
+ "Llama-3.1-70B-Instruct": [
376
+ 0.808,
377
+ 0.793,
378
+ 0.793,
379
+ 0.753,
380
+ 0.839,
381
+ 0.723,
382
+ 0.859
383
+ ],
384
+ "Qwen2.5-72B-Instruct": [
385
+ 0.712,
386
+ 0.705,
387
+ 0.705,
388
+ 0.761,
389
+ 0.647,
390
+ 0.742,
391
+ 0.67
392
+ ]
393
+ },
394
+ {
395
+ "model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
396
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
397
+ "mu_math": [
398
+ 0.633,
399
+ 0.619,
400
+ 0.628,
401
+ 0.766,
402
+ 0.479,
403
+ 0.629,
404
+ 0.639
405
+ ],
406
+ "GPT-4o": [
407
+ 0.587,
408
+ 0.572,
409
+ 0.586,
410
+ 0.75,
411
+ 0.412,
412
+ 0.577,
413
+ 0.607
414
+ ],
415
+ "Gemini-1.5-Pro": [
416
+ 0.694,
417
+ 0.638,
418
+ 0.638,
419
+ 0.778,
420
+ 0.5,
421
+ 0.782,
422
+ 0.494
423
+ ],
424
+ "Llama-3.1-70B-Instruct": [
425
+ 0.638,
426
+ 0.638,
427
+ 0.684,
428
+ 0.856,
429
+ 0.517,
430
+ 0.497,
431
+ 0.865
432
+ ],
433
+ "Qwen2.5-72B-Instruct": [
434
+ 0.613,
435
+ 0.597,
436
+ 0.598,
437
+ 0.71,
438
+ 0.483,
439
+ 0.647,
440
+ 0.554
441
+ ]
442
+ },
443
+ {
444
+ "model_name": "Qwen/Qwen2.5-Math-72B-Instruct",
445
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
446
+ "mu_math": [
447
+ 0.744,
448
+ 0.74,
449
+ 0.742,
450
+ 0.809,
451
+ 0.668,
452
+ 0.738,
453
+ 0.752
454
+ ],
455
+ "GPT-4o": [
456
+ 0.686,
457
+ 0.682,
458
+ 0.687,
459
+ 0.779,
460
+ 0.588,
461
+ 0.669,
462
+ 0.713
463
+ ],
464
+ "Gemini-1.5-Pro": [
465
+ 0.797,
466
+ 0.768,
467
+ 0.77,
468
+ 0.825,
469
+ 0.732,
470
+ 0.876,
471
+ 0.645
472
+ ],
473
+ "Llama-3.1-70B-Instruct": [
474
+ 0.782,
475
+ 0.773,
476
+ 0.779,
477
+ 0.814,
478
+ 0.764,
479
+ 0.658,
480
+ 0.881
481
+ ],
482
+ "Qwen2.5-72B-Instruct": [
483
+ 0.708,
484
+ 0.693,
485
+ 0.698,
486
+ 0.813,
487
+ 0.569,
488
+ 0.716,
489
+ 0.695
490
+ ]
491
+ },
492
+ {
493
+ "model_name": "claude-sonnet-3-5",
494
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
495
+ "mu_math": [
496
+ 0.75,
497
+ 0.748,
498
+ 0.766,
499
+ 0.625,
500
+ 0.895,
501
+ 0.873,
502
+ 0.674
503
+ ],
504
+ "GPT-4o": [
505
+ 0.727,
506
+ 0.722,
507
+ 0.742,
508
+ 0.579,
509
+ 0.885,
510
+ 0.844,
511
+ 0.663
512
+ ],
513
+ "Gemini-1.5-Pro": [
514
+ 0.753,
515
+ 0.738,
516
+ 0.759,
517
+ 0.709,
518
+ 0.854,
519
+ 0.918,
520
+ 0.56
521
+ ],
522
+ "Llama-3.1-70B-Instruct": [
523
+ 0.812,
524
+ 0.779,
525
+ 0.79,
526
+ 0.598,
527
+ 0.931,
528
+ 0.829,
529
+ 0.806
530
+ ],
531
+ "Qwen2.5-72B-Instruct": [
532
+ 0.708,
533
+ 0.708,
534
+ 0.734,
535
+ 0.581,
536
+ 0.879,
537
+ 0.865,
538
+ 0.611
539
+ ]
540
+ },
541
+ {
542
+ "model_name": "mistralai/Ministral-8B-Instruct-2410",
543
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
544
+ "mu_math": [
545
+ 0.605,
546
+ 0.605,
547
+ 0.609,
548
+ 0.559,
549
+ 0.658,
550
+ 0.654,
551
+ 0.564
552
+ ],
553
+ "GPT-4o": [
554
+ 0.631,
555
+ 0.629,
556
+ 0.637,
557
+ 0.536,
558
+ 0.733,
559
+ 0.682,
560
+ 0.596
561
+ ],
562
+ "Gemini-1.5-Pro": [
563
+ 0.613,
564
+ 0.583,
565
+ 0.594,
566
+ 0.63,
567
+ 0.573,
568
+ 0.773,
569
+ 0.402
570
+ ],
571
+ "Llama-3.1-70B-Instruct": [
572
+ 0.649,
573
+ 0.631,
574
+ 0.634,
575
+ 0.598,
576
+ 0.678,
577
+ 0.509,
578
+ 0.752
579
+ ],
580
+ "Qwen2.5-72B-Instruct": [
581
+ 0.528,
582
+ 0.528,
583
+ 0.537,
584
+ 0.471,
585
+ 0.603,
586
+ 0.613,
587
+ 0.461
588
+ ]
589
+ },
590
+ {
591
+ "model_name": "mistralai/Mistral-Large-Instruct-2411",
592
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
593
+ "mu_math": [
594
+ 0.767,
595
+ 0.766,
596
+ 0.767,
597
+ 0.757,
598
+ 0.777,
599
+ 0.797,
600
+ 0.735
601
+ ],
602
+ "GPT-4o": [
603
+ 0.76,
604
+ 0.76,
605
+ 0.76,
606
+ 0.757,
607
+ 0.763,
608
+ 0.774,
609
+ 0.746
610
+ ],
611
+ "Gemini-1.5-Pro": [
612
+ 0.779,
613
+ 0.75,
614
+ 0.754,
615
+ 0.799,
616
+ 0.732,
617
+ 0.873,
618
+ 0.612
619
+ ],
620
+ "Llama-3.1-70B-Instruct": [
621
+ 0.801,
622
+ 0.786,
623
+ 0.786,
624
+ 0.753,
625
+ 0.828,
626
+ 0.709,
627
+ 0.857
628
+ ],
629
+ "Qwen2.5-72B-Instruct": [
630
+ 0.727,
631
+ 0.725,
632
+ 0.728,
633
+ 0.71,
634
+ 0.75,
635
+ 0.791,
636
+ 0.659
637
+ ]
638
+ },
639
+ {
640
+ "model_name": "gemini-2.0-flash-thinking-exp-01-21",
641
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
642
+ "mu_math": [
643
+ 0.812,
644
+ 0.81,
645
+ 0.816,
646
+ 0.891,
647
+ 0.732,
648
+ 0.769,
649
+ 0.871
650
+ ],
651
+ "GPT-4o": [
652
+ 0.754,
653
+ 0.743,
654
+ 0.764,
655
+ 0.917,
656
+ 0.576,
657
+ 0.702,
658
+ 0.864
659
+ ],
660
+ "Gemini-1.5-Pro": [
661
+ 0.87,
662
+ 0.858,
663
+ 0.86,
664
+ 0.93,
665
+ 0.769,
666
+ 0.87,
667
+ 0.87
668
+ ],
669
+ "Llama-3.1-70B-Instruct": [
670
+ 0.855,
671
+ 0.833,
672
+ 0.834,
673
+ 0.81,
674
+ 0.875,
675
+ 0.739,
676
+ 0.913
677
+ ],
678
+ "Qwen2.5-72B-Instruct": [
679
+ 0.768,
680
+ 0.76,
681
+ 0.766,
682
+ 0.868,
683
+ 0.645,
684
+ 0.75,
685
+ 0.8
686
+ ]
687
+ },
688
+ {
689
+ "model_name": "o1",
690
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
691
+ "mu_math": [
692
+ 0.895,
693
+ 0.895,
694
+ 0.895,
695
+ 0.906,
696
+ 0.884,
697
+ 0.887,
698
+ 0.904
699
+ ],
700
+ "GPT-4o": [
701
+ 0.884,
702
+ 0.884,
703
+ 0.884,
704
+ 0.889,
705
+ 0.879,
706
+ 0.889,
707
+ 0.879
708
+ ],
709
+ "Gemini-1.5-Pro": [
710
+ 0.913,
711
+ 0.906,
712
+ 0.907,
713
+ 0.953,
714
+ 0.846,
715
+ 0.911,
716
+ 0.917
717
+ ],
718
+ "Llama-3.1-70B-Instruct": [
719
+ 0.942,
720
+ 0.932,
721
+ 0.932,
722
+ 0.905,
723
+ 0.958,
724
+ 0.905,
725
+ 0.958
726
+ ],
727
+ "Qwen2.5-72B-Instruct": [
728
+ 0.841,
729
+ 0.838,
730
+ 0.839,
731
+ 0.868,
732
+ 0.806,
733
+ 0.846,
734
+ 0.833
735
+ ]
736
+ },
737
+ {
738
+ "model_name": "Qwen/QwQ-32B-Preview",
739
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
740
+ "mu_math": [
741
+ 0.833,
742
+ 0.832,
743
+ 0.838,
744
+ 0.913,
745
+ 0.754,
746
+ 0.787,
747
+ 0.897
748
+ ],
749
+ "GPT-4o": [
750
+ 0.783,
751
+ 0.78,
752
+ 0.784,
753
+ 0.861,
754
+ 0.697,
755
+ 0.756,
756
+ 0.821
757
+ ],
758
+ "Gemini-1.5-Pro": [
759
+ 0.826,
760
+ 0.8,
761
+ 0.814,
762
+ 0.953,
763
+ 0.615,
764
+ 0.804,
765
+ 0.889
766
+ ],
767
+ "Llama-3.1-70B-Instruct": [
768
+ 0.855,
769
+ 0.84,
770
+ 0.848,
771
+ 0.905,
772
+ 0.833,
773
+ 0.704,
774
+ 0.952
775
+ ],
776
+ "Qwen2.5-72B-Instruct": [
777
+ 0.87,
778
+ 0.867,
779
+ 0.868,
780
+ 0.921,
781
+ 0.806,
782
+ 0.854,
783
+ 0.893
784
+ ]
785
+ },
786
+ {
787
+ "model_name": "deepseek-ai/DeepSeek-R1",
788
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
789
+ "mu_math": [
790
+ 0.822,
791
+ 0.822,
792
+ 0.824,
793
+ 0.768,
794
+ 0.877,
795
+ 0.862,
796
+ 0.791
797
+ ],
798
+ "GPT-4o": [
799
+ 0.797,
800
+ 0.797,
801
+ 0.803,
802
+ 0.722,
803
+ 0.879,
804
+ 0.867,
805
+ 0.744
806
+ ],
807
+ "Gemini-1.5-Pro": [
808
+ 0.826,
809
+ 0.82,
810
+ 0.823,
811
+ 0.814,
812
+ 0.846,
813
+ 0.897,
814
+ 0.733
815
+ ],
816
+ "Llama-3.1-70B-Instruct": [
817
+ 0.899,
818
+ 0.882,
819
+ 0.882,
820
+ 0.857,
821
+ 0.917,
822
+ 0.818,
823
+ 0.936
824
+ ],
825
+ "Qwen2.5-72B-Instruct": [
826
+ 0.768,
827
+ 0.768,
828
+ 0.774,
829
+ 0.711,
830
+ 0.839,
831
+ 0.844,
832
+ 0.703
833
+ ]
834
+ },
835
+ {
836
+ "model_name": "o1-mini",
837
+ "extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
838
+ "mu_math": [
839
+ 0.848,
840
+ 0.848,
841
+ 0.848,
842
+ 0.833,
843
+ 0.862,
844
+ 0.858,
845
+ 0.838
846
+ ],
847
+ "GPT-4o": [
848
+ 0.812,
849
+ 0.812,
850
+ 0.813,
851
+ 0.778,
852
+ 0.848,
853
+ 0.848,
854
+ 0.778
855
+ ],
856
+ "Gemini-1.5-Pro": [
857
+ 0.87,
858
+ 0.862,
859
+ 0.862,
860
+ 0.884,
861
+ 0.846,
862
+ 0.905,
863
+ 0.815
864
+ ],
865
+ "Llama-3.1-70B-Instruct": [
866
+ 0.913,
867
+ 0.897,
868
+ 0.897,
869
+ 0.857,
870
+ 0.938,
871
+ 0.857,
872
+ 0.938
873
+ ],
874
+ "Qwen2.5-72B-Instruct": [
875
+ 0.797,
876
+ 0.795,
877
+ 0.795,
878
+ 0.816,
879
+ 0.774,
880
+ 0.816,
881
+ 0.774
882
+ ]
883
+ }
884
+ ]