[
{
"model_name": "google/gemini-1.5-flash",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.749,
0.748,
0.763,
0.633,
0.883,
0.862,
0.676
],
"GPT-4o": [
0.705,
0.701,
0.716,
0.579,
0.84,
0.794,
0.651
],
"Gemini-1.5-Pro": [
0.749,
0.739,
0.772,
0.677,
0.915,
0.948,
0.551
],
"Llama-3.1-70B-Instruct": [
0.83,
0.806,
0.811,
0.67,
0.92,
0.823,
0.833
],
"Qwen2.5-72B-Instruct": [
0.712,
0.712,
0.731,
0.606,
0.853,
0.847,
0.619
]
},
{
"model_name": "google/gemini-1.5-pro",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.807,
0.807,
0.809,
0.775,
0.845,
0.852,
0.764
],
"GPT-4o": [
0.782,
0.782,
0.783,
0.764,
0.802,
0.805,
0.761
],
"Gemini-1.5-Pro": [
0.815,
0.795,
0.802,
0.81,
0.829,
0.916,
0.654
],
"Llama-3.1-70B-Instruct": [
0.852,
0.836,
0.837,
0.753,
0.908,
0.82,
0.868
],
"Qwen2.5-72B-Instruct": [
0.779,
0.777,
0.78,
0.755,
0.81,
0.842,
0.712
]
},
{
"model_name": "gpt-4o-mini-2024-07-18",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.725,
0.723,
0.743,
0.59,
0.881,
0.851,
0.651
],
"GPT-4o": [
0.708,
0.704,
0.722,
0.564,
0.863,
0.814,
0.649
],
"Gemini-1.5-Pro": [
0.705,
0.696,
0.733,
0.63,
0.878,
0.922,
0.507
],
"Llama-3.1-70B-Instruct": [
0.793,
0.762,
0.768,
0.598,
0.902,
0.773,
0.801
],
"Qwen2.5-72B-Instruct": [
0.694,
0.693,
0.721,
0.561,
0.871,
0.853,
0.598
]
},
{
"model_name": "gpt-4o-2024-08-06",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.774,
0.774,
0.781,
0.701,
0.859,
0.851,
0.713
],
"GPT-4o": [
0.775,
0.775,
0.778,
0.721,
0.832,
0.821,
0.736
],
"Gemini-1.5-Pro": [
0.742,
0.726,
0.746,
0.704,
0.829,
0.905,
0.548
],
"Llama-3.1-70B-Instruct": [
0.838,
0.818,
0.82,
0.711,
0.908,
0.812,
0.849
],
"Qwen2.5-72B-Instruct": [
0.742,
0.742,
0.752,
0.671,
0.836,
0.846,
0.655
]
},
{
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.52,
0.52,
0.523,
0.487,
0.559,
0.56,
0.485
],
"GPT-4o": [
0.513,
0.512,
0.515,
0.464,
0.565,
0.533,
0.497
],
"Gemini-1.5-Pro": [
0.572,
0.555,
0.579,
0.55,
0.622,
0.77,
0.375
],
"Llama-3.1-70B-Instruct": [
0.509,
0.492,
0.497,
0.454,
0.54,
0.355,
0.639
],
"Qwen2.5-72B-Instruct": [
0.487,
0.487,
0.493,
0.452,
0.534,
0.565,
0.422
]
},
{
"model_name": "meta-llama/Llama-3.1-70B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.612,
0.61,
0.61,
0.625,
0.596,
0.641,
0.579
],
"GPT-4o": [
0.694,
0.694,
0.694,
0.671,
0.718,
0.718,
0.671
],
"Gemini-1.5-Pro": [
0.613,
0.588,
0.603,
0.614,
0.61,
0.784,
0.407
],
"Llama-3.1-70B-Instruct": [
0.576,
0.57,
0.586,
0.639,
0.54,
0.437,
0.729
],
"Qwen2.5-72B-Instruct": [
0.565,
0.56,
0.56,
0.587,
0.534,
0.628,
0.492
]
},
{
"model_name": "Qwen/Qwen2.5-7B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.699,
0.693,
0.697,
0.787,
0.598,
0.693,
0.708
],
"GPT-4o": [
0.69,
0.683,
0.693,
0.814,
0.557,
0.663,
0.737
],
"Gemini-1.5-Pro": [
0.734,
0.691,
0.692,
0.794,
0.598,
0.82,
0.557
],
"Llama-3.1-70B-Instruct": [
0.731,
0.723,
0.733,
0.784,
0.701,
0.594,
0.853
],
"Qwen2.5-72B-Instruct": [
0.642,
0.624,
0.628,
0.755,
0.491,
0.665,
0.6
]
},
{
"model_name": "Qwen/Qwen2.5-72B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.757,
0.756,
0.756,
0.771,
0.742,
0.775,
0.737
],
"GPT-4o": [
0.738,
0.737,
0.738,
0.764,
0.71,
0.738,
0.738
],
"Gemini-1.5-Pro": [
0.771,
0.742,
0.745,
0.794,
0.72,
0.867,
0.602
],
"Llama-3.1-70B-Instruct": [
0.808,
0.793,
0.793,
0.753,
0.839,
0.723,
0.859
],
"Qwen2.5-72B-Instruct": [
0.712,
0.705,
0.705,
0.761,
0.647,
0.742,
0.67
]
},
{
"model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.633,
0.619,
0.628,
0.766,
0.479,
0.629,
0.639
],
"GPT-4o": [
0.587,
0.572,
0.586,
0.75,
0.412,
0.577,
0.607
],
"Gemini-1.5-Pro": [
0.694,
0.638,
0.638,
0.778,
0.5,
0.782,
0.494
],
"Llama-3.1-70B-Instruct": [
0.638,
0.638,
0.684,
0.856,
0.517,
0.497,
0.865
],
"Qwen2.5-72B-Instruct": [
0.613,
0.597,
0.598,
0.71,
0.483,
0.647,
0.554
]
},
{
"model_name": "Qwen/Qwen2.5-Math-72B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.744,
0.74,
0.742,
0.809,
0.668,
0.738,
0.752
],
"GPT-4o": [
0.686,
0.682,
0.687,
0.779,
0.588,
0.669,
0.713
],
"Gemini-1.5-Pro": [
0.797,
0.768,
0.77,
0.825,
0.732,
0.876,
0.645
],
"Llama-3.1-70B-Instruct": [
0.782,
0.773,
0.779,
0.814,
0.764,
0.658,
0.881
],
"Qwen2.5-72B-Instruct": [
0.708,
0.693,
0.698,
0.813,
0.569,
0.716,
0.695
]
},
{
"model_name": "claude-sonnet-3-5",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.75,
0.748,
0.766,
0.625,
0.895,
0.873,
0.674
],
"GPT-4o": [
0.727,
0.722,
0.742,
0.579,
0.885,
0.844,
0.663
],
"Gemini-1.5-Pro": [
0.753,
0.738,
0.759,
0.709,
0.854,
0.918,
0.56
],
"Llama-3.1-70B-Instruct": [
0.812,
0.779,
0.79,
0.598,
0.931,
0.829,
0.806
],
"Qwen2.5-72B-Instruct": [
0.708,
0.708,
0.734,
0.581,
0.879,
0.865,
0.611
]
},
{
"model_name": "mistralai/Ministral-8B-Instruct-2410",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.605,
0.605,
0.609,
0.559,
0.658,
0.654,
0.564
],
"GPT-4o": [
0.631,
0.629,
0.637,
0.536,
0.733,
0.682,
0.596
],
"Gemini-1.5-Pro": [
0.613,
0.583,
0.594,
0.63,
0.573,
0.773,
0.402
],
"Llama-3.1-70B-Instruct": [
0.649,
0.631,
0.634,
0.598,
0.678,
0.509,
0.752
],
"Qwen2.5-72B-Instruct": [
0.528,
0.528,
0.537,
0.471,
0.603,
0.613,
0.461
]
},
{
"model_name": "mistralai/Mistral-Large-Instruct-2411",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.767,
0.766,
0.767,
0.757,
0.777,
0.797,
0.735
],
"GPT-4o": [
0.76,
0.76,
0.76,
0.757,
0.763,
0.774,
0.746
],
"Gemini-1.5-Pro": [
0.779,
0.75,
0.754,
0.799,
0.732,
0.873,
0.612
],
"Llama-3.1-70B-Instruct": [
0.801,
0.786,
0.786,
0.753,
0.828,
0.709,
0.857
],
"Qwen2.5-72B-Instruct": [
0.727,
0.725,
0.728,
0.71,
0.75,
0.791,
0.659
]
},
{
"model_name": "gemini-2.0-flash-thinking-exp-01-21",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.812,
0.81,
0.816,
0.891,
0.732,
0.769,
0.871
],
"GPT-4o": [
0.754,
0.743,
0.764,
0.917,
0.576,
0.702,
0.864
],
"Gemini-1.5-Pro": [
0.87,
0.858,
0.86,
0.93,
0.769,
0.87,
0.87
],
"Llama-3.1-70B-Instruct": [
0.855,
0.833,
0.834,
0.81,
0.875,
0.739,
0.913
],
"Qwen2.5-72B-Instruct": [
0.768,
0.76,
0.766,
0.868,
0.645,
0.75,
0.8
]
},
{
"model_name": "o1",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.895,
0.895,
0.895,
0.906,
0.884,
0.887,
0.904
],
"GPT-4o": [
0.884,
0.884,
0.884,
0.889,
0.879,
0.889,
0.879
],
"Gemini-1.5-Pro": [
0.913,
0.906,
0.907,
0.953,
0.846,
0.911,
0.917
],
"Llama-3.1-70B-Instruct": [
0.942,
0.932,
0.932,
0.905,
0.958,
0.905,
0.958
],
"Qwen2.5-72B-Instruct": [
0.841,
0.838,
0.839,
0.868,
0.806,
0.846,
0.833
]
},
{
"model_name": "Qwen/QwQ-32B-Preview",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.833,
0.832,
0.838,
0.913,
0.754,
0.787,
0.897
],
"GPT-4o": [
0.783,
0.78,
0.784,
0.861,
0.697,
0.756,
0.821
],
"Gemini-1.5-Pro": [
0.826,
0.8,
0.814,
0.953,
0.615,
0.804,
0.889
],
"Llama-3.1-70B-Instruct": [
0.855,
0.84,
0.848,
0.905,
0.833,
0.704,
0.952
],
"Qwen2.5-72B-Instruct": [
0.87,
0.867,
0.868,
0.921,
0.806,
0.854,
0.893
]
},
{
"model_name": "deepseek-ai/DeepSeek-R1",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.822,
0.822,
0.824,
0.768,
0.877,
0.862,
0.791
],
"GPT-4o": [
0.797,
0.797,
0.803,
0.722,
0.879,
0.867,
0.744
],
"Gemini-1.5-Pro": [
0.826,
0.82,
0.823,
0.814,
0.846,
0.897,
0.733
],
"Llama-3.1-70B-Instruct": [
0.899,
0.882,
0.882,
0.857,
0.917,
0.818,
0.936
],
"Qwen2.5-72B-Instruct": [
0.768,
0.768,
0.774,
0.711,
0.839,
0.844,
0.703
]
},
{
"model_name": "o1-mini",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.848,
0.848,
0.848,
0.833,
0.862,
0.858,
0.838
],
"GPT-4o": [
0.812,
0.812,
0.813,
0.778,
0.848,
0.848,
0.778
],
"Gemini-1.5-Pro": [
0.87,
0.862,
0.862,
0.884,
0.846,
0.905,
0.815
],
"Llama-3.1-70B-Instruct": [
0.913,
0.897,
0.897,
0.857,
0.938,
0.857,
0.938
],
"Qwen2.5-72B-Instruct": [
0.797,
0.795,
0.795,
0.816,
0.774,
0.816,
0.774
]
}
]