Spaces:
Running
Running
[ | |
{ | |
"model_name": "google/gemini-1.5-flash", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.749, | |
0.748, | |
0.763, | |
0.633, | |
0.883, | |
0.862, | |
0.676 | |
], | |
"GPT-4o": [ | |
0.705, | |
0.701, | |
0.716, | |
0.579, | |
0.84, | |
0.794, | |
0.651 | |
], | |
"Gemini-1.5-Pro": [ | |
0.749, | |
0.739, | |
0.772, | |
0.677, | |
0.915, | |
0.948, | |
0.551 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.83, | |
0.806, | |
0.811, | |
0.67, | |
0.92, | |
0.823, | |
0.833 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.712, | |
0.712, | |
0.731, | |
0.606, | |
0.853, | |
0.847, | |
0.619 | |
] | |
}, | |
{ | |
"model_name": "google/gemini-1.5-pro", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.807, | |
0.807, | |
0.809, | |
0.775, | |
0.845, | |
0.852, | |
0.764 | |
], | |
"GPT-4o": [ | |
0.782, | |
0.782, | |
0.783, | |
0.764, | |
0.802, | |
0.805, | |
0.761 | |
], | |
"Gemini-1.5-Pro": [ | |
0.815, | |
0.795, | |
0.802, | |
0.81, | |
0.829, | |
0.916, | |
0.654 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.852, | |
0.836, | |
0.837, | |
0.753, | |
0.908, | |
0.82, | |
0.868 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.779, | |
0.777, | |
0.78, | |
0.755, | |
0.81, | |
0.842, | |
0.712 | |
] | |
}, | |
{ | |
"model_name": "gpt-4o-mini-2024-07-18", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.725, | |
0.723, | |
0.743, | |
0.59, | |
0.881, | |
0.851, | |
0.651 | |
], | |
"GPT-4o": [ | |
0.708, | |
0.704, | |
0.722, | |
0.564, | |
0.863, | |
0.814, | |
0.649 | |
], | |
"Gemini-1.5-Pro": [ | |
0.705, | |
0.696, | |
0.733, | |
0.63, | |
0.878, | |
0.922, | |
0.507 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.793, | |
0.762, | |
0.768, | |
0.598, | |
0.902, | |
0.773, | |
0.801 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.694, | |
0.693, | |
0.721, | |
0.561, | |
0.871, | |
0.853, | |
0.598 | |
] | |
}, | |
{ | |
"model_name": "gpt-4o-2024-08-06", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.774, | |
0.774, | |
0.781, | |
0.701, | |
0.859, | |
0.851, | |
0.713 | |
], | |
"GPT-4o": [ | |
0.775, | |
0.775, | |
0.778, | |
0.721, | |
0.832, | |
0.821, | |
0.736 | |
], | |
"Gemini-1.5-Pro": [ | |
0.742, | |
0.726, | |
0.746, | |
0.704, | |
0.829, | |
0.905, | |
0.548 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.838, | |
0.818, | |
0.82, | |
0.711, | |
0.908, | |
0.812, | |
0.849 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.742, | |
0.742, | |
0.752, | |
0.671, | |
0.836, | |
0.846, | |
0.655 | |
] | |
}, | |
{ | |
"model_name": "meta-llama/Llama-3.1-8B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.52, | |
0.52, | |
0.523, | |
0.487, | |
0.559, | |
0.56, | |
0.485 | |
], | |
"GPT-4o": [ | |
0.513, | |
0.512, | |
0.515, | |
0.464, | |
0.565, | |
0.533, | |
0.497 | |
], | |
"Gemini-1.5-Pro": [ | |
0.572, | |
0.555, | |
0.579, | |
0.55, | |
0.622, | |
0.77, | |
0.375 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.509, | |
0.492, | |
0.497, | |
0.454, | |
0.54, | |
0.355, | |
0.639 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.487, | |
0.487, | |
0.493, | |
0.452, | |
0.534, | |
0.565, | |
0.422 | |
] | |
}, | |
{ | |
"model_name": "meta-llama/Llama-3.1-70B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.612, | |
0.61, | |
0.61, | |
0.625, | |
0.596, | |
0.641, | |
0.579 | |
], | |
"GPT-4o": [ | |
0.694, | |
0.694, | |
0.694, | |
0.671, | |
0.718, | |
0.718, | |
0.671 | |
], | |
"Gemini-1.5-Pro": [ | |
0.613, | |
0.588, | |
0.603, | |
0.614, | |
0.61, | |
0.784, | |
0.407 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.576, | |
0.57, | |
0.586, | |
0.639, | |
0.54, | |
0.437, | |
0.729 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.565, | |
0.56, | |
0.56, | |
0.587, | |
0.534, | |
0.628, | |
0.492 | |
] | |
}, | |
{ | |
"model_name": "Qwen/Qwen2.5-7B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.699, | |
0.693, | |
0.697, | |
0.787, | |
0.598, | |
0.693, | |
0.708 | |
], | |
"GPT-4o": [ | |
0.69, | |
0.683, | |
0.693, | |
0.814, | |
0.557, | |
0.663, | |
0.737 | |
], | |
"Gemini-1.5-Pro": [ | |
0.734, | |
0.691, | |
0.692, | |
0.794, | |
0.598, | |
0.82, | |
0.557 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.731, | |
0.723, | |
0.733, | |
0.784, | |
0.701, | |
0.594, | |
0.853 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.642, | |
0.624, | |
0.628, | |
0.755, | |
0.491, | |
0.665, | |
0.6 | |
] | |
}, | |
{ | |
"model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.757, | |
0.756, | |
0.756, | |
0.771, | |
0.742, | |
0.775, | |
0.737 | |
], | |
"GPT-4o": [ | |
0.738, | |
0.737, | |
0.738, | |
0.764, | |
0.71, | |
0.738, | |
0.738 | |
], | |
"Gemini-1.5-Pro": [ | |
0.771, | |
0.742, | |
0.745, | |
0.794, | |
0.72, | |
0.867, | |
0.602 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.808, | |
0.793, | |
0.793, | |
0.753, | |
0.839, | |
0.723, | |
0.859 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.712, | |
0.705, | |
0.705, | |
0.761, | |
0.647, | |
0.742, | |
0.67 | |
] | |
}, | |
{ | |
"model_name": "Qwen/Qwen2.5-Math-7B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.633, | |
0.619, | |
0.628, | |
0.766, | |
0.479, | |
0.629, | |
0.639 | |
], | |
"GPT-4o": [ | |
0.587, | |
0.572, | |
0.586, | |
0.75, | |
0.412, | |
0.577, | |
0.607 | |
], | |
"Gemini-1.5-Pro": [ | |
0.694, | |
0.638, | |
0.638, | |
0.778, | |
0.5, | |
0.782, | |
0.494 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.638, | |
0.638, | |
0.684, | |
0.856, | |
0.517, | |
0.497, | |
0.865 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.613, | |
0.597, | |
0.598, | |
0.71, | |
0.483, | |
0.647, | |
0.554 | |
] | |
}, | |
{ | |
"model_name": "Qwen/Qwen2.5-Math-72B-Instruct", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.744, | |
0.74, | |
0.742, | |
0.809, | |
0.668, | |
0.738, | |
0.752 | |
], | |
"GPT-4o": [ | |
0.686, | |
0.682, | |
0.687, | |
0.779, | |
0.588, | |
0.669, | |
0.713 | |
], | |
"Gemini-1.5-Pro": [ | |
0.797, | |
0.768, | |
0.77, | |
0.825, | |
0.732, | |
0.876, | |
0.645 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.782, | |
0.773, | |
0.779, | |
0.814, | |
0.764, | |
0.658, | |
0.881 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.708, | |
0.693, | |
0.698, | |
0.813, | |
0.569, | |
0.716, | |
0.695 | |
] | |
}, | |
{ | |
"model_name": "claude-sonnet-3-5", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.75, | |
0.748, | |
0.766, | |
0.625, | |
0.895, | |
0.873, | |
0.674 | |
], | |
"GPT-4o": [ | |
0.727, | |
0.722, | |
0.742, | |
0.579, | |
0.885, | |
0.844, | |
0.663 | |
], | |
"Gemini-1.5-Pro": [ | |
0.753, | |
0.738, | |
0.759, | |
0.709, | |
0.854, | |
0.918, | |
0.56 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.812, | |
0.779, | |
0.79, | |
0.598, | |
0.931, | |
0.829, | |
0.806 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.708, | |
0.708, | |
0.734, | |
0.581, | |
0.879, | |
0.865, | |
0.611 | |
] | |
}, | |
{ | |
"model_name": "mistralai/Ministral-8B-Instruct-2410", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.605, | |
0.605, | |
0.609, | |
0.559, | |
0.658, | |
0.654, | |
0.564 | |
], | |
"GPT-4o": [ | |
0.631, | |
0.629, | |
0.637, | |
0.536, | |
0.733, | |
0.682, | |
0.596 | |
], | |
"Gemini-1.5-Pro": [ | |
0.613, | |
0.583, | |
0.594, | |
0.63, | |
0.573, | |
0.773, | |
0.402 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.649, | |
0.631, | |
0.634, | |
0.598, | |
0.678, | |
0.509, | |
0.752 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.528, | |
0.528, | |
0.537, | |
0.471, | |
0.603, | |
0.613, | |
0.461 | |
] | |
}, | |
{ | |
"model_name": "mistralai/Mistral-Large-Instruct-2411", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.767, | |
0.766, | |
0.767, | |
0.757, | |
0.777, | |
0.797, | |
0.735 | |
], | |
"GPT-4o": [ | |
0.76, | |
0.76, | |
0.76, | |
0.757, | |
0.763, | |
0.774, | |
0.746 | |
], | |
"Gemini-1.5-Pro": [ | |
0.779, | |
0.75, | |
0.754, | |
0.799, | |
0.732, | |
0.873, | |
0.612 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.801, | |
0.786, | |
0.786, | |
0.753, | |
0.828, | |
0.709, | |
0.857 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.727, | |
0.725, | |
0.728, | |
0.71, | |
0.75, | |
0.791, | |
0.659 | |
] | |
}, | |
{ | |
"model_name": "gemini-2.0-flash-thinking-exp-01-21", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.812, | |
0.81, | |
0.816, | |
0.891, | |
0.732, | |
0.769, | |
0.871 | |
], | |
"GPT-4o": [ | |
0.754, | |
0.743, | |
0.764, | |
0.917, | |
0.576, | |
0.702, | |
0.864 | |
], | |
"Gemini-1.5-Pro": [ | |
0.87, | |
0.858, | |
0.86, | |
0.93, | |
0.769, | |
0.87, | |
0.87 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.855, | |
0.833, | |
0.834, | |
0.81, | |
0.875, | |
0.739, | |
0.913 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.768, | |
0.76, | |
0.766, | |
0.868, | |
0.645, | |
0.75, | |
0.8 | |
] | |
}, | |
{ | |
"model_name": "o1", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.895, | |
0.895, | |
0.895, | |
0.906, | |
0.884, | |
0.887, | |
0.904 | |
], | |
"GPT-4o": [ | |
0.884, | |
0.884, | |
0.884, | |
0.889, | |
0.879, | |
0.889, | |
0.879 | |
], | |
"Gemini-1.5-Pro": [ | |
0.913, | |
0.906, | |
0.907, | |
0.953, | |
0.846, | |
0.911, | |
0.917 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.942, | |
0.932, | |
0.932, | |
0.905, | |
0.958, | |
0.905, | |
0.958 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.841, | |
0.838, | |
0.839, | |
0.868, | |
0.806, | |
0.846, | |
0.833 | |
] | |
}, | |
{ | |
"model_name": "Qwen/QwQ-32B-Preview", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.833, | |
0.832, | |
0.838, | |
0.913, | |
0.754, | |
0.787, | |
0.897 | |
], | |
"GPT-4o": [ | |
0.783, | |
0.78, | |
0.784, | |
0.861, | |
0.697, | |
0.756, | |
0.821 | |
], | |
"Gemini-1.5-Pro": [ | |
0.826, | |
0.8, | |
0.814, | |
0.953, | |
0.615, | |
0.804, | |
0.889 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.855, | |
0.84, | |
0.848, | |
0.905, | |
0.833, | |
0.704, | |
0.952 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.87, | |
0.867, | |
0.868, | |
0.921, | |
0.806, | |
0.854, | |
0.893 | |
] | |
}, | |
{ | |
"model_name": "deepseek-ai/DeepSeek-R1", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.822, | |
0.822, | |
0.824, | |
0.768, | |
0.877, | |
0.862, | |
0.791 | |
], | |
"GPT-4o": [ | |
0.797, | |
0.797, | |
0.803, | |
0.722, | |
0.879, | |
0.867, | |
0.744 | |
], | |
"Gemini-1.5-Pro": [ | |
0.826, | |
0.82, | |
0.823, | |
0.814, | |
0.846, | |
0.897, | |
0.733 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.899, | |
0.882, | |
0.882, | |
0.857, | |
0.917, | |
0.818, | |
0.936 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.768, | |
0.768, | |
0.774, | |
0.711, | |
0.839, | |
0.844, | |
0.703 | |
] | |
}, | |
{ | |
"model_name": "o1-mini", | |
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct", | |
"mu_math": [ | |
0.848, | |
0.848, | |
0.848, | |
0.833, | |
0.862, | |
0.858, | |
0.838 | |
], | |
"GPT-4o": [ | |
0.812, | |
0.812, | |
0.813, | |
0.778, | |
0.848, | |
0.848, | |
0.778 | |
], | |
"Gemini-1.5-Pro": [ | |
0.87, | |
0.862, | |
0.862, | |
0.884, | |
0.846, | |
0.905, | |
0.815 | |
], | |
"Llama-3.1-70B-Instruct": [ | |
0.913, | |
0.897, | |
0.897, | |
0.857, | |
0.938, | |
0.857, | |
0.938 | |
], | |
"Qwen2.5-72B-Instruct": [ | |
0.797, | |
0.795, | |
0.795, | |
0.816, | |
0.774, | |
0.816, | |
0.774 | |
] | |
} | |
] |