[
{
"model_name": "google/gemini-1.5-flash",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.749,
0.748,
0.763,
0.633,
0.883,
0.862,
0.676
],
"GPT-4o": [
0.705,
0.701,
0.716,
0.579,
0.84,
0.794,
0.651
],
"Gemini-1.5-Pro": [
0.749,
0.739,
0.772,
0.677,
0.915,
0.948,
0.551
],
"Llama-3.1-70B-Instruct": [
0.83,
0.806,
0.811,
0.67,
0.92,
0.823,
0.833
],
"Qwen2.5-72B-Instruct": [
0.712,
0.712,
0.731,
0.606,
0.853,
0.847,
0.619
]
},
{
"model_name": "google/gemini-1.5-pro",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.807,
0.807,
0.809,
0.775,
0.845,
0.852,
0.764
],
"GPT-4o": [
0.782,
0.782,
0.783,
0.764,
0.802,
0.805,
0.761
],
"Gemini-1.5-Pro": [
0.815,
0.795,
0.802,
0.81,
0.829,
0.916,
0.654
],
"Llama-3.1-70B-Instruct": [
0.852,
0.836,
0.837,
0.753,
0.908,
0.82,
0.868
],
"Qwen2.5-72B-Instruct": [
0.779,
0.777,
0.78,
0.755,
0.81,
0.842,
0.712
]
},
{
"model_name": "gpt-4o-mini-2024-07-18",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.725,
0.723,
0.743,
0.59,
0.881,
0.851,
0.651
],
"GPT-4o": [
0.708,
0.704,
0.722,
0.564,
0.863,
0.814,
0.649
],
"Gemini-1.5-Pro": [
0.705,
0.696,
0.733,
0.63,
0.878,
0.922,
0.507
],
"Llama-3.1-70B-Instruct": [
0.793,
0.762,
0.768,
0.598,
0.902,
0.773,
0.801
],
"Qwen2.5-72B-Instruct": [
0.694,
0.693,
0.721,
0.561,
0.871,
0.853,
0.598
]
},
{
"model_name": "gpt-4o-2024-08-06",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.774,
0.774,
0.781,
0.701,
0.859,
0.851,
0.713
],
"GPT-4o": [
0.775,
0.775,
0.778,
0.721,
0.832,
0.821,
0.736
],
"Gemini-1.5-Pro": [
0.742,
0.726,
0.746,
0.704,
0.829,
0.905,
0.548
],
"Llama-3.1-70B-Instruct": [
0.838,
0.818,
0.82,
0.711,
0.908,
0.812,
0.849
],
"Qwen2.5-72B-Instruct": [
0.742,
0.742,
0.752,
0.671,
0.836,
0.846,
0.655
]
},
{
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.52,
0.52,
0.523,
0.487,
0.559,
0.56,
0.485
],
"GPT-4o": [
0.513,
0.512,
0.515,
0.464,
0.565,
0.533,
0.497
],
"Gemini-1.5-Pro": [
0.572,
0.555,
0.579,
0.55,
0.622,
0.77,
0.375
],
"Llama-3.1-70B-Instruct": [
0.509,
0.492,
0.497,
0.454,
0.54,
0.355,
0.639
],
"Qwen2.5-72B-Instruct": [
0.487,
0.487,
0.493,
0.452,
0.534,
0.565,
0.422
]
},
{
"model_name": "meta-llama/Llama-3.1-70B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.612,
0.61,
0.61,
0.625,
0.596,
0.641,
0.579
],
"GPT-4o": [
0.694,
0.694,
0.694,
0.671,
0.718,
0.718,
0.671
],
"Gemini-1.5-Pro": [
0.613,
0.588,
0.603,
0.614,
0.61,
0.784,
0.407
],
"Llama-3.1-70B-Instruct": [
0.576,
0.57,
0.586,
0.639,
0.54,
0.437,
0.729
],
"Qwen2.5-72B-Instruct": [
0.565,
0.56,
0.56,
0.587,
0.534,
0.628,
0.492
]
},
{
"model_name": "Qwen/Qwen2.5-7B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.699,
0.693,
0.697,
0.787,
0.598,
0.693,
0.708
],
"GPT-4o": [
0.69,
0.683,
0.693,
0.814,
0.557,
0.663,
0.737
],
"Gemini-1.5-Pro": [
0.734,
0.691,
0.692,
0.794,
0.598,
0.82,
0.557
],
"Llama-3.1-70B-Instruct": [
0.731,
0.723,
0.733,
0.784,
0.701,
0.594,
0.853
],
"Qwen2.5-72B-Instruct": [
0.642,
0.624,
0.628,
0.755,
0.491,
0.665,
0.6
]
},
{
"model_name": "Qwen/Qwen2.5-72B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.757,
0.756,
0.756,
0.771,
0.742,
0.775,
0.737
],
"GPT-4o": [
0.738,
0.737,
0.738,
0.764,
0.71,
0.738,
0.738
],
"Gemini-1.5-Pro": [
0.771,
0.742,
0.745,
0.794,
0.72,
0.867,
0.602
],
"Llama-3.1-70B-Instruct": [
0.808,
0.793,
0.793,
0.753,
0.839,
0.723,
0.859
],
"Qwen2.5-72B-Instruct": [
0.712,
0.705,
0.705,
0.761,
0.647,
0.742,
0.67
]
},
{
"model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.633,
0.619,
0.628,
0.766,
0.479,
0.629,
0.639
],
"GPT-4o": [
0.587,
0.572,
0.586,
0.75,
0.412,
0.577,
0.607
],
"Gemini-1.5-Pro": [
0.694,
0.638,
0.638,
0.778,
0.5,
0.782,
0.494
],
"Llama-3.1-70B-Instruct": [
0.638,
0.638,
0.684,
0.856,
0.517,
0.497,
0.865
],
"Qwen2.5-72B-Instruct": [
0.613,
0.597,
0.598,
0.71,
0.483,
0.647,
0.554
]
},
{
"model_name": "Qwen/Qwen2.5-Math-72B-Instruct",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.744,
0.74,
0.742,
0.809,
0.668,
0.738,
0.752
],
"GPT-4o": [
0.686,
0.682,
0.687,
0.779,
0.588,
0.669,
0.713
],
"Gemini-1.5-Pro": [
0.797,
0.768,
0.77,
0.825,
0.732,
0.876,
0.645
],
"Llama-3.1-70B-Instruct": [
0.782,
0.773,
0.779,
0.814,
0.764,
0.658,
0.881
],
"Qwen2.5-72B-Instruct": [
0.708,
0.693,
0.698,
0.813,
0.569,
0.716,
0.695
]
},
{
"model_name": "claude-sonnet-3-5",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.75,
0.748,
0.766,
0.625,
0.895,
0.873,
0.674
],
"GPT-4o": [
0.727,
0.722,
0.742,
0.579,
0.885,
0.844,
0.663
],
"Gemini-1.5-Pro": [
0.753,
0.738,
0.759,
0.709,
0.854,
0.918,
0.56
],
"Llama-3.1-70B-Instruct": [
0.812,
0.779,
0.79,
0.598,
0.931,
0.829,
0.806
],
"Qwen2.5-72B-Instruct": [
0.708,
0.708,
0.734,
0.581,
0.879,
0.865,
0.611
]
},
{
"model_name": "mistralai/Ministral-8B-Instruct-2410",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.605,
0.605,
0.609,
0.559,
0.658,
0.654,
0.564
],
"GPT-4o": [
0.631,
0.629,
0.637,
0.536,
0.733,
0.682,
0.596
],
"Gemini-1.5-Pro": [
0.613,
0.583,
0.594,
0.63,
0.573,
0.773,
0.402
],
"Llama-3.1-70B-Instruct": [
0.649,
0.631,
0.634,
0.598,
0.678,
0.509,
0.752
],
"Qwen2.5-72B-Instruct": [
0.528,
0.528,
0.537,
0.471,
0.603,
0.613,
0.461
]
},
{
"model_name": "mistralai/Mistral-Large-Instruct-2411",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.767,
0.766,
0.767,
0.757,
0.777,
0.797,
0.735
],
"GPT-4o": [
0.76,
0.76,
0.76,
0.757,
0.763,
0.774,
0.746
],
"Gemini-1.5-Pro": [
0.779,
0.75,
0.754,
0.799,
0.732,
0.873,
0.612
],
"Llama-3.1-70B-Instruct": [
0.801,
0.786,
0.786,
0.753,
0.828,
0.709,
0.857
],
"Qwen2.5-72B-Instruct": [
0.727,
0.725,
0.728,
0.71,
0.75,
0.791,
0.659
]
},
{
"model_name": "gemini-2.0-flash-thinking-exp-01-21",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.812,
0.81,
0.816,
0.891,
0.732,
0.769,
0.871
],
"GPT-4o": [
0.754,
0.743,
0.764,
0.917,
0.576,
0.702,
0.864
],
"Gemini-1.5-Pro": [
0.87,
0.858,
0.86,
0.93,
0.769,
0.87,
0.87
],
"Llama-3.1-70B-Instruct": [
0.855,
0.833,
0.834,
0.81,
0.875,
0.739,
0.913
],
"Qwen2.5-72B-Instruct": [
0.768,
0.76,
0.766,
0.868,
0.645,
0.75,
0.8
]
},
{
"model_name": "o1",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.895,
0.895,
0.895,
0.906,
0.884,
0.887,
0.904
],
"GPT-4o": [
0.884,
0.884,
0.884,
0.889,
0.879,
0.889,
0.879
],
"Gemini-1.5-Pro": [
0.913,
0.906,
0.907,
0.953,
0.846,
0.911,
0.917
],
"Llama-3.1-70B-Instruct": [
0.942,
0.932,
0.932,
0.905,
0.958,
0.905,
0.958
],
"Qwen2.5-72B-Instruct": [
0.841,
0.838,
0.839,
0.868,
0.806,
0.846,
0.833
]
},
{
"model_name": "Qwen/QwQ-32B-Preview",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.833,
0.832,
0.838,
0.913,
0.754,
0.787,
0.897
],
"GPT-4o": [
0.783,
0.78,
0.784,
0.861,
0.697,
0.756,
0.821
],
"Gemini-1.5-Pro": [
0.826,
0.8,
0.814,
0.953,
0.615,
0.804,
0.889
],
"Llama-3.1-70B-Instruct": [
0.855,
0.84,
0.848,
0.905,
0.833,
0.704,
0.952
],
"Qwen2.5-72B-Instruct": [
0.87,
0.867,
0.868,
0.921,
0.806,
0.854,
0.893
]
},
{
"model_name": "deepseek-ai/DeepSeek-R1",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.822,
0.822,
0.824,
0.768,
0.877,
0.862,
0.791
],
"GPT-4o": [
0.797,
0.797,
0.803,
0.722,
0.879,
0.867,
0.744
],
"Gemini-1.5-Pro": [
0.826,
0.82,
0.823,
0.814,
0.846,
0.897,
0.733
],
"Llama-3.1-70B-Instruct": [
0.899,
0.882,
0.882,
0.857,
0.917,
0.818,
0.936
],
"Qwen2.5-72B-Instruct": [
0.768,
0.768,
0.774,
0.711,
0.839,
0.844,
0.703
]
},
{
"model_name": "o1-mini",
"extract_model_name": "Qwen/Qwen2.5-72B-Instruct",
"mu_math": [
0.848,
0.848,
0.848,
0.833,
0.862,
0.858,
0.838
],
"GPT-4o": [
0.812,
0.812,
0.813,
0.778,
0.848,
0.848,
0.778
],
"Gemini-1.5-Pro": [
0.87,
0.862,
0.862,
0.884,
0.846,
0.905,
0.815
],
"Llama-3.1-70B-Instruct": [
0.913,
0.897,
0.897,
0.857,
0.938,
0.857,
0.938
],
"Qwen2.5-72B-Instruct": [
0.797,
0.795,
0.795,
0.816,
0.774,
0.816,
0.774
]
}
]