Update Leaderboard
Browse files
- gen_table.py +14 -4
- meta_data.py +4 -2
gen_table.py
CHANGED
|
@@ -88,8 +88,12 @@ def BUILD_L2_DF(results, dataset):
|
|
| 88 |
assert len(sub), dataset
|
| 89 |
fields = list(sub[0][dataset].keys())
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
for m in results:
|
| 95 |
item = results[m]
|
|
@@ -117,7 +121,11 @@ def BUILD_L2_DF(results, dataset):
|
|
| 117 |
# Use the first 5 non-overall fields as required fields
|
| 118 |
required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
|
| 119 |
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
df = df.iloc[::-1]
|
| 122 |
|
| 123 |
check_box = {}
|
|
@@ -152,7 +160,9 @@ def generate_table(results):
|
|
| 152 |
|
| 153 |
|
| 154 |
for d in DATASETS_ALL:
|
| 155 |
-
key_name = 'Overall'
|
|
|
|
|
|
|
| 156 |
if d in item:
|
| 157 |
val = float(item[d][key_name])
|
| 158 |
val = float(f'{val:.1f}')
|
|
|
|
| 88 |
assert len(sub), dataset
|
| 89 |
fields = list(sub[0][dataset].keys())
|
| 90 |
|
| 91 |
+
if dataset == 'WeMath':
|
| 92 |
+
non_overall_fields = [x for x in fields if 'Score' in x]
|
| 93 |
+
overall_fields = [x for x in fields if 'Score' not in x]
|
| 94 |
+
else:
|
| 95 |
+
non_overall_fields = [x for x in fields if 'Overall' not in x]
|
| 96 |
+
overall_fields = [x for x in fields if 'Overall' in x]
|
| 97 |
|
| 98 |
for m in results:
|
| 99 |
item = results[m]
|
|
|
|
| 121 |
# Use the first 5 non-overall fields as required fields
|
| 122 |
required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
|
| 123 |
|
| 124 |
+
if 'Score (Strict)' in df:
|
| 125 |
+
df = df.sort_values('Score (Strict)')
|
| 126 |
+
else:
|
| 127 |
+
df = df.sort_values('Overall')
|
| 128 |
+
|
| 129 |
df = df.iloc[::-1]
|
| 130 |
|
| 131 |
check_box = {}
|
|
|
|
| 160 |
|
| 161 |
|
| 162 |
for d in DATASETS_ALL:
|
| 163 |
+
key_name = 'Overall'
|
| 164 |
+
if d == 'WeMath':
|
| 165 |
+
key_name = 'Score (Strict)'
|
| 166 |
if d in item:
|
| 167 |
val = float(item[d][key_name])
|
| 168 |
val = float(f'{val:.1f}')
|
meta_data.py
CHANGED
|
@@ -22,13 +22,15 @@ We obtain all evaluation results based on the [VLMEvalKit](https://github.com/op
|
|
| 22 |
2. MathVision: The Full test set of MathVision, around 3000 samples.
|
| 23 |
3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
|
| 24 |
4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
|
|
|
|
|
|
|
| 25 |
|
| 26 |
To suggest new models or benchmarks for this leaderboard, please contact [email protected].
|
| 27 |
"""
|
| 28 |
|
| 29 |
# CONSTANTS-FIELDS
|
| 30 |
-
DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
|
| 31 |
-
DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
|
| 32 |
META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
|
| 33 |
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
| 34 |
MODEL_TYPE = ['OpenSource', 'API']
|
|
|
|
| 22 |
2. MathVision: The Full test set of MathVision, around 3000 samples.
|
| 23 |
3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
|
| 24 |
4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
|
| 25 |
+
5. WeMath: The Test Mini split of WeMath, around 1740 samples. We report "Score (Strict)" as the main metric.
|
| 26 |
+
6. LogicVista: The Full test set of LogicVista, around 450 samples.
|
| 27 |
|
| 28 |
To suggest new models or benchmarks for this leaderboard, please contact [email protected].
|
| 29 |
"""
|
| 30 |
|
| 31 |
# CONSTANTS-FIELDS
|
| 32 |
+
DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
|
| 33 |
+
DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
|
| 34 |
META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
|
| 35 |
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
| 36 |
MODEL_TYPE = ['OpenSource', 'API']
|