Open_LMM_Reasoning_Leaderboard

Running

KennyUTC committed on Feb 10

Commit

151fc9b

1 Parent(s): 1b0ca05

Update Leaderboard

Files changed (2) hide show

gen_table.py CHANGED Viewed

@@ -88,8 +88,12 @@ def BUILD_L2_DF(results, dataset):
     assert len(sub), dataset
     fields = list(sub[0][dataset].keys())
-    non_overall_fields = [x for x in fields if 'Overall' not in x]
-    overall_fields = [x for x in fields if 'Overall' in x]
     for m in results:
         item = results[m]
@@ -117,7 +121,11 @@ def BUILD_L2_DF(results, dataset):
     # Use the overall fields as required fields when any exist; otherwise fall back to the first 5 non-overall fields
     required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
-    df = df.sort_values('Overall')
     df = df.iloc[::-1]
     check_box = {}
@@ -152,7 +160,9 @@ def generate_table(results):
         for d in DATASETS_ALL:
-            key_name = 'Overall'
             if d in item:
                 val = float(item[d][key_name])
                 val = float(f'{val:.1f}')

     assert len(sub), dataset
     fields = list(sub[0][dataset].keys())
+    if dataset == 'WeMath':
+        non_overall_fields = [x for x in fields if 'Score' in x]
+        overall_fields = [x for x in fields if 'Score' not in x]
+    else:
+        non_overall_fields = [x for x in fields if 'Overall' not in x]
+        overall_fields = [x for x in fields if 'Overall' in x]
     for m in results:
         item = results[m]
     # Use the overall fields as required fields when any exist; otherwise fall back to the first 5 non-overall fields
     required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
+    if 'Score (Strict)' in df:
+        df = df.sort_values('Score (Strict)')
+    else:
+        df = df.sort_values('Overall')
     df = df.iloc[::-1]
     check_box = {}
         for d in DATASETS_ALL:
+            key_name = 'Overall'
+            if d == 'WeMath':
+                key_name = 'Score (Strict)'
             if d in item:
                 val = float(item[d][key_name])
                 val = float(f'{val:.1f}')

meta_data.py CHANGED Viewed

@@ -22,13 +22,15 @@ We obtain all evaluation results based on the [VLMEvalKit](https://github.com/op
 2. MathVision: The Full test set of MathVision, around 3000 samples.
 3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
 4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
 To suggest new models or benchmarks for this leaderboard, please contact [email protected].
 """
 # CONSTANTS-FIELDS
-DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
-DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
 META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
 MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['OpenSource', 'API']

 2. MathVision: The Full test set of MathVision, around 3000 samples.
 3. MathVerse_MINI_Vision_Only: The Test Mini split of MathVerse, using the "Vision Only" mode, around 700 samples.
 4. DynaMath: The Full test set of DynaMath, around 5000 samples (501 original questions x 10 variants).
+5. WeMath: The Test Mini split of WeMath, around 1740 samples. We report "Score (Strict)" as the main metric.
+6. LogicVista: The Full test set of LogicVista, around 450 samples.
 To suggest new models or benchmarks for this leaderboard, please contact [email protected].
 """
 # CONSTANTS-FIELDS
+DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
+DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'WeMath', 'LogicVista']
 META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
 MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['OpenSource', 'API']