kennymckormick committed · e06d81a · Parent(s): 4a9f248

update

Files changed:
- app.py: +2 -2
- gen_table.py: +41 -24
- meta_data.py: +5 -6
app.py CHANGED

@@ -55,7 +55,8 @@ with gr.Blocks() as demo:
         filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
         headers = check_box['essential'] + fields
         new_fields = [field for field in fields if field not in filter_list]
-        df = …
+        df = generate_table(results, new_fields)
+
         df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
         df = df[df['flag']]
         df.pop('flag')
@@ -64,7 +65,6 @@ with gr.Blocks() as demo:
         df = df[df['flag']]
         df.pop('flag')

-        df = generate_table(results, new_fields, df)
         comp = gr.components.DataFrame(
             value=df[headers],
             type='pandas',
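The net effect in app.py is a reordering: the table is now generated first and then filtered by model size, rather than filtering a preloaded frame and calling generate_table on it afterwards. A minimal sketch of the resulting callback flow, assuming generate_table and model_size_flag are importable from gen_table.py (the helper name build_filtered_table and its argument values are illustrative, not part of the commit):

    # Illustrative sketch of the reordered filter flow (helper name is hypothetical).
    from gen_table import generate_table, model_size_flag

    def build_filtered_table(results, new_fields, model_size):
        # 1. Build the full leaderboard table from the raw results dict.
        df = generate_table(results, new_fields)
        # 2. Keep only rows whose parameter count matches a selected size bucket.
        df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
        df = df[df['flag']]
        df.pop('flag')
        return df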
gen_table.py CHANGED

@@ -38,7 +38,9 @@ def model_size_flag(sz, FIELDS):
         return True
     if pd.isna(sz):
         return False
-    if '<10B' in FIELDS and sz < 10:
+    if '<4B' in FIELDS and sz < 4:
+        return True
+    if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
         return True
     if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
         return True
@@ -71,10 +73,7 @@ def BUILD_L1_DF(results, fields):
     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map

-
-    df = pd.DataFrame(res)
-    df = df.sort_values('Avg Score')
-    df = df.iloc[::-1]
+    df = generate_table(results, fields)
     return df, check_box


@@ -131,7 +130,14 @@ def BUILD_L2_DF(results, dataset):
     return df, check_box


-def generate_table(results, fields, df=None):
+def generate_table(results, fields):
+
+    def get_mmbench_v11(item):
+        assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
+        val = (item['MMBench_TEST_CN_V11'] + item['MMBench_TEST_EN_V11']) / 2
+        val = float(f'{val:.1f}')
+        return val
+
     res = defaultdict(list)
     for i, m in enumerate(results):
         item = results[m]
@@ -149,23 +155,34 @@ def generate_table(results, fields, df=None):
         scores, ranks = [], []
         for d in fields:
             key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
-
-            if d == '…
-            …
-                scores.append(…
+            # Every Model should have MMBench_V11 results
+            if d == 'MMBench_V11':
+                val = get_mmbench_v11(item)
+                res[d].append(val)
+                scores.append(val)
+                ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
+            elif d in item:
+                res[d].append(item[d][key_name])
+                if d == 'MME':
+                    scores.append(item[d][key_name] / 28)
+                elif d == 'OCRBench':
+                    scores.append(item[d][key_name] / 10)
+                else:
+                    scores.append(item[d][key_name])
+                ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
             else:
-                …
+                res[d].append(None)
+                scores.append(None)
+                ranks.append(None)
+
+        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
+        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)
+
+    df = pd.DataFrame(res)
+    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
+    valid = valid.sort_values('Avg Score')
+    valid = valid.iloc[::-1]
+    missing = missing.sort_values('MMBench_V11')
+    missing = missing.iloc[::-1]
+    df = pd.concat([valid, missing])
     return df
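The new generate_table averages the two MMBench_V11 splits before ranking: a model scoring 80.0 (EN) and 76.0 (CN) enters the table as (80.0 + 76.0) / 2 = 78.0. It also leans on an nth_large helper that this diff does not define; a plausible minimal version, assuming it returns the 1-indexed rank of a value among a list sorted in descending order (which is consistent with how ranks feeds 'Avg Rank'):

    def nth_large(val, vals):
        # Assumed sketch, not shown in this commit: rank of `val` among `vals`,
        # i.e. 1 + the count of strictly larger entries (1 = best).
        return 1 + sum(1 for v in vals if v > val)

Under that reading, a lower 'Avg Rank' is better, and models missing any selected benchmark get None for both averages, which is why the final block sorts the valid and missing rows separately before concatenating them.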
meta_data.py CHANGED

@@ -22,17 +22,16 @@ OpenVLM Leaderboard only includes open-source VLMs or API models that are public
 # CONSTANTS-FIELDS
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 MAIN_FIELDS = [
-    '…
+    'MMBench_V11', 'MMStar', 'MME',
     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
     'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench', 'CCBench', 'RealWorldQA'
 ]
 DEFAULT_BENCH = [
-    '…
-    '…
-    'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
+    'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+    'HallusionBench', 'MMVet'
 ]
-MMBENCH_FIELDS = ['…
-MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
+MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']

 # The README file for each benchmark
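The MODEL_SIZE change splits the old '<10B' bucket into '<4B' and '4B-10B', matching the new branches added to model_size_flag in gen_table.py. A quick illustration of the half-open boundaries (the parameter counts are made-up examples; assumes model_size_flag is importable from gen_table.py):

    from gen_table import model_size_flag

    # The buckets are half-open: 4.0 belongs to '4B-10B', not '<4B'.
    print(model_size_flag(3.9, ['<4B']))      # True: strictly below 4B
    print(model_size_flag(4.0, ['4B-10B']))   # True: lower bound is inclusive
    print(model_size_flag(7.0, ['4B-10B']))   # True: e.g. a 7B model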