add self-reported results

Files changed:
- app.py (+1, -1)
- constants.py (+12, -4)
- static/eval_results/Default/self_reported.json (+6, -0)
- utils.py (+94, -55)
app.py
CHANGED
@@ -52,7 +52,7 @@ with gr.Blocks() as block:
     )

     # Define different captions for each table
-    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
+    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ <br> * indicates self-reported results from the model authors."

     single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."

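The Overall column described in this caption is a task-count-weighted mean of the Core and Open-ended scores. A small illustration with hypothetical inputs (only the task counts, 440 and 65, come from the caption):

    # Weighted combination from the Table 1 caption.
    N_CORE, N_OPEN = 440, 65

    def overall(core: float, open_ended: float) -> float:
        return (core * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)

    print(round(overall(50.0, 60.0), 2))  # hypothetical scores -> 51.29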
constants.py
CHANGED
@@ -134,6 +134,10 @@ MODEL_NAME_MAP = {
     "Gemini-exp-1206": "Gemini-exp-1206",
     "Ivy_VL_3B": "Ivy-VL-3B",
     "DeepSeek_VL2_tiny": "deepseek-vl2-tiny",
+    "MiniMax-VL-01": "MiniMax-VL-01",
+    "Qwen2.5-VL-72B": "Qwen2.5-VL-72B",
+    "Qwen2.5-VL-7B": "Qwen2.5-VL-7B",
+    "Qwen2.5-VL-3B": "Qwen2.5-VL-3B",
 }

 DIMENSION_NAME_MAP = {
@@ -227,15 +231,19 @@ MODEL_URLS = {
     "Gemini-Flash-2.0-exp": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/#gemini-2-0-flash",
     "Ivy_VL_3B": "https://huggingface.co/AI-Safeguard/Ivy-VL-llava",
     "DeepSeek_VL2_tiny": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny",
+    "MiniMax-VL-01": "https://huggingface.co/MiniMaxAI/MiniMax-VL-01",
+    "Qwen2.5-VL-72B": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
+    "Qwen2.5-VL-7B": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
+    "Qwen2.5-VL-3B": "https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct",
 }

 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
-    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206"],
-    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp", "Ivy_VL_3B", "DeepSeek_VL2_tiny"],
+    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206", "MiniMax-VL-01", "Qwen2.5-VL-72B"],
+    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp", "Ivy_VL_3B", "DeepSeek_VL2_tiny", "Qwen2.5-VL-7B", "Qwen2.5-VL-3B"],
     "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212', "Gemini-exp-1206"],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', "Gemini-Flash-2.0-exp", "Gemini-2.0-thinking"],
-    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
-    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Ivy_VL_3B", "DeepSeek_VL2_tiny"]
+    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B", "MiniMax-VL-01", "Qwen2.5-VL-72B"],
+    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Ivy_VL_3B", "DeepSeek_VL2_tiny", "Qwen2.5-VL-7B", "Qwen2.5-VL-3B"]
 }
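A quick cross-check of these additions (not part of the commit; a minimal sketch assuming constants.py is importable): every model listed in a group should have a display name, while a URL is optional because get_display_model_name in utils.py falls back to plain text.

    from constants import MODEL_NAME_MAP, MODEL_URLS, BASE_MODEL_GROUPS

    # Flag any group member that lacks a display name or a link target.
    for group, models in BASE_MODEL_GROUPS.items():
        for model in models:
            assert model in MODEL_NAME_MAP, f"{model!r} in {group!r} has no display name"
            if model not in MODEL_URLS:
                print(f"note: {model!r} has no URL and will render without a link")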
static/eval_results/Default/self_reported.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "MiniMax-VL-01": 47.4,
+    "Qwen2.5-VL-72B": 51.3,
+    "Qwen2.5-VL-7B": 36.8,
+    "Qwen2.5-VL-3B": 28.9
+}
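Each value is an Overall score on the 0-100 scale as reported by the model authors; the loader below formats it with two decimals and a trailing asterisk, e.g.:

    score = 51.3            # the "Qwen2.5-VL-72B" entry above
    print(f"{score:.2f}*")  # -> 51.30*, as shown in the leaderboard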
utils.py
CHANGED
@@ -2,19 +2,16 @@ import pandas as pd
 import json
 from typing import Dict, Any, Tuple
 import os
-from constants import (
-    MODEL_NAME_MAP,
-    DIMENSION_NAME_MAP,
-    KEYWORD_NAME_MAP,
-    MODEL_URLS,
-    BASE_MODEL_GROUPS
-)
+from constants import MODEL_NAME_MAP, DIMENSION_NAME_MAP, KEYWORD_NAME_MAP, MODEL_URLS, BASE_MODEL_GROUPS
+

 class MEGABenchEvalDataLoader:
     def __init__(self, base_path):
         self.base_path = base_path
         # Load both model and summary data at once
         self.KEYWORD_DATA, self.SUMMARY_DATA = self._load_data()
+        # Add loading of self-reported results
+        self.SELF_REPORTED = self._load_self_reported()
         self.SUPER_GROUPS = self._initialize_super_groups()
         self.MODEL_GROUPS = self._initialize_model_groups()

@@ -33,44 +30,56 @@
             keyword_data[model_name] = data["keyword_stats"]
             if "model_summary" in data:
                 summary_data[model_name] = data["model_summary"]

         return keyword_data, summary_data

+    def _load_self_reported(self) -> Dict[str, float]:
+        try:
+            with open(os.path.join(self.base_path, "self_reported.json"), "r") as f:
+                return json.load(f)
+        except FileNotFoundError:
+            print(
+                "Warning: No self-reported file found at",
+                os.path.join(os.path.dirname(self.base_path), "self_reported.json"),
+            )
+            return {}
+
     def _initialize_super_groups(self):
         # Get a sample model to access the structure
         sample_model = next(iter(self.KEYWORD_DATA))

         # Create groups with task counts
         groups = {}
         self.keyword_display_map = {}  # Add this map to store display-to-original mapping

         for dim in self.KEYWORD_DATA[sample_model]:
             dim_name = DIMENSION_NAME_MAP[dim]
             # Create a list of tuples (display_name, count, keyword) for sorting
             keyword_info = []

             for keyword in self.KEYWORD_DATA[sample_model][dim]:
                 # Get the task count for this keyword
                 task_count = self.KEYWORD_DATA[sample_model][dim][keyword]["count"]
                 original_name = KEYWORD_NAME_MAP.get(keyword, keyword)
                 display_name = f"{original_name}({task_count})"
                 keyword_info.append((display_name, task_count, keyword))

             # Sort by count (descending) and then by display name (for ties)
             keyword_info.sort(key=lambda x: (-x[1], x[0]))

             # Store sorted display names and update mapping
             groups[dim_name] = [info[0] for info in keyword_info]
             for display_name, _, keyword in keyword_info:
                 self.keyword_display_map[display_name] = keyword

         # Sort based on predefined order
         order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
         return {k: groups[k] for k in order if k in groups}

     def _initialize_model_groups(self) -> Dict[str, list]:
-        available_models = set(self.KEYWORD_DATA.keys())
+        # Include both evaluated and self-reported models
+        available_models = set(self.KEYWORD_DATA.keys()) | set(self.SELF_REPORTED.keys())

         filtered_groups = {}
         for group_name, models in BASE_MODEL_GROUPS.items():
             if group_name == "All":
@@ -79,49 +88,75 @@
             filtered_models = [model for model in models if model in available_models]
             if filtered_models:
                 filtered_groups[group_name] = filtered_models

         return filtered_groups

     def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
         original_dimension = get_original_dimension(selected_super_group)
         data = []

         for model in self.MODEL_GROUPS[selected_model_group]:
-            if model not in self.KEYWORD_DATA or model not in self.SUMMARY_DATA:
+            if (model not in self.KEYWORD_DATA or model not in self.SUMMARY_DATA) and model not in self.SELF_REPORTED:
                 continue

-            model_data = self.KEYWORD_DATA[model]
-            summary = self.SUMMARY_DATA[model]
-
             # Basic model information
             row = {
                 "Models": get_display_model_name(model, as_link=True),
-                "Overall": round(summary["overall_score"] * 100, 2),
-                "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
-                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }

-            # Add dimension-specific scores
-            if original_dimension in model_data:
-                for display_name in self.SUPER_GROUPS[selected_super_group]:
-                    original_keyword = self.keyword_display_map[display_name]
-                    if original_keyword in model_data[original_dimension]:
-                        row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
-                    else:
-                        row[display_name] = None
-            else:
+            # Add asterisk for self-reported results
+            if model in self.SELF_REPORTED:
+                # Store numeric value for sorting but display with asterisk
+                row["Overall"] = self.SELF_REPORTED[model]
+                row["Overall_display"] = f"{self.SELF_REPORTED[model]:.2f}*"
+                row["Core"] = None
+                row["Open-ended"] = None
                 for display_name in self.SUPER_GROUPS[selected_super_group]:
                     row[display_name] = None
+            else:
+                model_data = self.KEYWORD_DATA[model]
+                summary = self.SUMMARY_DATA[model]
+
+                # Store numeric values
+                overall_score = round(summary["overall_score"] * 100, 2)
+                row["Overall"] = overall_score
+                row["Overall_display"] = f"{overall_score:.2f}"
+                row["Core"] = round(summary["core"]["macro_mean_score"] * 100, 2)
+                row["Open-ended"] = round(summary["open"]["macro_mean_score"] * 100, 2)
+
+                # Add dimension-specific scores
+                if original_dimension in model_data:
+                    for display_name in self.SUPER_GROUPS[selected_super_group]:
+                        original_keyword = self.keyword_display_map[display_name]
+                        if original_keyword in model_data[original_dimension]:
+                            row[display_name] = round(
+                                model_data[original_dimension][original_keyword]["average_score"] * 100, 2
+                            )
+                        else:
+                            row[display_name] = None
+                else:
+                    for display_name in self.SUPER_GROUPS[selected_super_group]:
+                        row[display_name] = None

             data.append(row)

         df = pd.DataFrame(data)
+        # Sort by numeric Overall column
         df = df.sort_values(by="Overall", ascending=False)
+
+        # Replace None with "-" for display
+        display_cols = ["Core", "Open-ended"] + self.SUPER_GROUPS[selected_super_group]
+        df[display_cols] = df[display_cols].fillna("-")
+
+        # Replace Overall with Overall_display
+        df["Overall"] = df["Overall_display"]
+        df = df.drop("Overall_display", axis=1)
+
         return df

     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)

         # Get total task counts from the first model's data
         sample_model = "GPT_4o"
         total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
@@ -134,39 +169,42 @@
             "Models": "Models",
             "Overall": f"Overall\n({total_tasks})",
             "Core": f"Core\n({total_core_tasks})",
-            "Open-ended": f"Open-ended\n({total_open_tasks})"
+            "Open-ended": f"Open-ended\n({total_open_tasks})",
         }

         # Add rank column to DataFrame
         df = df.reset_index(drop=True)
-        df.insert(0, …
+        df.insert(0, "Rank", range(1, len(df) + 1))

         # Rename the columns in DataFrame to match headers
         df = df.rename(columns=column_headers)

         # For dimension columns, add task counts on new line
         dimension_headers = []
         for display_name in self.SUPER_GROUPS[selected_super_group]:
-            task_count = display_name.split(…
-            base_name = display_name.split(…
+            task_count = display_name.split("(")[1].rstrip(")")
+            base_name = display_name.split("(")[0]
             dimension_headers.append(f"{base_name}\n({task_count})")

         headers = [
             column_headers["Rank"],
             column_headers["Models"],
             column_headers["Overall"],
             column_headers["Core"],
-            column_headers["Open-ended"]
+            column_headers["Open-ended"],
         ] + dimension_headers

-        data = df[…
+        data = df[
+            [
+                column_headers["Rank"],
+                column_headers["Models"],
+                column_headers["Overall"],
+                column_headers["Core"],
+                column_headers["Open-ended"],
+            ]
+            + self.SUPER_GROUPS[selected_super_group]
+        ].values.tolist()

         return headers, data


@@ -174,12 +212,13 @@
 def get_original_dimension(mapped_dimension):
     return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)

+
 def get_original_keyword(mapped_keyword):
     return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)

+
 def get_display_model_name(model_name: str, as_link: bool = True) -> str:
     display_name = MODEL_NAME_MAP.get(model_name, model_name)
     if as_link and model_name in MODEL_URLS:
         return f'<a href="{MODEL_URLS[model_name]}" target="_blank" style="text-decoration: none; color: #2196F3;">{display_name}</a>'
     return display_name
-
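Taken together, a minimal usage sketch of the updated loader (standalone, outside the Gradio app; the base path static/eval_results/Default is an assumption based on where self_reported.json was added):

    from utils import MEGABenchEvalDataLoader

    loader = MEGABenchEvalDataLoader("./static/eval_results/Default")

    # Self-reported models (MiniMax-VL-01, Qwen2.5-VL-*) get an asterisked Overall
    # score and "-" for Core, Open-ended, and per-keyword columns.
    headers, rows = loader.get_leaderboard_data("Application", "All")
    print(headers[:5])
    print(rows[0][:5])

The "All" group comes from BASE_MODEL_GROUPS in constants.py; any of the dimension names ("Application", "Skills", "Output Format", "Input Format", "Visual Input Number") can be passed as the first argument.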