cccjc committed bc925b6 (1 parent: 47bc6da)

add self-reported results

app.py CHANGED
@@ -52,7 +52,7 @@ with gr.Blocks() as block:
     )
 
     # Define different captions for each table
-    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
+    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ <br> * indicates self-reported results from the model authors."
 
     single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
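The caption's Overall formula is simply a task-count-weighted mean of the two macro scores. A minimal sketch of that arithmetic, using the task counts stated in the caption (440 Core tasks, 65 Open-ended tasks) and hypothetical Core/Open-ended scores:

```python
# Sketch of the Overall formula from the table caption (not code from this repo).
# The scores passed below are hypothetical placeholders, not real model results.
N_CORE = 440  # Core tasks, rule-based metrics
N_OPEN = 65   # Open-ended tasks, VLM-judge metrics

def overall_score(core: float, open_ended: float) -> float:
    """Task-count-weighted mean of the Core and Open-ended macro scores."""
    return (core * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)

print(round(overall_score(50.0, 60.0), 2))  # 51.29
```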
constants.py CHANGED
@@ -134,6 +134,10 @@ MODEL_NAME_MAP = {
     "Gemini-exp-1206": "Gemini-exp-1206",
     "Ivy_VL_3B": "Ivy-VL-3B",
     "DeepSeek_VL2_tiny": "deepseek-vl2-tiny",
+    "MiniMax-VL-01": "MiniMax-VL-01",
+    "Qwen2.5-VL-72B": "Qwen2.5-VL-72B",
+    "Qwen2.5-VL-7B": "Qwen2.5-VL-7B",
+    "Qwen2.5-VL-3B": "Qwen2.5-VL-3B",
 }
 
 DIMENSION_NAME_MAP = {
@@ -227,15 +231,19 @@ MODEL_URLS = {
     "Gemini-Flash-2.0-exp": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/#gemini-2-0-flash",
     "Ivy_VL_3B": "https://huggingface.co/AI-Safeguard/Ivy-VL-llava",
     "DeepSeek_VL2_tiny": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny",
+    "MiniMax-VL-01": "https://huggingface.co/MiniMaxAI/MiniMax-VL-01",
+    "Qwen2.5-VL-72B": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
+    "Qwen2.5-VL-7B": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
+    "Qwen2.5-VL-3B": "https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct",
 }
 
 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
-    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206"],
-    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp", "Ivy_VL_3B", "DeepSeek_VL2_tiny"],
+    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206", "MiniMax-VL-01", "Qwen2.5-VL-72B"],
+    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp", "Ivy_VL_3B", "DeepSeek_VL2_tiny", "Qwen2.5-VL-7B", "Qwen2.5-VL-3B"],
     "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212', "Gemini-exp-1206"],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', "Gemini-Flash-2.0-exp", "Gemini-2.0-thinking"],
-    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
-    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Ivy_VL_3B", "DeepSeek_VL2_tiny"]
+    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B", "MiniMax-VL-01", "Qwen2.5-VL-72B"],
+    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Ivy_VL_3B", "DeepSeek_VL2_tiny", "Qwen2.5-VL-7B", "Qwen2.5-VL-3B"]
 }
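In constants.py the "All" group is built directly from MODEL_NAME_MAP, so the four new entries (MiniMax-VL-01 and the three Qwen2.5-VL sizes) join it automatically; the curated groups only contain models listed explicitly, hence the edits above. A quick sketch of that relationship, assuming it is run from the repo root:

```python
# Sketch only: how constants.py wires the newly added models into the model groups.
from constants import BASE_MODEL_GROUPS, MODEL_NAME_MAP, MODEL_URLS

for model in ("MiniMax-VL-01", "Qwen2.5-VL-72B", "Qwen2.5-VL-7B", "Qwen2.5-VL-3B"):
    assert model in MODEL_NAME_MAP            # display name on the leaderboard
    assert model in MODEL_URLS                # link target for the model cell
    assert model in BASE_MODEL_GROUPS["All"]  # "All" is list(MODEL_NAME_MAP.keys())
```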
static/eval_results/Default/self_reported.json ADDED
@@ -0,0 +1,6 @@
+{
+    "MiniMax-VL-01": 47.4,
+    "Qwen2.5-VL-72B": 51.3,
+    "Qwen2.5-VL-7B": 36.8,
+    "Qwen2.5-VL-3B": 28.9
+}
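self_reported.json is a flat mapping from the internal model key (the same key used in MODEL_NAME_MAP and BASE_MODEL_GROUPS) to a self-reported Overall score on the 0-100 scale shown in the leaderboard. A small validation sketch, assuming it is run from the repo root; the check itself is not part of this commit:

```python
# Hypothetical sanity check for self_reported.json; not part of this commit.
import json
from constants import MODEL_NAME_MAP

with open("static/eval_results/Default/self_reported.json") as f:
    self_reported = json.load(f)

for model, score in self_reported.items():
    assert model in MODEL_NAME_MAP, f"unknown model key: {model}"
    assert 0.0 <= score <= 100.0, f"score out of range for {model}: {score}"
```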
utils.py CHANGED
@@ -2,19 +2,16 @@ import pandas as pd
 import json
 from typing import Dict, Any, Tuple
 import os
-from constants import (
-    MODEL_NAME_MAP,
-    DIMENSION_NAME_MAP,
-    KEYWORD_NAME_MAP,
-    MODEL_URLS,
-    BASE_MODEL_GROUPS
-)
+from constants import MODEL_NAME_MAP, DIMENSION_NAME_MAP, KEYWORD_NAME_MAP, MODEL_URLS, BASE_MODEL_GROUPS
+
 
 class MEGABenchEvalDataLoader:
     def __init__(self, base_path):
         self.base_path = base_path
         # Load both model and summary data at once
         self.KEYWORD_DATA, self.SUMMARY_DATA = self._load_data()
+        # Add loading of self-reported results
+        self.SELF_REPORTED = self._load_self_reported()
         self.SUPER_GROUPS = self._initialize_super_groups()
         self.MODEL_GROUPS = self._initialize_model_groups()
 
@@ -33,44 +30,56 @@ class MEGABenchEvalDataLoader:
                     keyword_data[model_name] = data["keyword_stats"]
                 if "model_summary" in data:
                     summary_data[model_name] = data["model_summary"]
-
+
         return keyword_data, summary_data
 
+    def _load_self_reported(self) -> Dict[str, float]:
+        try:
+            with open(os.path.join(self.base_path, "self_reported.json"), "r") as f:
+                return json.load(f)
+        except FileNotFoundError:
+            print(
+                "Warning: No self-reported file found at",
+                os.path.join(os.path.dirname(self.base_path), "self_reported.json"),
+            )
+            return {}
+
     def _initialize_super_groups(self):
         # Get a sample model to access the structure
         sample_model = next(iter(self.KEYWORD_DATA))
-
+
         # Create groups with task counts
         groups = {}
         self.keyword_display_map = {}  # Add this map to store display-to-original mapping
-
+
         for dim in self.KEYWORD_DATA[sample_model]:
            dim_name = DIMENSION_NAME_MAP[dim]
             # Create a list of tuples (display_name, count, keyword) for sorting
             keyword_info = []
-
+
             for keyword in self.KEYWORD_DATA[sample_model][dim]:
                 # Get the task count for this keyword
                 task_count = self.KEYWORD_DATA[sample_model][dim][keyword]["count"]
                 original_name = KEYWORD_NAME_MAP.get(keyword, keyword)
                 display_name = f"{original_name}({task_count})"
                 keyword_info.append((display_name, task_count, keyword))
-
+
             # Sort by count (descending) and then by display name (for ties)
             keyword_info.sort(key=lambda x: (-x[1], x[0]))
-
+
             # Store sorted display names and update mapping
             groups[dim_name] = [info[0] for info in keyword_info]
             for display_name, _, keyword in keyword_info:
                 self.keyword_display_map[display_name] = keyword
-
+
         # Sort based on predefined order
         order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
         return {k: groups[k] for k in order if k in groups}
 
     def _initialize_model_groups(self) -> Dict[str, list]:
-        available_models = set(self.KEYWORD_DATA.keys())
-
+        # Include both evaluated and self-reported models
+        available_models = set(self.KEYWORD_DATA.keys()) | set(self.SELF_REPORTED.keys())
+
         filtered_groups = {}
         for group_name, models in BASE_MODEL_GROUPS.items():
             if group_name == "All":
@@ -79,49 +88,75 @@ class MEGABenchEvalDataLoader:
             filtered_models = [model for model in models if model in available_models]
             if filtered_models:
                 filtered_groups[group_name] = filtered_models
-
+
         return filtered_groups
 
     def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
         original_dimension = get_original_dimension(selected_super_group)
         data = []
-
+
         for model in self.MODEL_GROUPS[selected_model_group]:
-            if model not in self.KEYWORD_DATA or model not in self.SUMMARY_DATA:
+            if (model not in self.KEYWORD_DATA or model not in self.SUMMARY_DATA) and model not in self.SELF_REPORTED:
                 continue
 
-            model_data = self.KEYWORD_DATA[model]
-            summary = self.SUMMARY_DATA[model]
-
             # Basic model information
             row = {
                 "Models": get_display_model_name(model, as_link=True),
-                "Overall": round(summary["overall_score"] * 100, 2),
-                "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
-                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
 
-            # Add dimension-specific scores
-            if original_dimension in model_data:
-                for display_name in self.SUPER_GROUPS[selected_super_group]:
-                    original_keyword = self.keyword_display_map[display_name]
-                    if original_keyword in model_data[original_dimension]:
-                        row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
-                    else:
-                        row[display_name] = None
-            else:
+            # Add asterisk for self-reported results
+            if model in self.SELF_REPORTED:
+                # Store numeric value for sorting but display with asterisk
+                row["Overall"] = self.SELF_REPORTED[model]
+                row["Overall_display"] = f"{self.SELF_REPORTED[model]:.2f}*"
+                row["Core"] = None
+                row["Open-ended"] = None
                 for display_name in self.SUPER_GROUPS[selected_super_group]:
                     row[display_name] = None
+            else:
+                model_data = self.KEYWORD_DATA[model]
+                summary = self.SUMMARY_DATA[model]
+
+                # Store numeric values
+                overall_score = round(summary["overall_score"] * 100, 2)
+                row["Overall"] = overall_score
+                row["Overall_display"] = f"{overall_score:.2f}"
+                row["Core"] = round(summary["core"]["macro_mean_score"] * 100, 2)
+                row["Open-ended"] = round(summary["open"]["macro_mean_score"] * 100, 2)
 
+                # Add dimension-specific scores
+                if original_dimension in model_data:
+                    for display_name in self.SUPER_GROUPS[selected_super_group]:
+                        original_keyword = self.keyword_display_map[display_name]
+                        if original_keyword in model_data[original_dimension]:
+                            row[display_name] = round(
+                                model_data[original_dimension][original_keyword]["average_score"] * 100, 2
+                            )
+                        else:
+                            row[display_name] = None
+                else:
+                    for display_name in self.SUPER_GROUPS[selected_super_group]:
+                        row[display_name] = None
+
             data.append(row)
-
+
         df = pd.DataFrame(data)
+        # Sort by numeric Overall column
         df = df.sort_values(by="Overall", ascending=False)
+
+        # Replace None with "-" for display
+        display_cols = ["Core", "Open-ended"] + self.SUPER_GROUPS[selected_super_group]
+        df[display_cols] = df[display_cols].fillna("-")
+
+        # Replace Overall with Overall_display
+        df["Overall"] = df["Overall_display"]
+        df = df.drop("Overall_display", axis=1)
+
         return df
 
     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)
-
+
         # Get total task counts from the first model's data
         sample_model = "GPT_4o"
         total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
@@ -134,39 +169,42 @@ class MEGABenchEvalDataLoader:
             "Models": "Models",
             "Overall": f"Overall\n({total_tasks})",
             "Core": f"Core\n({total_core_tasks})",
-            "Open-ended": f"Open-ended\n({total_open_tasks})"
+            "Open-ended": f"Open-ended\n({total_open_tasks})",
         }
-
+
         # Add rank column to DataFrame
         df = df.reset_index(drop=True)
-        df.insert(0, 'Rank', range(1, len(df) + 1))
-
+        df.insert(0, "Rank", range(1, len(df) + 1))
+
         # Rename the columns in DataFrame to match headers
         df = df.rename(columns=column_headers)
-
+
         # For dimension columns, add task counts on new line
         dimension_headers = []
         for display_name in self.SUPER_GROUPS[selected_super_group]:
-            task_count = display_name.split('(')[1].rstrip(')')
-            base_name = display_name.split('(')[0]
+            task_count = display_name.split("(")[1].rstrip(")")
+            base_name = display_name.split("(")[0]
             dimension_headers.append(f"{base_name}\n({task_count})")
-
+
         headers = [
             column_headers["Rank"],
             column_headers["Models"],
             column_headers["Overall"],
             column_headers["Core"],
-            column_headers["Open-ended"]
+            column_headers["Open-ended"],
         ] + dimension_headers
-
-        data = df[[
-            column_headers["Rank"],
-            column_headers["Models"],
-            column_headers["Overall"],
-            column_headers["Core"],
-            column_headers["Open-ended"]
-        ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
-
+
+        data = df[
+            [
+                column_headers["Rank"],
+                column_headers["Models"],
+                column_headers["Overall"],
+                column_headers["Core"],
+                column_headers["Open-ended"],
+            ]
+            + self.SUPER_GROUPS[selected_super_group]
+        ].values.tolist()
+
         return headers, data
 
 
@@ -174,12 +212,13 @@ class MEGABenchEvalDataLoader:
 def get_original_dimension(mapped_dimension):
     return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
 
+
 def get_original_keyword(mapped_keyword):
     return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
 
+
 def get_display_model_name(model_name: str, as_link: bool = True) -> str:
     display_name = MODEL_NAME_MAP.get(model_name, model_name)
     if as_link and model_name in MODEL_URLS:
         return f'<a href="{MODEL_URLS[model_name]}" target="_blank" style="text-decoration: none; color: #2196F3;">{display_name}</a>'
     return display_name
-
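With these changes, a self-reported model becomes a leaderboard row whose numeric Overall is used for sorting, whose displayed Overall carries an asterisk, and whose Core, Open-ended, and keyword columns show "-". A rough usage sketch, assuming the Space points the loader at static/eval_results/Default (the directory where self_reported.json was added); the exact app.py wiring is not shown in this diff:

```python
# Usage sketch only; the actual app.py wiring may differ.
from utils import MEGABenchEvalDataLoader

loader = MEGABenchEvalDataLoader("./static/eval_results/Default")

# "Application" is one of the super groups ordered in _initialize_super_groups;
# "All" is the model group derived from MODEL_NAME_MAP.
headers, rows = loader.get_leaderboard_data("Application", "All")

# Self-reported models (e.g. Qwen2.5-VL-72B) show an asterisked Overall such as
# "51.30*" and "-" in the Core, Open-ended, and keyword columns.
print(headers[:5])
print(rows[0][:5])
```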