Smita R Smita committed on
Commit
2c742e8
·
unverified ·
1 Parent(s): d2de222

more eval ordering changes (#43)

Browse files
Files changed (3) hide show
  1. c_and_e.py +1 -1
  2. e2e.py +1 -1
  3. leaderboard_transformer.py +14 -5
c_and_e.py CHANGED
@@ -3,7 +3,7 @@ from content import CODE_EXECUTION_DESCRIPTION
3
  from category_page_builder import build_category_page
4
 
5
  # Define the category for this page
6
- CATEGORY_NAME = "Code Execution"
7
 
8
  def build_page():
9
  build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
 
3
  from category_page_builder import build_category_page
4
 
5
  # Define the category for this page
6
+ CATEGORY_NAME = "Code & Execution"
7
 
8
  def build_page():
9
  build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
e2e.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  from content import DISCOVERY_DESCRIPTION
3
  from category_page_builder import build_category_page
4
  # Define the category for this page
5
- CATEGORY_NAME = "Discovery"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
 
2
  from content import DISCOVERY_DESCRIPTION
3
  from category_page_builder import build_category_page
4
  # Define the category for this page
5
+ CATEGORY_NAME = "End-to-End Discovery"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
leaderboard_transformer.py CHANGED
@@ -11,9 +11,9 @@ logger = logging.getLogger(__name__)
11
  INFORMAL_TO_FORMAL_NAME_MAP = {
12
  # Short Names
13
  "lit": "Literature Understanding",
 
14
  "data": "Data Analysis",
15
- "code": "Code Execution",
16
- "discovery": "Discovery",
17
 
18
  # Validation Names
19
  "arxivdigestables_validation": "ArxivDIGESTables-Clean",
@@ -41,6 +41,12 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
41
  "super_test": "SUPER-Expert",
42
  }
43
  ORDER_MAP = {
 
 
 
 
 
 
44
  'Literature Understanding': [
45
  'PaperFindingBench',
46
  'LitQA2-FullText-Search',
@@ -48,7 +54,7 @@ ORDER_MAP = {
48
  'LitQA2-FullText',
49
  'ArxivDIGESTables-Clean'
50
  ],
51
- 'Code Execution': [
52
  'SUPER-Expert',
53
  'CORE-Bench-Hard',
54
  'DS-1000'
@@ -117,7 +123,10 @@ def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
117
  def get_pretty(raw_name):
118
  return name_map.get(raw_name, raw_name.replace("_", " "))
119
 
120
- for raw_key, raw_value_list in raw_tag_map.items():
 
 
 
121
  pretty_key = get_pretty(raw_key)
122
  pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
123
 
@@ -271,7 +280,7 @@ class DataTransformer:
271
  # Calculate and add "Categories Attempted" column
272
  if primary_metric == "Overall":
273
  def calculate_attempted(row):
274
- main_categories = ['Literature Understanding', 'Data Analysis', 'Code Execution', 'Discovery']
275
  count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
276
 
277
  # Return the formatted string with the correct emoji
 
11
  INFORMAL_TO_FORMAL_NAME_MAP = {
12
  # Short Names
13
  "lit": "Literature Understanding",
14
+ "code": "Code & Execution",
15
  "data": "Data Analysis",
16
+ "discovery": "End-to-End Discovery",
 
17
 
18
  # Validation Names
19
  "arxivdigestables_validation": "ArxivDIGESTables-Clean",
 
41
  "super_test": "SUPER-Expert",
42
  }
43
  ORDER_MAP = {
44
+ 'Overall_keys': [
45
+ 'lit',
46
+ 'code',
47
+ 'data',
48
+ 'discovery',
49
+ ],
50
  'Literature Understanding': [
51
  'PaperFindingBench',
52
  'LitQA2-FullText-Search',
 
54
  'LitQA2-FullText',
55
  'ArxivDIGESTables-Clean'
56
  ],
57
+ 'Code & Execution': [
58
  'SUPER-Expert',
59
  'CORE-Bench-Hard',
60
  'DS-1000'
 
123
  def get_pretty(raw_name):
124
  return name_map.get(raw_name, raw_name.replace("_", " "))
125
 
126
+ key_order = ORDER_MAP.get('Overall_keys', [])
127
+ sorted_keys = sorted(raw_tag_map.keys(), key=lambda x: key_order.index(x) if x in key_order else len(key_order))
128
+ for raw_key in sorted_keys:
129
+ raw_value_list = raw_tag_map[raw_key]
130
  pretty_key = get_pretty(raw_key)
131
  pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
132
 
 
280
  # Calculate and add "Categories Attempted" column
281
  if primary_metric == "Overall":
282
  def calculate_attempted(row):
283
+ main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
284
  count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
285
 
286
  # Return the formatted string with the correct emoji