Spaces:
Running
Running
Smita R
Smita
committed on
more eval ordering changes (#43)
Browse files
- c_and_e.py +1 -1
- e2e.py +1 -1
- leaderboard_transformer.py +14 -5
c_and_e.py
CHANGED
|
@@ -3,7 +3,7 @@ from content import CODE_EXECUTION_DESCRIPTION
|
|
| 3 |
from category_page_builder import build_category_page
|
| 4 |
|
| 5 |
# Define the category for this page
|
| 6 |
-
CATEGORY_NAME = "Code Execution"
|
| 7 |
|
| 8 |
def build_page():
|
| 9 |
build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
|
|
|
|
| 3 |
from category_page_builder import build_category_page
|
| 4 |
|
| 5 |
# Define the category for this page
|
| 6 |
+
CATEGORY_NAME = "Code & Execution"
|
| 7 |
|
| 8 |
def build_page():
|
| 9 |
build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
|
e2e.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
from content import DISCOVERY_DESCRIPTION
|
| 3 |
from category_page_builder import build_category_page
|
| 4 |
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Discovery"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
|
|
|
|
| 2 |
from content import DISCOVERY_DESCRIPTION
|
| 3 |
from category_page_builder import build_category_page
|
| 4 |
# Define the category for this page
|
| 5 |
+
CATEGORY_NAME = "End-to-End Discovery"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
|
leaderboard_transformer.py
CHANGED
|
@@ -11,9 +11,9 @@ logger = logging.getLogger(__name__)
|
|
| 11 |
INFORMAL_TO_FORMAL_NAME_MAP = {
|
| 12 |
# Short Names
|
| 13 |
"lit": "Literature Understanding",
|
|
|
|
| 14 |
"data": "Data Analysis",
|
| 15 |
-
"code": "Code Execution",
|
| 16 |
-
"discovery": "Discovery",
|
| 17 |
|
| 18 |
# Validation Names
|
| 19 |
"arxivdigestables_validation": "ArxivDIGESTables-Clean",
|
|
@@ -41,6 +41,12 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
|
|
| 41 |
"super_test": "SUPER-Expert",
|
| 42 |
}
|
| 43 |
ORDER_MAP = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
'Literature Understanding': [
|
| 45 |
'PaperFindingBench',
|
| 46 |
'LitQA2-FullText-Search',
|
|
@@ -48,7 +54,7 @@ ORDER_MAP = {
|
|
| 48 |
'LitQA2-FullText',
|
| 49 |
'ArxivDIGESTables-Clean'
|
| 50 |
],
|
| 51 |
-
'Code Execution': [
|
| 52 |
'SUPER-Expert',
|
| 53 |
'CORE-Bench-Hard',
|
| 54 |
'DS-1000'
|
|
@@ -117,7 +123,10 @@ def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
|
|
| 117 |
def get_pretty(raw_name):
|
| 118 |
return name_map.get(raw_name, raw_name.replace("_", " "))
|
| 119 |
|
| 120 |
-
for raw_key, raw_value_list in raw_tag_map.items():
|
|
|
|
|
|
|
|
|
|
| 121 |
pretty_key = get_pretty(raw_key)
|
| 122 |
pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
|
| 123 |
|
|
@@ -271,7 +280,7 @@ class DataTransformer:
|
|
| 271 |
# Calculated and add "Categories Attempted" column
|
| 272 |
if primary_metric == "Overall":
|
| 273 |
def calculate_attempted(row):
|
| 274 |
-
main_categories = ['Literature Understanding', 'Code Execution', 'Data Analysis', 'Discovery']
|
| 275 |
count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
|
| 276 |
|
| 277 |
# Return the formatted string with the correct emoji
|
|
|
|
| 11 |
INFORMAL_TO_FORMAL_NAME_MAP = {
|
| 12 |
# Short Names
|
| 13 |
"lit": "Literature Understanding",
|
| 14 |
+
"code": "Code & Execution",
|
| 15 |
"data": "Data Analysis",
|
| 16 |
+
"discovery": "End-to-End Discovery",
|
|
|
|
| 17 |
|
| 18 |
# Validation Names
|
| 19 |
"arxivdigestables_validation": "ArxivDIGESTables-Clean",
|
|
|
|
| 41 |
"super_test": "SUPER-Expert",
|
| 42 |
}
|
| 43 |
ORDER_MAP = {
|
| 44 |
+
'Overall_keys': [
|
| 45 |
+
'lit',
|
| 46 |
+
'code',
|
| 47 |
+
'data',
|
| 48 |
+
'discovery',
|
| 49 |
+
],
|
| 50 |
'Literature Understanding': [
|
| 51 |
'PaperFindingBench',
|
| 52 |
'LitQA2-FullText-Search',
|
|
|
|
| 54 |
'LitQA2-FullText',
|
| 55 |
'ArxivDIGESTables-Clean'
|
| 56 |
],
|
| 57 |
+
'Code & Execution': [
|
| 58 |
'SUPER-Expert',
|
| 59 |
'CORE-Bench-Hard',
|
| 60 |
'DS-1000'
|
|
|
|
| 123 |
def get_pretty(raw_name):
|
| 124 |
return name_map.get(raw_name, raw_name.replace("_", " "))
|
| 125 |
|
| 126 |
+
key_order = ORDER_MAP.get('Overall_keys', [])
|
| 127 |
+
sorted_keys = sorted(raw_tag_map.keys(), key=lambda x: key_order.index(x) if x in key_order else len(key_order))
|
| 128 |
+
for raw_key in sorted_keys:
|
| 129 |
+
raw_value_list = raw_tag_map[raw_key]
|
| 130 |
pretty_key = get_pretty(raw_key)
|
| 131 |
pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
|
| 132 |
|
|
|
|
| 280 |
# Calculated and add "Categories Attempted" column
|
| 281 |
if primary_metric == "Overall":
|
| 282 |
def calculate_attempted(row):
|
| 283 |
+
main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
|
| 284 |
count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
|
| 285 |
|
| 286 |
# Return the formatted string with the correct emoji
|