Spaces:
Paused
Paused
Amber Tanaka
committed on
Fix test data display (#6)
Browse files
- c_and_e.py +27 -4
- data_analysis.py +26 -4
- e2e.py +25 -4
- leaderboard_transformer.py +13 -1
- literature_understanding.py +27 -4
- ui_components.py +1 -3
c_and_e.py
CHANGED
|
@@ -12,13 +12,16 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
| 21 |
-
with gr.Tab("Results: Validation"):
|
| 22 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 23 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 24 |
|
|
@@ -40,7 +43,7 @@ with gr.Blocks() as demo:
|
|
| 40 |
else:
|
| 41 |
gr.Markdown("No data available for validation split.")
|
| 42 |
|
| 43 |
-
with gr.Tab("Results: Test"):
|
| 44 |
# Repeat the process for the "test" split
|
| 45 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 46 |
|
|
@@ -57,4 +60,24 @@ with gr.Blocks() as demo:
|
|
| 57 |
category_name=CATEGORY_NAME
|
| 58 |
)
|
| 59 |
else:
|
| 60 |
-
gr.Markdown("No data available for test split.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
| 18 |
+
with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
|
| 19 |
+
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 20 |
+
|
| 21 |
|
| 22 |
# --- This page now has two main sections: Validation and Test ---
|
| 23 |
with gr.Tabs():
|
| 24 |
+
with gr.Tab("Results: Validation") as validation_tab:
|
| 25 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 26 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 27 |
|
|
|
|
| 43 |
else:
|
| 44 |
gr.Markdown("No data available for validation split.")
|
| 45 |
|
| 46 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 47 |
# Repeat the process for the "test" split
|
| 48 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 49 |
|
|
|
|
| 60 |
category_name=CATEGORY_NAME
|
| 61 |
)
|
| 62 |
else:
|
| 63 |
+
gr.Markdown("No data available for test split.")
|
| 64 |
+
|
| 65 |
+
show_validation_js = """
|
| 66 |
+
() => {
|
| 67 |
+
document.getElementById('validation_nav_container').style.display = 'block';
|
| 68 |
+
document.getElementById('test_nav_container').style.display = 'none';
|
| 69 |
+
}
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 73 |
+
show_test_js = """
|
| 74 |
+
() => {
|
| 75 |
+
document.getElementById('validation_nav_container').style.display = 'none';
|
| 76 |
+
document.getElementById('test_nav_container').style.display = 'block';
|
| 77 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 78 |
+
}
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 82 |
+
validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
|
| 83 |
+
test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
|
data_analysis.py
CHANGED
|
@@ -12,12 +12,14 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
|
|
|
|
|
|
| 18 |
# --- This page now has two main sections: Validation and Test ---
|
| 19 |
with gr.Tabs():
|
| 20 |
-
with gr.Tab("Results: Validation"):
|
| 21 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 22 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 23 |
|
|
@@ -39,7 +41,7 @@ with gr.Blocks() as demo:
|
|
| 39 |
else:
|
| 40 |
gr.Markdown("No data available for validation split.")
|
| 41 |
|
| 42 |
-
with gr.Tab("Results: Test"):
|
| 43 |
# Repeat the process for the "test" split
|
| 44 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 45 |
|
|
@@ -56,4 +58,24 @@ with gr.Blocks() as demo:
|
|
| 56 |
category_name=CATEGORY_NAME
|
| 57 |
)
|
| 58 |
else:
|
| 59 |
-
gr.Markdown("No data available for test split.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
| 18 |
+
with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
|
| 19 |
+
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 20 |
# --- This page now has two main sections: Validation and Test ---
|
| 21 |
with gr.Tabs():
|
| 22 |
+
with gr.Tab("Results: Validation") as validation_tab:
|
| 23 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 24 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 25 |
|
|
|
|
| 41 |
else:
|
| 42 |
gr.Markdown("No data available for validation split.")
|
| 43 |
|
| 44 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 45 |
# Repeat the process for the "test" split
|
| 46 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 47 |
|
|
|
|
| 58 |
category_name=CATEGORY_NAME
|
| 59 |
)
|
| 60 |
else:
|
| 61 |
+
gr.Markdown("No data available for test split.")
|
| 62 |
+
|
| 63 |
+
show_validation_js = """
|
| 64 |
+
() => {
|
| 65 |
+
document.getElementById('validation_nav_container').style.display = 'block';
|
| 66 |
+
document.getElementById('test_nav_container').style.display = 'none';
|
| 67 |
+
}
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 71 |
+
show_test_js = """
|
| 72 |
+
() => {
|
| 73 |
+
document.getElementById('validation_nav_container').style.display = 'none';
|
| 74 |
+
document.getElementById('test_nav_container').style.display = 'block';
|
| 75 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 76 |
+
}
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 80 |
+
validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
|
| 81 |
+
test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
|
e2e.py
CHANGED
|
@@ -12,12 +12,14 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
|
|
|
|
|
|
| 18 |
# --- This page now has two main sections: Validation and Test ---
|
| 19 |
with gr.Tabs():
|
| 20 |
-
with gr.Tab("Results: Validation"):
|
| 21 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 22 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 23 |
|
|
@@ -39,7 +41,7 @@ with gr.Blocks() as demo:
|
|
| 39 |
else:
|
| 40 |
gr.Markdown("No data available for validation split.")
|
| 41 |
|
| 42 |
-
with gr.Tab("Results: Test"):
|
| 43 |
# Repeat the process for the "test" split
|
| 44 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 45 |
|
|
@@ -56,4 +58,23 @@ with gr.Blocks() as demo:
|
|
| 56 |
category_name=CATEGORY_NAME
|
| 57 |
)
|
| 58 |
else:
|
| 59 |
-
gr.Markdown("No data available for test split.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
|
| 18 |
+
with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
|
| 19 |
+
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 20 |
# --- This page now has two main sections: Validation and Test ---
|
| 21 |
with gr.Tabs():
|
| 22 |
+
with gr.Tab("Results: Validation") as validation_tab:
|
| 23 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 24 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 25 |
|
|
|
|
| 41 |
else:
|
| 42 |
gr.Markdown("No data available for validation split.")
|
| 43 |
|
| 44 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 45 |
# Repeat the process for the "test" split
|
| 46 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 47 |
|
|
|
|
| 58 |
category_name=CATEGORY_NAME
|
| 59 |
)
|
| 60 |
else:
|
| 61 |
+
gr.Markdown("No data available for test split.")
|
| 62 |
+
show_validation_js = """
|
| 63 |
+
() => {
|
| 64 |
+
document.getElementById('validation_nav_container').style.display = 'block';
|
| 65 |
+
document.getElementById('test_nav_container').style.display = 'none';
|
| 66 |
+
}
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 70 |
+
show_test_js = """
|
| 71 |
+
() => {
|
| 72 |
+
document.getElementById('validation_nav_container').style.display = 'none';
|
| 73 |
+
document.getElementById('test_nav_container').style.display = 'block';
|
| 74 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 75 |
+
}
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 79 |
+
validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
|
| 80 |
+
test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
|
leaderboard_transformer.py
CHANGED
|
@@ -14,7 +14,7 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
|
|
| 14 |
"code": "Code Execution",
|
| 15 |
"discovery": "Discovery",
|
| 16 |
|
| 17 |
-
#
|
| 18 |
"arxivdigestables_validation": "Arxivdigestables Validation",
|
| 19 |
"sqa_dev": "Sqa Dev",
|
| 20 |
"litqa2_validation": "Litqa2 Validation",
|
|
@@ -24,6 +24,18 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
|
|
| 24 |
"ds1000_validation": "DS1000 Validation",
|
| 25 |
"e2e_discovery_validation": "E2E Discovery Validation",
|
| 26 |
"super_validation": "Super Validation",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
}
|
| 28 |
|
| 29 |
|
|
|
|
| 14 |
"code": "Code Execution",
|
| 15 |
"discovery": "Discovery",
|
| 16 |
|
| 17 |
+
# Validation Names
|
| 18 |
"arxivdigestables_validation": "Arxivdigestables Validation",
|
| 19 |
"sqa_dev": "Sqa Dev",
|
| 20 |
"litqa2_validation": "Litqa2 Validation",
|
|
|
|
| 24 |
"ds1000_validation": "DS1000 Validation",
|
| 25 |
"e2e_discovery_validation": "E2E Discovery Validation",
|
| 26 |
"super_validation": "Super Validation",
|
| 27 |
+
# Test Names
|
| 28 |
+
"paper_finder_test": "Paper Finder Test",
|
| 29 |
+
"paper_finder_litqa2_test": "Paper Finder Litqa2 Test",
|
| 30 |
+
"sqa_test": "Sqa Test",
|
| 31 |
+
"arxivdigestables_test": "Arxivdigestables Test",
|
| 32 |
+
"litqa2_test": "Litqa2 Test",
|
| 33 |
+
"discoverybench_test": "Discoverybench Test",
|
| 34 |
+
"core_bench_test": "Core Bench Test",
|
| 35 |
+
"ds1000_test": "DS1000 Test",
|
| 36 |
+
"e2e_discovery_test": "E2E Discovery Test",
|
| 37 |
+
"e2e_discovery_hard_test": "E2E Discovery Hard Test",
|
| 38 |
+
"super_test": "Super Test",
|
| 39 |
}
|
| 40 |
|
| 41 |
|
literature_understanding.py
CHANGED
|
@@ -13,12 +13,15 @@ with gr.Blocks() as demo:
|
|
| 13 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 14 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 15 |
gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
|
| 16 |
-
|
| 17 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
| 21 |
-
with gr.Tab("Results: Validation"):
|
| 22 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 23 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 24 |
|
|
@@ -40,7 +43,7 @@ with gr.Blocks() as demo:
|
|
| 40 |
else:
|
| 41 |
gr.Markdown("No data available for validation split.")
|
| 42 |
|
| 43 |
-
with gr.Tab("Results: Test"):
|
| 44 |
# Repeat the process for the "test" split
|
| 45 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 46 |
|
|
@@ -57,4 +60,24 @@ with gr.Blocks() as demo:
|
|
| 57 |
category_name=CATEGORY_NAME
|
| 58 |
)
|
| 59 |
else:
|
| 60 |
-
gr.Markdown("No data available for test split.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 14 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 15 |
gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
|
| 16 |
+
with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
|
| 17 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 18 |
|
| 19 |
+
with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
|
| 20 |
+
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 21 |
+
|
| 22 |
# --- This page now has two main sections: Validation and Test ---
|
| 23 |
with gr.Tabs():
|
| 24 |
+
with gr.Tab("Results: Validation") as validation_tab:
|
| 25 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 26 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 27 |
|
|
|
|
| 43 |
else:
|
| 44 |
gr.Markdown("No data available for validation split.")
|
| 45 |
|
| 46 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 47 |
# Repeat the process for the "test" split
|
| 48 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 49 |
|
|
|
|
| 60 |
category_name=CATEGORY_NAME
|
| 61 |
)
|
| 62 |
else:
|
| 63 |
+
gr.Markdown("No data available for test split.")
|
| 64 |
+
|
| 65 |
+
show_validation_js = """
|
| 66 |
+
() => {
|
| 67 |
+
document.getElementById('validation_nav_container').style.display = 'block';
|
| 68 |
+
document.getElementById('test_nav_container').style.display = 'none';
|
| 69 |
+
}
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 73 |
+
show_test_js = """
|
| 74 |
+
() => {
|
| 75 |
+
document.getElementById('validation_nav_container').style.display = 'none';
|
| 76 |
+
document.getElementById('test_nav_container').style.display = 'block';
|
| 77 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 78 |
+
}
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 82 |
+
validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
|
| 83 |
+
test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
|
ui_components.py
CHANGED
|
@@ -28,7 +28,7 @@ from content import (
|
|
| 28 |
|
| 29 |
# --- Constants and Configuration ---
|
| 30 |
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
|
| 31 |
-
CONFIG_NAME = "1.0.0-
|
| 32 |
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
|
| 33 |
|
| 34 |
OWNER = "allenai"
|
|
@@ -213,7 +213,6 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 213 |
Loads and transforms the complete dataset for a given split.
|
| 214 |
This function handles caching and returns the final "pretty" DataFrame and tag map.
|
| 215 |
"""
|
| 216 |
-
# This reuses your existing robust caching logic
|
| 217 |
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
|
| 218 |
|
| 219 |
if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
|
|
@@ -291,7 +290,6 @@ def create_benchmark_details_display(
|
|
| 291 |
|
| 292 |
# 2. Loop through each benchmark and create its UI components
|
| 293 |
for benchmark_name in benchmark_names:
|
| 294 |
-
with gr.Blocks():
|
| 295 |
gr.Markdown(f"### {benchmark_name}", header_links=True)
|
| 296 |
|
| 297 |
# 3. Prepare the data for this specific benchmark's table and plot
|
|
|
|
| 28 |
|
| 29 |
# --- Constants and Configuration ---
|
| 30 |
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
|
| 31 |
+
CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
|
| 32 |
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
|
| 33 |
|
| 34 |
OWNER = "allenai"
|
|
|
|
| 213 |
Loads and transforms the complete dataset for a given split.
|
| 214 |
This function handles caching and returns the final "pretty" DataFrame and tag map.
|
| 215 |
"""
|
|
|
|
| 216 |
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
|
| 217 |
|
| 218 |
if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
|
|
|
|
| 290 |
|
| 291 |
# 2. Loop through each benchmark and create its UI components
|
| 292 |
for benchmark_name in benchmark_names:
|
|
|
|
| 293 |
gr.Markdown(f"### {benchmark_name}", header_links=True)
|
| 294 |
|
| 295 |
# 3. Prepare the data for this specific benchmark's table and plot
|