Spaces:
Running
Running
Amber Tanaka
committed on
copy changes around graph (#47)
Browse files- category_page_builder.py +2 -0
- content.py +13 -2
- leaderboard_transformer.py +2 -2
- main_page.py +2 -0
- ui_components.py +3 -2
category_page_builder.py
CHANGED
@@ -20,6 +20,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
20 |
with gr.Tab("Results: Test Set") as test_tab:
|
21 |
# Repeat the process for the "test" split
|
22 |
if not test_df.empty:
|
|
|
23 |
create_leaderboard_display(
|
24 |
full_df=test_df,
|
25 |
tag_map=test_tag_map,
|
@@ -36,6 +37,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
36 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
37 |
# 1. Load all necessary data for the "validation" split ONCE.
|
38 |
if not validation_df.empty:
|
|
|
39 |
# 2. Render the main category display using the loaded data.
|
40 |
create_leaderboard_display(
|
41 |
full_df=validation_df,
|
|
|
20 |
with gr.Tab("Results: Test Set") as test_tab:
|
21 |
# Repeat the process for the "test" split
|
22 |
if not test_df.empty:
|
23 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
24 |
create_leaderboard_display(
|
25 |
full_df=test_df,
|
26 |
tag_map=test_tag_map,
|
|
|
37 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
38 |
# 1. Load all necessary data for the "validation" split ONCE.
|
39 |
if not validation_df.empty:
|
40 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
41 |
# 2. Render the main category display using the loaded data.
|
42 |
create_leaderboard_display(
|
43 |
full_df=validation_df,
|
content.py
CHANGED
@@ -31,7 +31,18 @@ Each category page includes a summary table (average score and cost per problem
|
|
31 |
🔍 Learn more in the AstaBench technical blog post
|
32 |
"""
|
33 |
SCATTER_DISCLAIMER = """
|
34 |
-
Note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"""
|
36 |
PARETO_DISCLAIMER = """
|
37 |
Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
@@ -164,7 +175,7 @@ table.gr-table {
|
|
164 |
padding-top: 0 !important;
|
165 |
}
|
166 |
#scatter-disclaimer {
|
167 |
-
|
168 |
}
|
169 |
#pareto-disclaimer {
|
170 |
color: #f0529c !important;
|
|
|
31 |
🔍 Learn more in the AstaBench technical blog post
|
32 |
"""
|
33 |
SCATTER_DISCLAIMER = """
|
34 |
+
**Note:** Agents without cost data are displayed to the right of the vertical divider line. <span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line: Max Cost + (MaxCost/10) Missing Cost Datapoints/No Cost Data = Max Cost + (MaxCost/5)">ⓘ</span>
|
35 |
+
"""
|
36 |
+
scatter_disclaimer_html = """
|
37 |
+
<div class="disclaimer-text">
|
38 |
+
<b>Note:</b> Agents without cost data are displayed to the right of the vertical divider line.
|
39 |
+
<span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line:
|
40 |
+
Max Cost + (MaxCost/10)
|
41 |
+
Missing Cost Datapoints / No Cost Data:
|
42 |
+
Max Cost + (MaxCost/5)">
|
43 |
+
ⓘ
|
44 |
+
</span>
|
45 |
+
</div>
|
46 |
"""
|
47 |
PARETO_DISCLAIMER = """
|
48 |
Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
|
|
175 |
padding-top: 0 !important;
|
176 |
}
|
177 |
#scatter-disclaimer {
|
178 |
+
overflow: visible !important;
|
179 |
}
|
180 |
#pareto-disclaimer {
|
181 |
color: #f0529c !important;
|
leaderboard_transformer.py
CHANGED
@@ -371,7 +371,7 @@ def _plot_scatter_plotly(
|
|
371 |
data_plot = data.copy()
|
372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
373 |
|
374 |
-
x_axis_label = f"
|
375 |
max_reported_cost = 0
|
376 |
divider_line_x = 0
|
377 |
|
@@ -552,7 +552,7 @@ def _plot_scatter_plotly(
|
|
552 |
template="plotly_white",
|
553 |
title=f"Astabench {name} Leaderboard",
|
554 |
xaxis=xaxis_config, # Use the updated config
|
555 |
-
yaxis=dict(title="
|
556 |
legend=dict(
|
557 |
bgcolor='#FAF2E9',
|
558 |
),
|
|
|
371 |
data_plot = data.copy()
|
372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
373 |
|
374 |
+
x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
|
375 |
max_reported_cost = 0
|
376 |
divider_line_x = 0
|
377 |
|
|
|
552 |
template="plotly_white",
|
553 |
title=f"Astabench {name} Leaderboard",
|
554 |
xaxis=xaxis_config, # Use the updated config
|
555 |
+
yaxis=dict(title="Average (mean) score", rangemode="tozero"),
|
556 |
legend=dict(
|
557 |
bgcolor='#FAF2E9',
|
558 |
),
|
main_page.py
CHANGED
@@ -26,6 +26,7 @@ def build_page():
|
|
26 |
with gr.Tab("Results: Test Set") as test_tab:
|
27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
28 |
if not test_df.empty:
|
|
|
29 |
create_leaderboard_display(
|
30 |
full_df=test_df,
|
31 |
tag_map=test_tag_map,
|
@@ -39,6 +40,7 @@ def build_page():
|
|
39 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
40 |
# Check if data was loaded successfully before trying to display it
|
41 |
if not validation_df.empty:
|
|
|
42 |
# 2. Render the display by calling the factory with the loaded data.
|
43 |
create_leaderboard_display(
|
44 |
full_df=validation_df,
|
|
|
26 |
with gr.Tab("Results: Test Set") as test_tab:
|
27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
28 |
if not test_df.empty:
|
29 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
30 |
create_leaderboard_display(
|
31 |
full_df=test_df,
|
32 |
tag_map=test_tag_map,
|
|
|
40 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
41 |
# Check if data was loaded successfully before trying to display it
|
42 |
if not validation_df.empty:
|
43 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
44 |
# 2. Render the display by calling the factory with the loaded data.
|
45 |
create_leaderboard_display(
|
46 |
full_df=validation_df,
|
ui_components.py
CHANGED
@@ -20,7 +20,7 @@ from leaderboard_transformer import (
|
|
20 |
clean_llm_base_list,
|
21 |
)
|
22 |
from content import (
|
23 |
-
|
24 |
format_error,
|
25 |
format_log,
|
26 |
format_warning,
|
@@ -340,7 +340,7 @@ def create_leaderboard_display(
|
|
340 |
value=scatter_plot,
|
341 |
show_label=False
|
342 |
)
|
343 |
-
|
344 |
# Put table and key into an accordion
|
345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
@@ -497,6 +497,7 @@ def create_benchmark_details_display(
|
|
497 |
name=benchmark_name
|
498 |
)
|
499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
|
|
500 |
# Put table and key into an accordion
|
501 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
502 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
20 |
clean_llm_base_list,
|
21 |
)
|
22 |
from content import (
|
23 |
+
scatter_disclaimer_html,
|
24 |
format_error,
|
25 |
format_log,
|
26 |
format_warning,
|
|
|
340 |
value=scatter_plot,
|
341 |
show_label=False
|
342 |
)
|
343 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
344 |
# Put table and key into an accordion
|
345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
497 |
name=benchmark_name
|
498 |
)
|
499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
500 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
501 |
# Put table and key into an accordion
|
502 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
503 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|