Spaces:
Running
Running
Amber Tanaka
committed on
copy changes around graph (#47)
Browse files- category_page_builder.py +2 -0
- content.py +13 -2
- leaderboard_transformer.py +2 -2
- main_page.py +2 -0
- ui_components.py +3 -2
category_page_builder.py
CHANGED
@@ -20,6 +20,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
20 |
with gr.Tab("Results: Test Set") as test_tab:
|
21 |
# Repeat the process for the "test" split
|
22 |
if not test_df.empty:
|
|
|
23 |
create_leaderboard_display(
|
24 |
full_df=test_df,
|
25 |
tag_map=test_tag_map,
|
@@ -36,6 +37,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
36 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
37 |
# 1. Load all necessary data for the "validation" split ONCE.
|
38 |
if not validation_df.empty:
|
|
|
39 |
# 2. Render the main category display using the loaded data.
|
40 |
create_leaderboard_display(
|
41 |
full_df=validation_df,
|
|
|
20 |
with gr.Tab("Results: Test Set") as test_tab:
|
21 |
# Repeat the process for the "test" split
|
22 |
if not test_df.empty:
|
23 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
24 |
create_leaderboard_display(
|
25 |
full_df=test_df,
|
26 |
tag_map=test_tag_map,
|
|
|
37 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
38 |
# 1. Load all necessary data for the "validation" split ONCE.
|
39 |
if not validation_df.empty:
|
40 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
41 |
# 2. Render the main category display using the loaded data.
|
42 |
create_leaderboard_display(
|
43 |
full_df=validation_df,
|
content.py
CHANGED
@@ -31,7 +31,18 @@ Each category page includes a summary table (average score and cost per problem
|
|
31 |
🔍 Learn more in the AstaBench technical blog post
|
32 |
"""
|
33 |
SCATTER_DISCLAIMER = """
|
34 |
-
Note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"""
|
36 |
PARETO_DISCLAIMER = """
|
37 |
Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
@@ -164,7 +175,7 @@ table.gr-table {
|
|
164 |
padding-top: 0 !important;
|
165 |
}
|
166 |
#scatter-disclaimer {
|
167 |
-
|
168 |
}
|
169 |
#pareto-disclaimer {
|
170 |
color: #f0529c !important;
|
|
|
31 |
🔍 Learn more in the AstaBench technical blog post
|
32 |
"""
|
33 |
SCATTER_DISCLAIMER = """
|
34 |
+
**Note:** Agents without cost data are displayed to the right of the vertical divider line. <span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line: Max Cost + (MaxCost/10) Missing Cost Datapoints/No Cost Data = Max Cost + (MaxCost/5)">ⓘ</span>
|
35 |
+
"""
|
36 |
+
scatter_disclaimer_html = """
|
37 |
+
<div class="disclaimer-text">
|
38 |
+
<b>Note:</b> Agents without cost data are displayed to the right of the vertical divider line.
|
39 |
+
<span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line:
|
40 |
+
Max Cost + (MaxCost/10)
|
41 |
+
Missing Cost Datapoints / No Cost Data:
|
42 |
+
Max Cost + (MaxCost/5)">
|
43 |
+
ⓘ
|
44 |
+
</span>
|
45 |
+
</div>
|
46 |
"""
|
47 |
PARETO_DISCLAIMER = """
|
48 |
Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
|
|
175 |
padding-top: 0 !important;
|
176 |
}
|
177 |
#scatter-disclaimer {
|
178 |
+
overflow: visible !important;
|
179 |
}
|
180 |
#pareto-disclaimer {
|
181 |
color: #f0529c !important;
|
leaderboard_transformer.py
CHANGED
@@ -371,7 +371,7 @@ def _plot_scatter_plotly(
|
|
371 |
data_plot = data.copy()
|
372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
373 |
|
374 |
-
x_axis_label = f"
|
375 |
max_reported_cost = 0
|
376 |
divider_line_x = 0
|
377 |
|
@@ -552,7 +552,7 @@ def _plot_scatter_plotly(
|
|
552 |
template="plotly_white",
|
553 |
title=f"Astabench {name} Leaderboard",
|
554 |
xaxis=xaxis_config, # Use the updated config
|
555 |
-
yaxis=dict(title="
|
556 |
legend=dict(
|
557 |
bgcolor='#FAF2E9',
|
558 |
),
|
|
|
371 |
data_plot = data.copy()
|
372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
373 |
|
374 |
+
x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
|
375 |
max_reported_cost = 0
|
376 |
divider_line_x = 0
|
377 |
|
|
|
552 |
template="plotly_white",
|
553 |
title=f"Astabench {name} Leaderboard",
|
554 |
xaxis=xaxis_config, # Use the updated config
|
555 |
+
yaxis=dict(title="Average (mean) score", rangemode="tozero"),
|
556 |
legend=dict(
|
557 |
bgcolor='#FAF2E9',
|
558 |
),
|
main_page.py
CHANGED
@@ -26,6 +26,7 @@ def build_page():
|
|
26 |
with gr.Tab("Results: Test Set") as test_tab:
|
27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
28 |
if not test_df.empty:
|
|
|
29 |
create_leaderboard_display(
|
30 |
full_df=test_df,
|
31 |
tag_map=test_tag_map,
|
@@ -39,6 +40,7 @@ def build_page():
|
|
39 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
40 |
# Check if data was loaded successfully before trying to display it
|
41 |
if not validation_df.empty:
|
|
|
42 |
# 2. Render the display by calling the factory with the loaded data.
|
43 |
create_leaderboard_display(
|
44 |
full_df=validation_df,
|
|
|
26 |
with gr.Tab("Results: Test Set") as test_tab:
|
27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
28 |
if not test_df.empty:
|
29 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
30 |
create_leaderboard_display(
|
31 |
full_df=test_df,
|
32 |
tag_map=test_tag_map,
|
|
|
40 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
41 |
# Check if data was loaded successfully before trying to display it
|
42 |
if not validation_df.empty:
|
43 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
44 |
# 2. Render the display by calling the factory with the loaded data.
|
45 |
create_leaderboard_display(
|
46 |
full_df=validation_df,
|
ui_components.py
CHANGED
@@ -20,7 +20,7 @@ from leaderboard_transformer import (
|
|
20 |
clean_llm_base_list,
|
21 |
)
|
22 |
from content import (
|
23 |
-
|
24 |
format_error,
|
25 |
format_log,
|
26 |
format_warning,
|
@@ -340,7 +340,7 @@ def create_leaderboard_display(
|
|
340 |
value=scatter_plot,
|
341 |
show_label=False
|
342 |
)
|
343 |
-
|
344 |
# Put table and key into an accordion
|
345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
@@ -497,6 +497,7 @@ def create_benchmark_details_display(
|
|
497 |
name=benchmark_name
|
498 |
)
|
499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
|
|
500 |
# Put table and key into an accordion
|
501 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
502 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
20 |
clean_llm_base_list,
|
21 |
)
|
22 |
from content import (
|
23 |
+
scatter_disclaimer_html,
|
24 |
format_error,
|
25 |
format_log,
|
26 |
format_warning,
|
|
|
340 |
value=scatter_plot,
|
341 |
show_label=False
|
342 |
)
|
343 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
344 |
# Put table and key into an accordion
|
345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
497 |
name=benchmark_name
|
498 |
)
|
499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
500 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
501 |
# Put table and key into an accordion
|
502 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
503 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|