Amber Tanaka committed on
Commit
02a4349
·
unverified ·
1 Parent(s): fbcb5bb

copy changes around graph (#47)

Browse files
category_page_builder.py CHANGED
@@ -20,6 +20,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
20
  with gr.Tab("Results: Test Set") as test_tab:
21
  # Repeat the process for the "test" split
22
  if not test_df.empty:
 
23
  create_leaderboard_display(
24
  full_df=test_df,
25
  tag_map=test_tag_map,
@@ -36,6 +37,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
36
  with gr.Tab("Results: Validation Set") as validation_tab:
37
  # 1. Load all necessary data for the "validation" split ONCE.
38
  if not validation_df.empty:
 
39
  # 2. Render the main category display using the loaded data.
40
  create_leaderboard_display(
41
  full_df=validation_df,
 
20
  with gr.Tab("Results: Test Set") as test_tab:
21
  # Repeat the process for the "test" split
22
  if not test_df.empty:
23
+ gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
24
  create_leaderboard_display(
25
  full_df=test_df,
26
  tag_map=test_tag_map,
 
37
  with gr.Tab("Results: Validation Set") as validation_tab:
38
  # 1. Load all necessary data for the "validation" split ONCE.
39
  if not validation_df.empty:
40
+ gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
41
  # 2. Render the main category display using the loaded data.
42
  create_leaderboard_display(
43
  full_df=validation_df,
content.py CHANGED
@@ -31,7 +31,18 @@ Each category page includes a summary table (average score and cost per problem
31
  🔍 Learn more in the AstaBench technical blog post
32
  """
33
  SCATTER_DISCLAIMER = """
34
- Note: Only agents with valid cost data are shown in the scatter plot, as both performance and efficiency are required for comparison. Agents without cost data still appear in the tables below.
 
 
 
 
 
 
 
 
 
 
 
35
  """
36
  PARETO_DISCLAIMER = """
37
  Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
@@ -164,7 +175,7 @@ table.gr-table {
164
  padding-top: 0 !important;
165
  }
166
  #scatter-disclaimer {
167
- color: #f0529c !important;
168
  }
169
  #pareto-disclaimer {
170
  color: #f0529c !important;
 
31
  🔍 Learn more in the AstaBench technical blog post
32
  """
33
  SCATTER_DISCLAIMER = """
34
+ **Note:** Agents without cost data are displayed to the right of the vertical divider line. <span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line: Max Cost + (MaxCost/10) Missing Cost Datapoints/No Cost Data = Max Cost + (MaxCost/5)">ⓘ</span>
35
+ """
36
+ scatter_disclaimer_html = """
37
+ <div class="disclaimer-text">
38
+ <b>Note:</b> Agents without cost data are displayed to the right of the vertical divider line.
39
+ <span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line:
40
+ Max Cost + (MaxCost/10)
41
+ Missing Cost Datapoints / No Cost Data:
42
+ Max Cost + (MaxCost/5)">
43
+
44
+ </span>
45
+ </div>
46
  """
47
  PARETO_DISCLAIMER = """
48
  Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
 
175
  padding-top: 0 !important;
176
  }
177
  #scatter-disclaimer {
178
+ overflow: visible !important;
179
  }
180
  #pareto-disclaimer {
181
  color: #f0529c !important;
leaderboard_transformer.py CHANGED
@@ -371,7 +371,7 @@ def _plot_scatter_plotly(
371
  data_plot = data.copy()
372
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
373
 
374
- x_axis_label = f"Cost per problem (USD)" if x else "Cost (Data N/A)"
375
  max_reported_cost = 0
376
  divider_line_x = 0
377
 
@@ -552,7 +552,7 @@ def _plot_scatter_plotly(
552
  template="plotly_white",
553
  title=f"Astabench {name} Leaderboard",
554
  xaxis=xaxis_config, # Use the updated config
555
- yaxis=dict(title="Score", rangemode="tozero"),
556
  legend=dict(
557
  bgcolor='#FAF2E9',
558
  ),
 
371
  data_plot = data.copy()
372
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
373
 
374
+ x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
375
  max_reported_cost = 0
376
  divider_line_x = 0
377
 
 
552
  template="plotly_white",
553
  title=f"Astabench {name} Leaderboard",
554
  xaxis=xaxis_config, # Use the updated config
555
+ yaxis=dict(title="Average (mean) score", rangemode="tozero"),
556
  legend=dict(
557
  bgcolor='#FAF2E9',
558
  ),
main_page.py CHANGED
@@ -26,6 +26,7 @@ def build_page():
26
  with gr.Tab("Results: Test Set") as test_tab:
27
  test_df, test_tag_map = get_full_leaderboard_data("test")
28
  if not test_df.empty:
 
29
  create_leaderboard_display(
30
  full_df=test_df,
31
  tag_map=test_tag_map,
@@ -39,6 +40,7 @@ def build_page():
39
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
40
  # Check if data was loaded successfully before trying to display it
41
  if not validation_df.empty:
 
42
  # 2. Render the display by calling the factory with the loaded data.
43
  create_leaderboard_display(
44
  full_df=validation_df,
 
26
  with gr.Tab("Results: Test Set") as test_tab:
27
  test_df, test_tag_map = get_full_leaderboard_data("test")
28
  if not test_df.empty:
29
+ gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
30
  create_leaderboard_display(
31
  full_df=test_df,
32
  tag_map=test_tag_map,
 
40
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
41
  # Check if data was loaded successfully before trying to display it
42
  if not validation_df.empty:
43
+ gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
44
  # 2. Render the display by calling the factory with the loaded data.
45
  create_leaderboard_display(
46
  full_df=validation_df,
ui_components.py CHANGED
@@ -20,7 +20,7 @@ from leaderboard_transformer import (
20
  clean_llm_base_list,
21
  )
22
  from content import (
23
- SCATTER_DISCLAIMER,
24
  format_error,
25
  format_log,
26
  format_warning,
@@ -340,7 +340,7 @@ def create_leaderboard_display(
340
  value=scatter_plot,
341
  show_label=False
342
  )
343
-
344
  # Put table and key into an accordion
345
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
346
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
@@ -497,6 +497,7 @@ def create_benchmark_details_display(
497
  name=benchmark_name
498
  )
499
  gr.Plot(value=benchmark_plot, show_label=False)
 
500
  # Put table and key into an accordion
501
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
502
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
 
20
  clean_llm_base_list,
21
  )
22
  from content import (
23
+ scatter_disclaimer_html,
24
  format_error,
25
  format_log,
26
  format_warning,
 
340
  value=scatter_plot,
341
  show_label=False
342
  )
343
+ gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
344
  # Put table and key into an accordion
345
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
346
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
 
497
  name=benchmark_name
498
  )
499
  gr.Plot(value=benchmark_plot, show_label=False)
500
+ gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
501
  # Put table and key into an accordion
502
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
503
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")