Amber Tanaka committed on
Commit
85744c7
·
unverified ·
1 Parent(s): 268d785

Change name of LLM Base and adjust hover behavior (#85)

Browse files
Files changed (2) hide show
  1. leaderboard_transformer.py +16 -22
  2. ui_components.py +14 -14
leaderboard_transformer.py CHANGED
@@ -112,7 +112,7 @@ def _pretty_column_name(raw_col: str) -> str:
112
  'Logs': 'Logs',
113
  'Openness': 'Openness',
114
  'Agent tooling': 'Agent Tooling',
115
- 'LLM base': 'LLM Base',
116
  'Source': 'Source',
117
  }
118
 
@@ -255,7 +255,7 @@ class DataTransformer:
255
  df_view = df_sorted.copy()
256
 
257
  # --- 3. Add Columns for Agent Openness and Tooling ---
258
- base_cols = ["id","Agent","Submitter","LLM Base","Source"]
259
  new_cols = ["Openness", "Agent Tooling"]
260
  ending_cols = ["Date", "Logs"]
261
 
@@ -361,7 +361,7 @@ def _plot_scatter_plotly(
361
 
362
  x_col_to_use = x
363
  y_col_to_use = y
364
- llm_base = data["LLM Base"] if "LLM Base" in data.columns else "LLM Base"
365
 
366
  # --- Section 2: Data Preparation---
367
  required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
@@ -443,33 +443,37 @@ def _plot_scatter_plotly(
443
  ))
444
 
445
  # --- Section 5: Prepare for Marker Plotting ---
446
- def format_hover_text(row, agent_col, x_axis_label, x_col, y_col):
447
  """
448
  Builds the complete HTML string for the plot's hover tooltip.
449
- Formats the 'LLM Base' column as a bulleted list if multiple.
450
  """
451
  h_pad = " "
452
  parts = ["<br>"]
453
  parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
454
  parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
455
- parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
 
 
 
 
456
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
457
  parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
458
 
459
  # Add extra vertical space (line spacing) before the next section
460
  parts.append("<br>")
461
- # Clean and format LLM Base column
462
- llm_base_value = row['LLM Base']
463
  llm_base_value = clean_llm_base_list(llm_base_value)
464
  if isinstance(llm_base_value, list) and llm_base_value:
465
- parts.append(f"{h_pad}LLM Base:{h_pad}<br>")
466
  # Create a list of padded bullet points
467
  list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
468
  # Join them with line breaks
469
  parts.append('<br>'.join(list_items))
470
  else:
471
  # Handle the non-list case with padding
472
- parts.append(f"{h_pad}LLM Base: <b>{llm_base_value}</b>{h_pad}")
473
  # Add a final line break for bottom "padding"
474
  parts.append("<br>")
475
  # Join all the parts together into the final HTML string
@@ -481,7 +485,8 @@ def _plot_scatter_plotly(
481
  agent_col=agent_col,
482
  x_axis_label=x_axis_label,
483
  x_col=x_col_to_use,
484
- y_col=y_col_to_use
 
485
  ),
486
  axis=1
487
  )
@@ -542,17 +547,6 @@ def _plot_scatter_plotly(
542
  font_color="#d3dedc",
543
  ),
544
  )
545
- # fig.add_layout_image(
546
- # dict(
547
- # source=logo_data_uri,
548
- # xref="x domain", yref="y domain",
549
- # x=1.1, y=1.1,
550
- # sizex=0.2, sizey=0.2,
551
- # xanchor="left",
552
- # yanchor="bottom",
553
- # layer="above",
554
- # ),
555
- # )
556
 
557
  return fig
558
 
 
112
  'Logs': 'Logs',
113
  'Openness': 'Openness',
114
  'Agent tooling': 'Agent Tooling',
115
+ 'LLM base': 'Models Used',
116
  'Source': 'Source',
117
  }
118
 
 
255
  df_view = df_sorted.copy()
256
 
257
  # --- 3. Add Columns for Agent Openness and Tooling ---
258
+ base_cols = ["id","Agent","Submitter","Models Used","Source"]
259
  new_cols = ["Openness", "Agent Tooling"]
260
  ending_cols = ["Date", "Logs"]
261
 
 
361
 
362
  x_col_to_use = x
363
  y_col_to_use = y
364
+ llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
365
 
366
  # --- Section 2: Data Preparation---
367
  required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
 
443
  ))
444
 
445
  # --- Section 5: Prepare for Marker Plotting ---
446
+ def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
447
  """
448
  Builds the complete HTML string for the plot's hover tooltip.
449
+ Formats the 'Models Used' column as a bulleted list if multiple.
450
  """
451
  h_pad = " "
452
  parts = ["<br>"]
453
  parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
454
  parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
455
+ if divider_line_x > 0 and row[x_col] >= divider_line_x:
456
+ # If no cost, display "Missing" for the cost.
457
+ parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
458
+ else:
459
+ parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
460
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
461
  parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
462
 
463
  # Add extra vertical space (line spacing) before the next section
464
  parts.append("<br>")
465
+ # Clean and format Models Used column
466
+ llm_base_value = row['Models Used']
467
  llm_base_value = clean_llm_base_list(llm_base_value)
468
  if isinstance(llm_base_value, list) and llm_base_value:
469
+ parts.append(f"{h_pad}Models Used:{h_pad}<br>")
470
  # Create a list of padded bullet points
471
  list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
472
  # Join them with line breaks
473
  parts.append('<br>'.join(list_items))
474
  else:
475
  # Handle the non-list case with padding
476
+ parts.append(f"{h_pad}Models Used: <b>{llm_base_value}</b>{h_pad}")
477
  # Add a final line break for bottom "padding"
478
  parts.append("<br>")
479
  # Join all the parts together into the final HTML string
 
485
  agent_col=agent_col,
486
  x_axis_label=x_axis_label,
487
  x_col=x_col_to_use,
488
+ y_col=y_col_to_use,
489
+ divider_line_x=divider_line_x
490
  ),
491
  axis=1
492
  )
 
547
  font_color="#d3dedc",
548
  ),
549
  )
 
 
 
 
 
 
 
 
 
 
 
550
 
551
  return fig
552
 
ui_components.py CHANGED
@@ -246,7 +246,7 @@ def build_descriptions_tooltip_content(table) -> str:
246
  return """
247
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
248
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
249
- <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
250
  <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
251
  <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
252
  <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
@@ -264,7 +264,7 @@ def build_descriptions_tooltip_content(table) -> str:
264
  return f"""
265
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
266
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
267
- <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
268
  <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
269
  <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
270
  <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
@@ -277,7 +277,7 @@ def build_descriptions_tooltip_content(table) -> str:
277
  return f"""
278
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
279
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
280
- <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
281
  <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
282
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
283
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
@@ -543,9 +543,9 @@ def create_leaderboard_display(
543
  if "Score" in col:
544
  df_view = format_score_column(df_view, col)
545
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
546
- #Make pretty and format the LLM Base column
547
- df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
548
- df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
549
  # append the repro url to the end of the agent name
550
  if 'Source' in df_view.columns:
551
  df_view['Agent'] = df_view.apply(
@@ -567,7 +567,7 @@ def create_leaderboard_display(
567
  for col in df_headers:
568
  if col == "Logs" or "Cost" in col or "Score" in col:
569
  df_datatypes.append("markdown")
570
- elif col in ["Agent","Icon","LLM Base", "Pareto"]:
571
  df_datatypes.append("html")
572
  else:
573
  df_datatypes.append("str")
@@ -655,7 +655,7 @@ def create_benchmark_details_display(
655
  benchmark_cost_col = f"{benchmark_name} Cost"
656
 
657
  # Define the columns needed for the detailed table
658
- table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
659
 
660
  # Filter to only columns that actually exist in the full dataframe
661
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -684,9 +684,9 @@ def create_benchmark_details_display(
684
  axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
685
  )
686
 
687
- #Make pretty and format the LLM Base column
688
- benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
689
- benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
690
  # append the repro url to the end of the agent name
691
  if 'Source' in benchmark_table_df.columns:
692
  benchmark_table_df['Agent'] = benchmark_table_df.apply(
@@ -719,7 +719,7 @@ def create_benchmark_details_display(
719
  'Icon',
720
  'Agent',
721
  'Submitter',
722
- 'LLM Base',
723
  'Attempted Benchmark',
724
  benchmark_score_col,
725
  benchmark_cost_col,
@@ -741,7 +741,7 @@ def create_benchmark_details_display(
741
  for col in df_headers:
742
  if "Logs" in col or "Cost" in col or "Score" in col:
743
  df_datatypes.append("markdown")
744
- elif col in ["Agent", "Icon", "LLM Base", "Pareto"]:
745
  df_datatypes.append("html")
746
  else:
747
  df_datatypes.append("str")
@@ -857,7 +857,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: boo
857
 
858
  def format_llm_base_with_html(value):
859
  """
860
- Formats the 'LLM Base' cell value.
861
  If the value is a list with more than 1 element, it returns an
862
  HTML <span> with the full list in a hover-over tooltip.
863
  If it's a single-element list, it returns just that element.
 
246
  return """
247
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
248
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
249
+ <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
250
  <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
251
  <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
252
  <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
 
264
  return f"""
265
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
266
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
267
+ <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
268
  <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
269
  <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
270
  <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
 
277
  return f"""
278
  <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
279
  <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
280
+ <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
281
  <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
282
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
283
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
 
543
  if "Score" in col:
544
  df_view = format_score_column(df_view, col)
545
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
546
+ #Make pretty and format the Models Used column
547
+ df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list)
548
+ df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html)
549
  # append the repro url to the end of the agent name
550
  if 'Source' in df_view.columns:
551
  df_view['Agent'] = df_view.apply(
 
567
  for col in df_headers:
568
  if col == "Logs" or "Cost" in col or "Score" in col:
569
  df_datatypes.append("markdown")
570
+ elif col in ["Agent","Icon","Models Used", "Pareto"]:
571
  df_datatypes.append("html")
572
  else:
573
  df_datatypes.append("str")
 
655
  benchmark_cost_col = f"{benchmark_name} Cost"
656
 
657
  # Define the columns needed for the detailed table
658
+ table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
659
 
660
  # Filter to only columns that actually exist in the full dataframe
661
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
684
  axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
685
  )
686
 
687
+ #Make pretty and format the Models Used column
688
+ benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
689
+ benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html)
690
  # append the repro url to the end of the agent name
691
  if 'Source' in benchmark_table_df.columns:
692
  benchmark_table_df['Agent'] = benchmark_table_df.apply(
 
719
  'Icon',
720
  'Agent',
721
  'Submitter',
722
+ 'Models Used',
723
  'Attempted Benchmark',
724
  benchmark_score_col,
725
  benchmark_cost_col,
 
741
  for col in df_headers:
742
  if "Logs" in col or "Cost" in col or "Score" in col:
743
  df_datatypes.append("markdown")
744
+ elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
745
  df_datatypes.append("html")
746
  else:
747
  df_datatypes.append("str")
 
857
 
858
  def format_llm_base_with_html(value):
859
  """
860
+ Formats the 'Models Used' cell value.
861
  If the value is a list with more than 1 element, it returns an
862
  HTML <span> with the full list in a hover-over tooltip.
863
  If it's a single-element list, it returns just that element.