Amber Tanaka committed: Change name of LLM Base and adjust hover behavior (#85)

Files changed:
- leaderboard_transformer.py +16 -22
- ui_components.py +14 -14
leaderboard_transformer.py
CHANGED
@@ -112,7 +112,7 @@ def _pretty_column_name(raw_col: str) -> str:
         'Logs': 'Logs',
         'Openness': 'Openness',
         'Agent tooling': 'Agent Tooling',
-        'LLM base': 'LLM Base',
+        'LLM base': 'Models Used',
         'Source': 'Source',
     }
 
@@ -255,7 +255,7 @@ class DataTransformer:
         df_view = df_sorted.copy()
 
         # --- 3. Add Columns for Agent Openness and Tooling ---
-        base_cols = ["id","Agent","Submitter","LLM Base","Source"]
+        base_cols = ["id","Agent","Submitter","Models Used","Source"]
         new_cols = ["Openness", "Agent Tooling"]
         ending_cols = ["Date", "Logs"]
 
@@ -361,7 +361,7 @@ def _plot_scatter_plotly(
 
     x_col_to_use = x
     y_col_to_use = y
-    llm_base = data["LLM Base"] if "LLM Base" in data.columns else "LLM Base"
+    llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
 
     # --- Section 2: Data Preparation---
     required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
@@ -443,33 +443,37 @@ def _plot_scatter_plotly(
     ))
 
     # --- Section 5: Prepare for Marker Plotting ---
-    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col):
+    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
         """
         Builds the complete HTML string for the plot's hover tooltip.
-        Formats the 'LLM Base' column as a bulleted list if multiple.
+        Formats the 'Models Used' column as a bulleted list if multiple.
         """
         h_pad = " "
         parts = ["<br>"]
         parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
+        if divider_line_x > 0 and row[x_col] >= divider_line_x:
+            # If no cost, display "Missing" for the cost.
+            parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
+        else:
+            parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
         parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
 
         # Add extra vertical space (line spacing) before the next section
         parts.append("<br>")
-        # Clean and format LLM Base column
-        llm_base_value = row['LLM Base']
+        # Clean and format Models Used column
+        llm_base_value = row['Models Used']
         llm_base_value = clean_llm_base_list(llm_base_value)
         if isinstance(llm_base_value, list) and llm_base_value:
-            parts.append(f"{h_pad}LLM Base:{h_pad}<br>")
+            parts.append(f"{h_pad}Models Used:{h_pad}<br>")
            # Create a list of padded bullet points
            list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
            # Join them with line breaks
            parts.append('<br>'.join(list_items))
        else:
            # Handle the non-list case with padding
-            parts.append(f"{h_pad}LLM Base: <b>{llm_base_value}</b>{h_pad}")
+            parts.append(f"{h_pad}Models Used: <b>{llm_base_value}</b>{h_pad}")
        # Add a final line break for bottom "padding"
        parts.append("<br>")
        # Join all the parts together into the final HTML string
@@ -481,7 +485,8 @@ def _plot_scatter_plotly(
             agent_col=agent_col,
             x_axis_label=x_axis_label,
             x_col=x_col_to_use,
-            y_col=y_col_to_use
+            y_col=y_col_to_use,
+            divider_line_x=divider_line_x
         ),
         axis=1
     )
@@ -542,17 +547,6 @@ def _plot_scatter_plotly(
         font_color="#d3dedc",
         ),
     )
-    # fig.add_layout_image(
-    #     dict(
-    #         source=logo_data_uri,
-    #         xref="x domain", yref="y domain",
-    #         x=1.1, y=1.1,
-    #         sizex=0.2, sizey=0.2,
-    #         xanchor="left",
-    #         yanchor="bottom",
-    #         layer="above",
-    #     ),
-    # )
 
     return fig
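Note on the hover-behavior change: only the tooltip's cost line depends on the new divider_line_x parameter. A minimal, self-contained sketch of that branch with hypothetical sample values (the row contents and divider position are illustrative, not the app's real data):

    # Sketch of the new cost-line logic in format_hover_text (assumed inputs).
    row = {"Agent": "Example Agent", "Overall Cost": 1.25}  # hypothetical row
    x_axis_label, x_col = "Cost", "Overall Cost"
    h_pad = " "

    # divider_line_x appears to mark the x-position where agents without a
    # reported cost are plotted; points at or beyond it now read "Missing".
    for divider_line_x in (0, 1.0):
        if divider_line_x > 0 and row[x_col] >= divider_line_x:
            line = f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>"
        else:
            line = f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>"
        print(line)
    # -> " Cost: <b>$1.25</b> <br>"    (divider disabled)
    # -> " Cost: <b>Missing</b> <br>"  (cost sits at/past the divider)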
ui_components.py
CHANGED
@@ -246,7 +246,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return """
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
     <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
     <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
@@ -264,7 +264,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return f"""
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
     <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
     <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
@@ -277,7 +277,7 @@ def build_descriptions_tooltip_content(table) -> str:
     return f"""
     <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
     <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-    <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+    <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
     <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
     <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
     <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
@@ -543,9 +543,9 @@ def create_leaderboard_display(
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the LLM Base column
-    df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
-    df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
+    #Make pretty and format the Models Used column
+    df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list)
+    df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html)
     # append the repro url to the end of the agent name
     if 'Source' in df_view.columns:
         df_view['Agent'] = df_view.apply(
@@ -567,7 +567,7 @@ def create_leaderboard_display(
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["Agent","Icon","LLM Base", "Pareto"]:
+        elif col in ["Agent","Icon","Models Used", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -655,7 +655,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"
 
     # Define the columns needed for the detailed table
-    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
+    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
 
     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -684,9 +684,9 @@ def create_benchmark_details_display(
         axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
     )
 
-    #Make pretty and format the LLM Base column
-    benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
-    benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
+    #Make pretty and format the Models Used column
+    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
+    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html)
     # append the repro url to the end of the agent name
     if 'Source' in benchmark_table_df.columns:
         benchmark_table_df['Agent'] = benchmark_table_df.apply(
@@ -719,7 +719,7 @@ def create_benchmark_details_display(
         'Icon',
         'Agent',
         'Submitter',
-        'LLM Base',
+        'Models Used',
         'Attempted Benchmark',
         benchmark_score_col,
         benchmark_cost_col,
@@ -741,7 +741,7 @@ def create_benchmark_details_display(
     for col in df_headers:
         if "Logs" in col or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["Agent", "Icon", "LLM Base", "Pareto"]:
+        elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -857,7 +857,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool
 
 def format_llm_base_with_html(value):
     """
-    Formats the 'LLM Base' cell value.
+    Formats the 'Models Used' cell value.
     If the value is a list with more than 1 element, it returns an
     HTML <span> with the full list in a hover-over tooltip.
     If it's a single-element list, it returns just that element.