Spaces:
Paused
Paused
Amber Tanaka
committed on
Add llm base (#14)
Browse files- content.py +6 -2
- leaderboard_transformer.py +5 -1
- ui_components.py +176 -139
content.py
CHANGED
|
@@ -319,10 +319,14 @@ html:not(.dark) #legend-markdown .light-mode-icon,
|
|
| 319 |
pointer-events: none;
|
| 320 |
left: 50%;
|
| 321 |
transform: translateX(-50%);
|
| 322 |
-
z-index: 1000;
|
| 323 |
}
|
| 324 |
-
|
| 325 |
.tooltip-icon:hover::after {
|
| 326 |
opacity: 1;
|
| 327 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
"""
|
|
|
|
| 319 |
pointer-events: none;
|
| 320 |
left: 50%;
|
| 321 |
transform: translateX(-50%);
|
| 322 |
+
z-index: 1000;
|
| 323 |
}
|
|
|
|
| 324 |
.tooltip-icon:hover::after {
|
| 325 |
opacity: 1;
|
| 326 |
}
|
| 327 |
+
/* --- inside table tooltips --- */
|
| 328 |
+
.native-tooltip-icon {
|
| 329 |
+
cursor: help;
|
| 330 |
+
text-decoration: underline dotted 1px;
|
| 331 |
+
}
|
| 332 |
"""
|
leaderboard_transformer.py
CHANGED
|
@@ -4,6 +4,7 @@ import pandas as pd
|
|
| 4 |
import logging
|
| 5 |
from typing import Optional
|
| 6 |
import base64
|
|
|
|
| 7 |
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
|
@@ -68,6 +69,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 68 |
'Logs': 'Logs',
|
| 69 |
'Openness': 'Openness',
|
| 70 |
'Agent tooling': 'Agent Tooling',
|
|
|
|
| 71 |
}
|
| 72 |
|
| 73 |
if raw_col in fixed_mappings:
|
|
@@ -233,7 +235,7 @@ class DataTransformer:
|
|
| 233 |
df_view = df_view.drop(columns=['Submitter'])
|
| 234 |
|
| 235 |
# 4. Build the List of Columns to Display (now simplified)
|
| 236 |
-
base_cols = ["id","Agent","agent_for_hover"]
|
| 237 |
new_cols = ["Openness", "Agent Tooling"]
|
| 238 |
ending_cols = ["Logs"]
|
| 239 |
|
|
@@ -585,3 +587,5 @@ def svg_to_data_uri(path: str) -> str:
|
|
| 585 |
except FileNotFoundError:
|
| 586 |
logger.warning(f"SVG file not found at: {path}")
|
| 587 |
return None
|
|
|
|
|
|
|
|
|
| 4 |
import logging
|
| 5 |
from typing import Optional
|
| 6 |
import base64
|
| 7 |
+
import html
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
|
|
|
| 69 |
'Logs': 'Logs',
|
| 70 |
'Openness': 'Openness',
|
| 71 |
'Agent tooling': 'Agent Tooling',
|
| 72 |
+
'LLM base': 'LLM Base',
|
| 73 |
}
|
| 74 |
|
| 75 |
if raw_col in fixed_mappings:
|
|
|
|
| 235 |
df_view = df_view.drop(columns=['Submitter'])
|
| 236 |
|
| 237 |
# 4. Build the List of Columns to Display (now simplified)
|
| 238 |
+
base_cols = ["id","Agent","LLM Base", "agent_for_hover"]
|
| 239 |
new_cols = ["Openness", "Agent Tooling"]
|
| 240 |
ending_cols = ["Logs"]
|
| 241 |
|
|
|
|
| 587 |
except FileNotFoundError:
|
| 588 |
logger.warning(f"SVG file not found at: {path}")
|
| 589 |
return None
|
| 590 |
+
|
| 591 |
+
|
ui_components.py
CHANGED
|
@@ -301,7 +301,9 @@ def create_leaderboard_display(
|
|
| 301 |
if "Score" in col:
|
| 302 |
df_view = format_score_column(df_view, col)
|
| 303 |
scatter_plot = plots_dict.get('scatter_plot', go.Figure())
|
| 304 |
-
|
|
|
|
|
|
|
| 305 |
|
| 306 |
all_cols = df_view.columns.tolist()
|
| 307 |
# Remove 'Pareto' from the list and insert it at the beginning
|
|
@@ -316,7 +318,7 @@ def create_leaderboard_display(
|
|
| 316 |
for col in df_headers:
|
| 317 |
if col in ["Logs", "Agent"] or "Cost" in col or "Score" in col:
|
| 318 |
df_datatypes.append("markdown")
|
| 319 |
-
elif col in ["Openness", "Agent Tooling"]:
|
| 320 |
df_datatypes.append("html")
|
| 321 |
else:
|
| 322 |
df_datatypes.append("str")
|
|
@@ -344,13 +346,157 @@ def create_leaderboard_display(
|
|
| 344 |
datatype=df_datatypes,
|
| 345 |
interactive=False,
|
| 346 |
wrap=True,
|
| 347 |
-
column_widths=[30, 30, 30, 200],
|
| 348 |
elem_classes=["wrap-header-df"]
|
| 349 |
)
|
| 350 |
|
| 351 |
# Return the components so they can be referenced elsewhere.
|
| 352 |
return plot_component, dataframe_component
|
| 353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
| 355 |
"""
|
| 356 |
Loads and transforms the complete dataset for a given split.
|
|
@@ -427,143 +573,34 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str):
|
|
| 427 |
# Return the entire navigation bar as one single Gradio HTML component
|
| 428 |
return gr.HTML(full_html)
|
| 429 |
|
| 430 |
-
|
| 431 |
-
def create_benchmark_details_display(
|
| 432 |
-
full_df: pd.DataFrame,
|
| 433 |
-
tag_map: dict,
|
| 434 |
-
category_name: str
|
| 435 |
-
):
|
| 436 |
"""
|
| 437 |
-
|
| 438 |
-
For
|
| 439 |
-
Args:
|
| 440 |
-
full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
|
| 441 |
-
tag_map (dict): The "pretty" tag map to find the list of benchmarks.
|
| 442 |
-
category_name (str): The main category to display details for (e.g., "Literature Understanding").
|
| 443 |
"""
|
| 444 |
-
#
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
if not benchmark_names:
|
| 448 |
-
gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
|
| 449 |
-
return
|
| 450 |
-
|
| 451 |
-
gr.Markdown("---")
|
| 452 |
-
gr.Markdown("## Detailed Benchmark Results")
|
| 453 |
-
|
| 454 |
-
# 2. Loop through each benchmark and create its UI components
|
| 455 |
-
for benchmark_name in benchmark_names:
|
| 456 |
-
gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
|
| 457 |
-
|
| 458 |
-
# 3. Prepare the data for this specific benchmark's table and plot
|
| 459 |
-
benchmark_score_col = f"{benchmark_name} Score"
|
| 460 |
-
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 461 |
-
|
| 462 |
-
# Define the columns needed for the detailed table
|
| 463 |
-
table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id']
|
| 464 |
-
|
| 465 |
-
# Filter to only columns that actually exist in the full dataframe
|
| 466 |
-
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
| 467 |
-
|
| 468 |
-
if benchmark_score_col not in existing_table_cols:
|
| 469 |
-
gr.Markdown(f"Score data for {benchmark_name} not available.")
|
| 470 |
-
continue # Skip to the next benchmark if score is missing
|
| 471 |
-
|
| 472 |
-
# Create a specific DataFrame for the table view
|
| 473 |
-
benchmark_table_df = full_df[existing_table_cols].copy()
|
| 474 |
-
pareto_df = get_pareto_df(benchmark_table_df)
|
| 475 |
-
# Get the list of agents on the frontier. We'll use this list later.
|
| 476 |
-
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 477 |
-
pareto_agent_names = pareto_df['id'].tolist()
|
| 478 |
-
else:
|
| 479 |
-
pareto_agent_names = []
|
| 480 |
-
benchmark_table_df['Pareto'] = benchmark_table_df.apply(
|
| 481 |
-
lambda row: '📈' if row['id'] in pareto_agent_names else '',
|
| 482 |
-
axis=1
|
| 483 |
-
)
|
| 484 |
-
|
| 485 |
-
benchmark_table_df['Openness'] = benchmark_table_df['Openness'].apply(lambda x: create_svg_html(x, OPENNESS_SVG_MAP))
|
| 486 |
-
benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(lambda x: create_svg_html(x, TOOLING_SVG_MAP))
|
| 487 |
-
|
| 488 |
-
# Calculated and add "Benchmark Attempted" column
|
| 489 |
-
def check_benchmark_status(row):
|
| 490 |
-
has_score = pd.notna(row.get(benchmark_score_col))
|
| 491 |
-
has_cost = pd.notna(row.get(benchmark_cost_col))
|
| 492 |
-
if has_score and has_cost:
|
| 493 |
-
return "✅"
|
| 494 |
-
if has_score or has_cost:
|
| 495 |
-
return "⚠️"
|
| 496 |
-
return "🚫 "
|
| 497 |
-
|
| 498 |
-
# Apply the function to create the new column
|
| 499 |
-
benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
|
| 500 |
-
# Sort the DataFrame
|
| 501 |
-
if benchmark_score_col in benchmark_table_df.columns:
|
| 502 |
-
benchmark_table_df = benchmark_table_df.sort_values(
|
| 503 |
-
by=benchmark_score_col, ascending=False, na_position='last'
|
| 504 |
-
)
|
| 505 |
-
# 1. Format the cost and score columns
|
| 506 |
-
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 507 |
-
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
| 508 |
-
desired_cols_in_order = [
|
| 509 |
-
'Pareto',
|
| 510 |
-
'Openness',
|
| 511 |
-
'Agent Tooling',
|
| 512 |
-
'Agent',
|
| 513 |
-
'Submitter',
|
| 514 |
-
'Attempted Benchmark',
|
| 515 |
-
benchmark_score_col,
|
| 516 |
-
benchmark_cost_col,
|
| 517 |
-
'Logs'
|
| 518 |
-
]
|
| 519 |
-
for col in desired_cols_in_order:
|
| 520 |
-
if col not in benchmark_table_df.columns:
|
| 521 |
-
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 522 |
-
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
| 523 |
-
# Rename columns for a cleaner table display, as requested
|
| 524 |
-
benchmark_table_df.rename({
|
| 525 |
-
benchmark_score_col: 'Score',
|
| 526 |
-
benchmark_cost_col: 'Cost',
|
| 527 |
-
}, inplace=True)
|
| 528 |
-
# Ensure the 'Logs' column is formatted correctly
|
| 529 |
-
df_headers = benchmark_table_df.columns.tolist()
|
| 530 |
-
df_datatypes = []
|
| 531 |
-
for col in df_headers:
|
| 532 |
-
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 533 |
-
df_datatypes.append("markdown")
|
| 534 |
-
elif col in ["Openness", "Agent Tooling"]:
|
| 535 |
-
df_datatypes.append("html")
|
| 536 |
-
else:
|
| 537 |
-
df_datatypes.append("str")
|
| 538 |
-
# Remove Pareto, Openness, and Agent Tooling from the headers
|
| 539 |
-
header_rename_map = {
|
| 540 |
-
"Pareto": "",
|
| 541 |
-
"Openness": "",
|
| 542 |
-
"Agent Tooling": ""
|
| 543 |
-
}
|
| 544 |
-
# 2. Create the final list of headers for display.
|
| 545 |
-
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
| 546 |
-
# Create the scatter plot using the full data for context, but plotting benchmark metrics
|
| 547 |
-
# This shows all agents on the same axis for better comparison.
|
| 548 |
-
benchmark_plot = _plot_scatter_plotly(
|
| 549 |
-
data=full_df,
|
| 550 |
-
x=benchmark_cost_col,
|
| 551 |
-
y=benchmark_score_col,
|
| 552 |
-
agent_col="Agent"
|
| 553 |
-
)
|
| 554 |
-
gr.Plot(value=benchmark_plot)
|
| 555 |
-
gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 556 |
-
# Put table and key into an accordion
|
| 557 |
-
with gr.Accordion("Details", open=True, elem_id="leaderboard-accordion"):
|
| 558 |
-
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 559 |
-
gr.DataFrame(
|
| 560 |
-
headers=df_headers,
|
| 561 |
-
value=benchmark_table_df,
|
| 562 |
-
datatype=df_datatypes,
|
| 563 |
-
interactive=False,
|
| 564 |
-
wrap=True,
|
| 565 |
-
column_widths=[40, 40, 40, 350],
|
| 566 |
-
elem_classes=["wrap-header-df"]
|
| 567 |
-
)
|
| 568 |
|
|
|
|
|
|
|
| 569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
if "Score" in col:
|
| 302 |
df_view = format_score_column(df_view, col)
|
| 303 |
scatter_plot = plots_dict.get('scatter_plot', go.Figure())
|
| 304 |
+
#Make pretty and format the LLM Base column
|
| 305 |
+
df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
|
| 306 |
+
df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
|
| 307 |
|
| 308 |
all_cols = df_view.columns.tolist()
|
| 309 |
# Remove 'Pareto' from the list and insert it at the beginning
|
|
|
|
| 318 |
for col in df_headers:
|
| 319 |
if col in ["Logs", "Agent"] or "Cost" in col or "Score" in col:
|
| 320 |
df_datatypes.append("markdown")
|
| 321 |
+
elif col in ["Openness", "Agent Tooling","LLM Base"]:
|
| 322 |
df_datatypes.append("html")
|
| 323 |
else:
|
| 324 |
df_datatypes.append("str")
|
|
|
|
| 346 |
datatype=df_datatypes,
|
| 347 |
interactive=False,
|
| 348 |
wrap=True,
|
| 349 |
+
column_widths=[30, 30, 30, 200, 200],
|
| 350 |
elem_classes=["wrap-header-df"]
|
| 351 |
)
|
| 352 |
|
| 353 |
# Return the components so they can be referenced elsewhere.
|
| 354 |
return plot_component, dataframe_component
|
| 355 |
|
| 356 |
+
# # --- Detailed Benchmark Display ---
|
| 357 |
+
def create_benchmark_details_display(
|
| 358 |
+
full_df: pd.DataFrame,
|
| 359 |
+
tag_map: dict,
|
| 360 |
+
category_name: str
|
| 361 |
+
):
|
| 362 |
+
"""
|
| 363 |
+
Generates a detailed breakdown for each benchmark within a given category.
|
| 364 |
+
For each benchmark, it creates a title, a filtered table, and a scatter plot.
|
| 365 |
+
Args:
|
| 366 |
+
full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
|
| 367 |
+
tag_map (dict): The "pretty" tag map to find the list of benchmarks.
|
| 368 |
+
category_name (str): The main category to display details for (e.g., "Literature Understanding").
|
| 369 |
+
"""
|
| 370 |
+
# 1. Get the list of benchmarks for the selected category
|
| 371 |
+
benchmark_names = tag_map.get(category_name, [])
|
| 372 |
+
|
| 373 |
+
if not benchmark_names:
|
| 374 |
+
gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
|
| 375 |
+
return
|
| 376 |
+
|
| 377 |
+
gr.Markdown("---")
|
| 378 |
+
gr.Markdown("## Detailed Benchmark Results")
|
| 379 |
+
|
| 380 |
+
# 2. Loop through each benchmark and create its UI components
|
| 381 |
+
for benchmark_name in benchmark_names:
|
| 382 |
+
gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
|
| 383 |
+
|
| 384 |
+
# 3. Prepare the data for this specific benchmark's table and plot
|
| 385 |
+
benchmark_score_col = f"{benchmark_name} Score"
|
| 386 |
+
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 387 |
+
|
| 388 |
+
# Define the columns needed for the detailed table
|
| 389 |
+
table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
|
| 390 |
+
|
| 391 |
+
# Filter to only columns that actually exist in the full dataframe
|
| 392 |
+
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
| 393 |
+
|
| 394 |
+
if benchmark_score_col not in existing_table_cols:
|
| 395 |
+
gr.Markdown(f"Score data for {benchmark_name} not available.")
|
| 396 |
+
continue # Skip to the next benchmark if score is missing
|
| 397 |
+
|
| 398 |
+
# Create a specific DataFrame for the table view
|
| 399 |
+
benchmark_table_df = full_df[existing_table_cols].copy()
|
| 400 |
+
pareto_df = get_pareto_df(benchmark_table_df)
|
| 401 |
+
# Get the list of agents on the frontier. We'll use this list later.
|
| 402 |
+
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 403 |
+
pareto_agent_names = pareto_df['id'].tolist()
|
| 404 |
+
else:
|
| 405 |
+
pareto_agent_names = []
|
| 406 |
+
benchmark_table_df['Pareto'] = benchmark_table_df.apply(
|
| 407 |
+
lambda row: '📈' if row['id'] in pareto_agent_names else '',
|
| 408 |
+
axis=1
|
| 409 |
+
)
|
| 410 |
+
|
| 411 |
+
benchmark_table_df['Openness'] = benchmark_table_df['Openness'].apply(lambda x: create_svg_html(x, OPENNESS_SVG_MAP))
|
| 412 |
+
benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(lambda x: create_svg_html(x, TOOLING_SVG_MAP))
|
| 413 |
+
|
| 414 |
+
#Make pretty and format the LLM Base column
|
| 415 |
+
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
|
| 416 |
+
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
|
| 417 |
+
|
| 418 |
+
# Calculated and add "Benchmark Attempted" column
|
| 419 |
+
def check_benchmark_status(row):
|
| 420 |
+
has_score = pd.notna(row.get(benchmark_score_col))
|
| 421 |
+
has_cost = pd.notna(row.get(benchmark_cost_col))
|
| 422 |
+
if has_score and has_cost:
|
| 423 |
+
return "✅"
|
| 424 |
+
if has_score or has_cost:
|
| 425 |
+
return "⚠️"
|
| 426 |
+
return "🚫 "
|
| 427 |
+
|
| 428 |
+
# Apply the function to create the new column
|
| 429 |
+
benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
|
| 430 |
+
# Sort the DataFrame
|
| 431 |
+
if benchmark_score_col in benchmark_table_df.columns:
|
| 432 |
+
benchmark_table_df = benchmark_table_df.sort_values(
|
| 433 |
+
by=benchmark_score_col, ascending=False, na_position='last'
|
| 434 |
+
)
|
| 435 |
+
# 1. Format the cost and score columns
|
| 436 |
+
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 437 |
+
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
| 438 |
+
desired_cols_in_order = [
|
| 439 |
+
'Pareto',
|
| 440 |
+
'Openness',
|
| 441 |
+
'Agent Tooling',
|
| 442 |
+
'Agent',
|
| 443 |
+
'Submitter',
|
| 444 |
+
'LLM Base',
|
| 445 |
+
'Attempted Benchmark',
|
| 446 |
+
benchmark_score_col,
|
| 447 |
+
benchmark_cost_col,
|
| 448 |
+
'Logs'
|
| 449 |
+
]
|
| 450 |
+
for col in desired_cols_in_order:
|
| 451 |
+
if col not in benchmark_table_df.columns:
|
| 452 |
+
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 453 |
+
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
| 454 |
+
# Rename columns for a cleaner table display, as requested
|
| 455 |
+
benchmark_table_df.rename({
|
| 456 |
+
benchmark_score_col: 'Score',
|
| 457 |
+
benchmark_cost_col: 'Cost',
|
| 458 |
+
}, inplace=True)
|
| 459 |
+
# Ensure the 'Logs' column is formatted correctly
|
| 460 |
+
df_headers = benchmark_table_df.columns.tolist()
|
| 461 |
+
df_datatypes = []
|
| 462 |
+
for col in df_headers:
|
| 463 |
+
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 464 |
+
df_datatypes.append("markdown")
|
| 465 |
+
elif col in ["Openness", "Agent Tooling", "LLM Base"]:
|
| 466 |
+
df_datatypes.append("html")
|
| 467 |
+
else:
|
| 468 |
+
df_datatypes.append("str")
|
| 469 |
+
# Remove Pareto, Openness, and Agent Tooling from the headers
|
| 470 |
+
header_rename_map = {
|
| 471 |
+
"Pareto": "",
|
| 472 |
+
"Openness": "",
|
| 473 |
+
"Agent Tooling": ""
|
| 474 |
+
}
|
| 475 |
+
# 2. Create the final list of headers for display.
|
| 476 |
+
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
| 477 |
+
# Create the scatter plot using the full data for context, but plotting benchmark metrics
|
| 478 |
+
# This shows all agents on the same axis for better comparison.
|
| 479 |
+
benchmark_plot = _plot_scatter_plotly(
|
| 480 |
+
data=full_df,
|
| 481 |
+
x=benchmark_cost_col,
|
| 482 |
+
y=benchmark_score_col,
|
| 483 |
+
agent_col="Agent"
|
| 484 |
+
)
|
| 485 |
+
gr.Plot(value=benchmark_plot)
|
| 486 |
+
gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 487 |
+
# Put table and key into an accordion
|
| 488 |
+
with gr.Accordion("Details", open=True, elem_id="leaderboard-accordion"):
|
| 489 |
+
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 490 |
+
gr.DataFrame(
|
| 491 |
+
headers=df_headers,
|
| 492 |
+
value=benchmark_table_df,
|
| 493 |
+
datatype=df_datatypes,
|
| 494 |
+
interactive=False,
|
| 495 |
+
wrap=True,
|
| 496 |
+
column_widths=[40, 40, 40, 200, 150, 175, 85],
|
| 497 |
+
elem_classes=["wrap-header-df"]
|
| 498 |
+
)
|
| 499 |
+
|
| 500 |
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
| 501 |
"""
|
| 502 |
Loads and transforms the complete dataset for a given split.
|
|
|
|
| 573 |
# Return the entire navigation bar as one single Gradio HTML component
|
| 574 |
return gr.HTML(full_html)
|
| 575 |
|
| 576 |
+
def clean_llm_base_list(model_list):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
"""
|
| 578 |
+
Cleans a list of model strings by keeping only the text after the last '/'.
|
| 579 |
+
For example: "models/gemini-2.5-flash-preview-05-20" becomes "gemini-2.5-flash-preview-05-20".
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
"""
|
| 581 |
+
# Return the original value if it's not a list, to avoid errors.
|
| 582 |
+
if not isinstance(model_list, list):
|
| 583 |
+
return model_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
+
# Use a list comprehension for a clean and efficient transformation.
|
| 586 |
+
return [str(item).split('/')[-1] for item in model_list]
|
| 587 |
|
| 588 |
+
def format_llm_base_with_html(value):
|
| 589 |
+
"""
|
| 590 |
+
Formats the 'LLM Base' cell value.
|
| 591 |
+
If the value is a list with more than 1 element, it returns an
|
| 592 |
+
HTML <span> with the full list in a hover-over tooltip.
|
| 593 |
+
If it's a single-element list, it returns just that element.
|
| 594 |
+
Otherwise, it returns the original value.
|
| 595 |
+
"""
|
| 596 |
+
if isinstance(value, list):
|
| 597 |
+
if len(value) > 1:
|
| 598 |
+
# Join the list items with a newline character for a clean tooltip
|
| 599 |
+
tooltip_text = "\n".join(map(str, value))
|
| 600 |
+
# Return an HTML span with the title attribute for the tooltip
|
| 601 |
+
return f'<span style="cursor: help;" title="{tooltip_text}">Multiple ⓘ</span>'
|
| 602 |
+
if len(value) == 1:
|
| 603 |
+
# If only one item, just return that item
|
| 604 |
+
return value[0]
|
| 605 |
+
# Return the value as-is if it's not a list or is an empty list
|
| 606 |
+
return value
|