import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import os
import base64
from agenteval.leaderboard.view import LeaderboardViewer
from huggingface_hub import HfApi
import aliases
from leaderboard_transformer import (
DataTransformer,
transform_raw_dataframe,
create_pretty_tag_map,
INFORMAL_TO_FORMAL_NAME_MAP,
_plot_scatter_plotly,
format_cost_column,
format_score_column,
get_pareto_df,
clean_llm_base_list,
)
from config import (
CONFIG_NAME,
EXTRACTED_DATA_DIR,
IS_INTERNAL,
RESULTS_DATASET,
)
from content import (
create_gradio_anchor_id,
format_error,
get_benchmark_description,
hf_uri_to_web_url,
hyperlink,
SCATTER_DISCLAIMER,
)
api = HfApi()
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
# Global variables
COMBINED_ICON_MAP = {
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg",
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg",
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
},
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
},
aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
},
aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
}
}
# it's important to do the tool usage first here, so that when
# we do openness, the tool usage changes get picked up
for openness in COMBINED_ICON_MAP:
for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
for tool_usage_alias in tool_usage_aliases:
COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
for openness_alias in openness_aliases:
COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness]
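# Illustrative check (alias keys are hypothetical): after the expansion above,
# any (openness alias, tool-usage alias) pair resolves to the same icon path
# as its canonical pair:
#   COMBINED_ICON_MAP[an_openness_alias][a_tool_usage_alias] \
#       == COMBINED_ICON_MAP[canonical_openness][canonical_tool_usage]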
OPENNESS_SVG_MAP = {
"Open source & open weights": {
"path": "assets/ellipse-pink.svg",
"description": "Code and models are open"
},
"Open source & closed weights": {
"path": "assets/ellipse-coral.svg",
"description": "Code is open but uses closed-weight models"
},
"Closed source & API available": {
"path": "assets/ellipse-yellow.svg",
"description": "No access to code; API access only"
},
"Closed source & UI only": {
"path": "assets/ellipse-white.svg",
"description": "No access to code or API; UI access only"
},
}
TOOLING_SVG_MAP = {
"Standard": {
"path": "assets/five-point-star.svg",
"description": "Uses only tools explicitly provided in state.tools"
},
"Custom interface": {
"path": "assets/four-point-star.svg",
"description": "Custom tools for accessing an equivalent underlying environment"
},
"Fully custom": {
"path": "assets/three-point-star.svg",
"description": "Uses tools beyond constraints of Standard or Custom interface"
},
}
def get_svg_as_data_uri(path: str) -> str:
"""Reads an SVG file and returns it as a base64-encoded data URI."""
try:
with open(path, "rb") as svg_file:
encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8")
return f"data:image/svg+xml;base64,{encoded_svg}"
except FileNotFoundError:
print(f"Warning: SVG file not found at {path}")
return ""
# Create a pre-loaded version of our map. This should be run ONCE when the app starts.
PRELOADED_URI_MAP = {
openness: {
tooling: get_svg_as_data_uri(path)
for tooling, path in tooling_map.items()
}
for openness, tooling_map in COMBINED_ICON_MAP.items()
}
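# Shape sketch (illustrative): PRELOADED_URI_MAP mirrors COMBINED_ICON_MAP,
# with each SVG path replaced by its encoded data URI:
#   PRELOADED_URI_MAP[openness][tooling] -> "data:image/svg+xml;base64,..."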
def get_combined_icon_html(row, uri_map):
"""
Looks up the correct icon URI from the pre-loaded map based on the row's
'Openness' and 'Agent Tooling' values and returns an HTML tag.
"""
openness_val = row['Openness']
tooling_val = row['Agent Tooling']
uri = uri_map.get(openness_val, {}).get(tooling_val, "")
# The tooltip will show the exact combination for clarity.
tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"
# Return the HTML string that Gradio will render in the DataFrame.
    return f'<img src="{uri}" title="{tooltip}" style="height: 24px; vertical-align: middle;" />'
def create_svg_html(value, svg_map):
"""
Generates the absolute simplest HTML for an icon, without any extra text.
This version is compatible with gr.DataFrame.
"""
if pd.isna(value) or value not in svg_map:
return ""
path_info = svg_map[value]
# Handle both old string format and new object format
if isinstance(path_info, dict):
path = path_info["path"]
else:
path = path_info
src = get_svg_as_data_uri(path)
# Generate the HTML for the single icon, with NO text.
    if src:
        return f'<img src="{src}" style="height: 20px; vertical-align: middle;" />'
return ""
def build_openness_tooltip_content() -> str:
"""
    Generates the inner HTML for the Agent Openness tooltip card.
"""
descriptions = {
"Open source & open weights": "Both code and ML models are open",
"Open source & closed weights": "Code is open but uses an ML model with closed-weights",
"Closed source & API available": "No access to code; API access only",
"Closed source & UI only": "No access to code or API; UI access only",
}
html_items = []
for name, info in OPENNESS_SVG_MAP.items():
uri = get_svg_as_data_uri(info["path"])
desc = descriptions.get(name, "")
html_items.append(f"""
{name}{desc}
""")
return "".join(html_items)
def build_pareto_tooltip_content() -> str:
"""Generates the inner HTML for the Pareto tooltip card with final copy."""
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" style="height: 16px; vertical-align: middle;" />'
    return f"""
        <strong>On Pareto Frontier</strong>
        <p>The Pareto frontier represents the best balance between score and cost.
        Agents on the frontier either:</p>
        <ul>
            <li>Offer the lowest cost for a given performance, or</li>
            <li>Deliver the best performance at a given cost.</li>
        </ul>
        <p>These agents are marked with this icon: {trophy_icon_html}</p>
    """
def build_tooling_tooltip_content() -> str:
"""Generates the inner HTML for the Agent Tooling tooltip card."""
descriptions = {
"Standard": "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
"Custom interface": "Custom tools for accessing an equivalent underlying environment:",
"Fully custom": "Uses tools beyond constraints of Standard or Custom interface",
}
custom_interface_sub_list = """
Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.
Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).
"""
html_items = []
for name, info in TOOLING_SVG_MAP.items():
uri = get_svg_as_data_uri(info["path"])
desc = descriptions.get(name, "")
        # Check if this is the special case that needs a sub-list.
        # The comparison must match the "Custom interface" key used in TOOLING_SVG_MAP.
        sub_list_html = custom_interface_sub_list if name == "Custom interface" else ""
        html_items.append(f"""
            <div class="tooltip-item">
                <img src="{uri}" style="height: 16px; vertical-align: middle;" />
                <strong>{name}</strong>: {desc}
                {sub_list_html}
            </div>
        """)
return "".join(html_items)
def build_descriptions_tooltip_content(table) -> str:
    """Generates the inner HTML for the Column Descriptions tooltip card, depending on the kind of table."""
    if table == "Overall":
        return """
            <ul>
                <li><strong>Agent:</strong> Name of the evaluated agent.</li>
                <li><strong>Submitter:</strong> Organization or individual who submitted the agent for evaluation.</li>
                <li><strong>LLM Base:</strong> Model(s) used by the agent. Hover over ⓘ to view all.</li>
                <li><strong>Overall Score:</strong> Macro-average of the four category-level average scores. Each category contributes equally.</li>
                <li><strong>Overall Cost:</strong> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</li>
                <li><strong>Literature Understanding Score:</strong> Macro-average score across Literature Understanding benchmarks.</li>
                <li><strong>Literature Understanding Cost:</strong> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</li>
                <li><strong>Code Execution Score:</strong> Macro-average score across Code & Execution benchmarks.</li>
                <li><strong>Code Execution Cost:</strong> Macro-average cost per problem (USD) across Code & Execution benchmarks.</li>
                <li><strong>Data Analysis Score:</strong> Macro-average score across Data Analysis benchmarks.</li>
                <li><strong>Data Analysis Cost:</strong> Macro-average cost per problem (USD) across Data Analysis benchmarks.</li>
                <li><strong>End-to-End Discovery Score:</strong> Macro-average score across End-to-End Discovery benchmarks.</li>
                <li><strong>End-to-End Discovery Cost:</strong> Macro-average cost per problem (USD) across End-to-End Discovery benchmarks.</li>
                <li><strong>Categories Attempted:</strong> Number of core categories with at least one benchmark attempted (out of 4).</li>
                <li><strong>Logs:</strong> View evaluation run logs (e.g., outputs, traces).</li>
            </ul>
        """
    elif table in INFORMAL_TO_FORMAL_NAME_MAP.values():
        # Category-level tables (e.g., "Literature Understanding"). This gate
        # assumes the formal category names are the values of INFORMAL_TO_FORMAL_NAME_MAP.
        return f"""
            <ul>
                <li><strong>Agent:</strong> Name of the evaluated agent.</li>
                <li><strong>Submitter:</strong> Organization or individual who submitted the agent for evaluation.</li>
                <li><strong>LLM Base:</strong> Model(s) used by the agent. Hover over ⓘ to view all.</li>
                <li><strong>{table} Score:</strong> Macro-average score across {table} benchmarks.</li>
                <li><strong>{table} Cost:</strong> Macro-average cost per problem (USD) across {table} benchmarks.</li>
                <li><strong>Benchmark Score:</strong> Average (mean) score on the benchmark.</li>
                <li><strong>Benchmark Cost:</strong> Average (mean) cost per problem (USD) on the benchmark.</li>
                <li><strong>Benchmarks Attempted:</strong> Number of benchmarks attempted in this category (e.g., 3/5).</li>
                <li><strong>Logs:</strong> View evaluation run logs (e.g., outputs, traces).</li>
            </ul>
        """
    else:
        # Fallback for any other table type, e.g., individual benchmarks
        return f"""
            <ul>
                <li><strong>Agent:</strong> Name of the evaluated agent.</li>
                <li><strong>Submitter:</strong> Organization or individual who submitted the agent for evaluation.</li>
                <li><strong>LLM Base:</strong> Model(s) used by the agent. Hover over ⓘ to view all.</li>
                <li><strong>Benchmark Attempted:</strong> Indicates whether the agent attempted this benchmark.</li>
                <li><strong>{table} Score:</strong> Score achieved by the agent on this benchmark.</li>
                <li><strong>{table} Cost:</strong> Cost incurred by the agent to solve this benchmark (in USD).</li>
                <li><strong>Logs:</strong> View evaluation run logs (e.g., outputs, traces).</li>
            </ul>
        """
# Create HTML for the "Openness" legend items for table
openness_html_items = []
for name, info in OPENNESS_SVG_MAP.items():
uri = get_svg_as_data_uri(info["path"])
    # Each item is its own flexbox container to guarantee alignment
    openness_html_items.append(
        f'<div style="display: flex; align-items: center; gap: 6px;">'
        f'<img src="{uri}" style="height: 16px;" />'
        f'<span>{name}</span>'
        f'</div>'
    )
openness_html = " ".join(openness_html_items)
# Create HTML for the "Tooling" legend items for table
tooling_html_items = []
for name, info in TOOLING_SVG_MAP.items():
uri = get_svg_as_data_uri(info["path"])
    tooling_html_items.append(
        f'<div style="display: flex; align-items: center; gap: 6px;">'
        f'<img src="{uri}" style="height: 16px;" />'
        f'<span>{name}</span>'
        f'</div>'
    )
tooling_html = " ".join(tooling_html_items)
pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()
tooling_tooltip_content = build_tooling_tooltip_content()
def create_legend_markdown(which_table: str) -> str:
"""
Generates the complete HTML for the legend section, including tooltips.
This is used in the main leaderboard display.
"""
descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
legend_markdown = f"""
Pareto
ⓘ
{pareto_tooltip_content}
On frontier
Agent Openness
ⓘ
Agent Openness
Indicates how transparent and reproducible an agent is.
{openness_tooltip_content}
{openness_html}
Agent Tooling
ⓘ
Agent Tooling
Describes the tool usage and execution environment of the agent during evaluation.
{tooling_tooltip_content}
{tooling_html}
Column Descriptions
ⓘ
Column Descriptions
{descriptions_tooltip_content}
"""
return legend_markdown
# Create HTML for plot legend with SVG icons and keys
openness_legend_items = []
for name, info in OPENNESS_SVG_MAP.items():
    uri = get_svg_as_data_uri(info["path"])
    if uri:
        openness_legend_items.append(
            f'<div style="display: flex; align-items: flex-start; gap: 6px;">'
            f'<img src="{uri}" style="height: 14px; margin-top: 2px;" />'
            f'<div>'
            f'<div><strong>{name}</strong></div>'
            f'<div class="legend-description">{info["description"]}</div>'
            f'</div>'
            f'</div>'
        )
tooling_legend_items = []
for name, info in TOOLING_SVG_MAP.items():
    uri = get_svg_as_data_uri(info["path"])
    if uri:
        tooling_legend_items.append(
            f'<div style="display: flex; align-items: flex-start; gap: 6px;">'
            f'<img src="{uri}" style="height: 14px; margin-top: 2px;" />'
            f'<div>'
            f'<div><strong>{name}</strong></div>'
            f'<div class="legend-description">{info["description"]}</div>'
            f'</div>'
            f'</div>'
        )
plot_legend_html = f"""
Pareto
On frontier
Agent Openness
{''.join(openness_legend_items)}
Agent Tooling
{''.join(tooling_legend_items)}
""";
# --- Global State for Viewers (simple caching) ---
CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}
class DummyViewer:
"""A mock viewer to be cached on error. It has a ._load() method
to ensure it behaves like the real LeaderboardViewer."""
def __init__(self, error_df):
self._error_df = error_df
def _load(self):
# The _load method returns the error DataFrame and an empty tag map
return self._error_df, {}
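# Contract sketch (illustrative): DummyViewer mirrors the (DataFrame, tag_map)
# shape of LeaderboardViewer._load(), so downstream code needs no special case:
#   df, tag_map = DummyViewer(pd.DataFrame({"Message": ["err"]}))._load()
#   # df is the error frame; tag_map is {}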
def get_leaderboard_viewer_instance(split: str):
"""
Fetches the LeaderboardViewer for a split, using a cache to avoid
re-downloading data. On error, returns a stable DummyViewer object.
"""
global CACHED_VIEWERS, CACHED_TAG_MAPS
if split in CACHED_VIEWERS:
# Cache hit: return the cached viewer and tag map
return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
# --- Cache miss: try to load data from the source ---
try:
print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
viewer = LeaderboardViewer(
repo_id=RESULTS_DATASET,
config=CONFIG_NAME,
split=split,
is_internal=IS_INTERNAL
)
        # Build the display-friendly ("pretty") tag map
pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
# Cache the results for next time
CACHED_VIEWERS[split] = viewer
CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
return viewer, pretty_tag_map
except Exception as e:
# On ANY error, create a consistent error message and cache a DummyViewer
error_message = f"Error loading data for split '{split}': {e}"
print(format_error(error_message))
dummy_df = pd.DataFrame({"Message": [error_message]})
dummy_viewer = DummyViewer(dummy_df)
dummy_tag_map = {"Overall": []}
# Cache the dummy objects so we don't try to fetch again on this run
CACHED_VIEWERS[split] = dummy_viewer
CACHED_TAG_MAPS[split] = dummy_tag_map
return dummy_viewer, dummy_tag_map
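# Caching behavior (illustrative; split name assumed): the first call per split
# hits the Hub, later calls are served from the module-level caches:
#   viewer, tags = get_leaderboard_viewer_instance("test")  # downloads
#   viewer2, _ = get_leaderboard_viewer_instance("test")    # cache hit
#   assert viewer is viewer2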
def create_leaderboard_display(
full_df: pd.DataFrame,
tag_map: dict,
category_name: str,
split_name: str
):
"""
This UI factory takes pre-loaded data and renders the main DataFrame and Plot
for a given category (e.g., "Overall" or "Literature Understanding").
"""
    # Instantiate the transformer and get the specific view for this category.
    # This factory does not load data itself; it filters the data it receives.
transformer = DataTransformer(full_df, tag_map)
df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
pareto_df = get_pareto_df(df_view)
# Get the list of agents on the frontier. We'll use this list later.
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" style="height: 16px; vertical-align: middle;" />'
if not pareto_df.empty and 'id' in pareto_df.columns:
pareto_agent_names = pareto_df['id'].tolist()
else:
pareto_agent_names = []
df_view['Pareto'] = df_view.apply(
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
axis=1
)
# Create mapping for Openness / tooling
df_view['Icon'] = df_view.apply(
lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
)
# Format cost columns
for col in df_view.columns:
if "Cost" in col:
df_view = format_cost_column(df_view, col)
# Fill NaN scores with 0
for col in df_view.columns:
if "Score" in col:
df_view = format_score_column(df_view, col)
scatter_plot = plots_dict.get('scatter_plot', go.Figure())
    # Clean and format the LLM Base column
df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
# append the repro url to the end of the agent name
if 'Source' in df_view.columns:
df_view['Agent'] = df_view.apply(
lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
axis=1
)
all_cols = df_view.columns.tolist()
    # Move the Pareto and Icon columns to the front
all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
df_view = df_view[all_cols]
# Drop internally used columns that are not needed in the display
columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
df_headers = df_view.columns.tolist()
df_datatypes = []
for col in df_headers:
if col == "Logs" or "Cost" in col or "Score" in col:
df_datatypes.append("markdown")
elif col in ["Agent","Icon","LLM Base", "Pareto"]:
df_datatypes.append("html")
else:
df_datatypes.append("str")
header_rename_map = {
"Pareto": "",
"Icon": "",
}
    # Blank out the Pareto and Icon headers for display.
df_view = df_view.rename(columns=header_rename_map)
# Dynamically set widths for the DataFrame columns
fixed_start_widths = [40, 40, 200, 100, 200]
num_score_cost_cols = 0
remaining_headers = df_headers[len(fixed_start_widths):]
for col in remaining_headers:
if "Score" in col or "Cost" in col:
num_score_cost_cols += 1
dynamic_widths = [90] * num_score_cost_cols
fixed_end_widths = [90, 100, 50]
    # Combine the lists to create the final, fully dynamic width list.
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
with gr.Row():
with gr.Column(scale=3):
plot_component = gr.Plot(
value=scatter_plot,
show_label=False,
)
with gr.Column(scale=1):
gr.HTML(value=plot_legend_html)
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
# Put table and key into an accordion
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
dataframe_component = gr.DataFrame(
headers=df_headers,
value=df_view,
datatype=df_datatypes,
interactive=False,
wrap=True,
column_widths=final_column_widths,
elem_classes=["wrap-header-df"],
show_search="search",
)
legend_markdown = create_legend_markdown(category_name)
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
# Return the components so they can be referenced elsewhere.
return plot_component, dataframe_component
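# Usage sketch (illustrative; split/category names assumed) -- must run inside
# a gr.Blocks() context:
#   with gr.Blocks() as demo:
#       df, tag_map = get_full_leaderboard_data("test")
#       plot, table = create_leaderboard_display(df, tag_map, "Overall", "test")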
# --- Detailed Benchmark Display ---
def create_benchmark_details_display(
full_df: pd.DataFrame,
tag_map: dict,
category_name: str,
validation: bool = False,
):
"""
Generates a detailed breakdown for each benchmark within a given category.
For each benchmark, it creates a title, a filtered table, and a scatter plot.
Args:
full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
tag_map (dict): The "pretty" tag map to find the list of benchmarks.
        category_name (str): The main category to display details for (e.g., "Literature Understanding").
        validation (bool): Whether this display is for the validation split (used when building anchor IDs).
    """
# 1. Get the list of benchmarks for the selected category
benchmark_names = tag_map.get(category_name, [])
if not benchmark_names:
gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
return
    gr.HTML(f'<h2>{category_name} Detailed Benchmark Results</h2>')
gr.Markdown("---")
# 2. Loop through each benchmark and create its UI components
for benchmark_name in benchmark_names:
        # Heading with an anchor ID so the sub-navigation buttons can scroll to this benchmark
        gr.HTML(f'''
            <h3 id="{create_gradio_anchor_id(benchmark_name, validation)}">{benchmark_name}</h3>
        ''')
# 3. Prepare the data for this specific benchmark's table and plot
benchmark_score_col = f"{benchmark_name} Score"
benchmark_cost_col = f"{benchmark_name} Cost"
# Define the columns needed for the detailed table
table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
# Filter to only columns that actually exist in the full dataframe
existing_table_cols = [col for col in table_cols if col in full_df.columns]
if benchmark_score_col not in existing_table_cols:
gr.Markdown(f"Score data for {benchmark_name} not available.")
continue # Skip to the next benchmark if score is missing
# Create a specific DataFrame for the table view
benchmark_table_df = full_df[existing_table_cols].copy()
pareto_df = get_pareto_df(benchmark_table_df)
# Get the list of agents on the frontier. We'll use this list later.
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
        trophy_icon_html = f'<img src="{trophy_uri}" style="height: 16px; vertical-align: middle;" />'
if not pareto_df.empty and 'id' in pareto_df.columns:
pareto_agent_names = pareto_df['id'].tolist()
else:
pareto_agent_names = []
benchmark_table_df['Pareto'] = benchmark_table_df.apply(
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
axis=1
)
benchmark_table_df['Icon'] = benchmark_table_df.apply(
lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
)
        # Clean and format the LLM Base column
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
# append the repro url to the end of the agent name
if 'Source' in benchmark_table_df.columns:
benchmark_table_df['Agent'] = benchmark_table_df.apply(
lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
axis=1
)
        # Calculate and add the "Attempted Benchmark" column
def check_benchmark_status(row):
has_score = pd.notna(row.get(benchmark_score_col))
has_cost = pd.notna(row.get(benchmark_cost_col))
if has_score and has_cost:
return "✅"
if has_score or has_cost:
return "⚠️"
return "🚫 "
# Apply the function to create the new column
benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
# Sort the DataFrame
if benchmark_score_col in benchmark_table_df.columns:
benchmark_table_df = benchmark_table_df.sort_values(
by=benchmark_score_col, ascending=False, na_position='last'
)
        # Format the cost and score columns
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
desired_cols_in_order = [
'Pareto',
'Icon',
'Agent',
'Submitter',
'LLM Base',
'Attempted Benchmark',
benchmark_score_col,
benchmark_cost_col,
'Date',
'Logs'
]
for col in desired_cols_in_order:
if col not in benchmark_table_df.columns:
benchmark_table_df[col] = pd.NA # Add as an empty column
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
        # Rename the benchmark-specific columns for a cleaner table display
        benchmark_table_df.rename(columns={
            benchmark_score_col: 'Score',
            benchmark_cost_col: 'Cost',
        }, inplace=True)
        # Assign Gradio datatypes so Logs, cost, and score columns render as markdown and icon columns as HTML
df_headers = benchmark_table_df.columns.tolist()
df_datatypes = []
for col in df_headers:
if "Logs" in col or "Cost" in col or "Score" in col:
df_datatypes.append("markdown")
elif col in ["Agent", "Icon", "LLM Base", "Pareto"]:
df_datatypes.append("html")
else:
df_datatypes.append("str")
        # Blank out the Pareto and Icon column headers for display
header_rename_map = {
"Pareto": "",
"Icon": "",
}
        # Apply the display renames.
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
benchmark_plot = _plot_scatter_plotly(
data=full_df,
x=benchmark_cost_col,
y=benchmark_score_col,
agent_col="Agent",
name=benchmark_name
)
with gr.Row():
with gr.Column(scale=3):
gr.Plot(value=benchmark_plot, show_label=False)
with gr.Column(scale=1):
gr.HTML(value=plot_legend_html)
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
# Put table and key into an accordion
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
gr.DataFrame(
headers=df_headers,
value=benchmark_table_df,
datatype=df_datatypes,
interactive=False,
wrap=True,
column_widths=[40, 40, 200, 150, 175, 85, 100, 100, 80, 40],
show_search="search",
elem_classes=["wrap-header-df"]
)
legend_markdown = create_legend_markdown(benchmark_name)
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
"""
Loads and transforms the complete dataset for a given split.
This function handles caching and returns the final "pretty" DataFrame and tag map.
"""
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
raw_df, _ = viewer_or_data._load()
if raw_df.empty:
return pd.DataFrame(), {}
pretty_df = transform_raw_dataframe(raw_df)
pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
if "Logs" in pretty_df.columns:
def format_log_entry_to_html(raw_uri):
if pd.isna(raw_uri) or raw_uri == "": return ""
web_url = hf_uri_to_web_url(str(raw_uri))
return hyperlink(web_url, "🔗") if web_url else ""
# Apply the function to the "Logs" column
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
if "Source" in pretty_df.columns:
def format_source_url_to_html(raw_url):
# Handle empty or NaN values, returning a blank string.
if pd.isna(raw_url) or raw_url == "": return ""
# Assume 'source_url' is already a valid web URL and doesn't need conversion.
return hyperlink(str(raw_url), "🔗")
# Apply the function to the "source_url" column.
pretty_df["Source"] = pretty_df["Source"].apply(format_source_url_to_html)
return pretty_df, pretty_tag_map
# Fallback for unexpected types
return pd.DataFrame(), {}
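# Example (illustrative; split name assumed): fetch the transformed frame once
# and reuse it for every category display on the page:
#   pretty_df, tag_map = get_full_leaderboard_data("test")
#   categories = list(tag_map)  # e.g., ["Overall", "Literature Understanding", ...]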
def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
"""
Builds the entire sub-navigation bar as a single, self-contained HTML component.
This bypasses Gradio's layout components, giving us full control.
"""
benchmark_names = tag_map.get(category_name, [])
if not benchmark_names:
# Return an empty HTML component to prevent errors
return gr.HTML()
# Start building the list of HTML button elements as strings
html_buttons = []
for name in benchmark_names:
target_id = create_gradio_anchor_id(name, validation)
        # Create a standard HTML button whose onclick calls a global JS
        # scroll handler defined elsewhere in the app. The handler name
        # "scrollToElement" below is an assumption; match it to the function
        # actually registered in the page's JS.
        button_str = f"""
            <button class="sub-nav-button" onclick="scrollToElement('{target_id}')">{name}</button>
        """
html_buttons.append(button_str)
    # Join the button strings and wrap them in a single flexbox container.
    full_html = f"""
        <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 8px;">
            <span>Benchmarks in this category:</span>
            {''.join(html_buttons)}
        </div>
    """
# Return the entire navigation bar as one single Gradio HTML component
return gr.HTML(full_html)
def format_llm_base_with_html(value):
"""
Formats the 'LLM Base' cell value.
    If the value is a list with more than one element, it returns an
    HTML span with the full list in a hover-over tooltip.
If it's a single-element list, it returns just that element.
Otherwise, it returns the original value.
"""
if isinstance(value, list):
if len(value) > 1:
# Join the list items with a newline character for a clean tooltip
tooltip_text = "\n".join(map(str, value))
            # Return an HTML span with the title attribute for the tooltip
            return f'<span title="{tooltip_text}">{value[0]} (+ {len(value) - 1}) ⓘ</span>'
if len(value) == 1:
# If only one item, just return that item
return value[0]
# Return the value as-is if it's not a list or is an empty list
return value
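# Examples (illustrative; model names hypothetical):
#   format_llm_base_with_html(["gpt-4o", "claude-3-5-sonnet"])
#     -> '<span title="gpt-4o\nclaude-3-5-sonnet">gpt-4o (+ 1) ⓘ</span>'
#   format_llm_base_with_html(["gpt-4o"])  -> "gpt-4o"
#   format_llm_base_with_html("n/a")       -> "n/a"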