Amber Tanaka committed
Commit 1e64d2b Β· unverified Β· Parent: b36c3c5

Table Legends Refactor (#57)

Files changed (4)
  1. README.md +5 -2
  2. app.py +1 -2
  3. content.py +82 -0
  4. ui_components.py +201 -59
README.md CHANGED
@@ -18,8 +18,11 @@ tags:
   The leaderboard is built using the [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) library, which provides a convenient way to manage and query datasets.
   It's currently pointed at the [AstaBench Leaderboard](https://huggingface.co/datasets/allenai/asta-bench-internal-results/) dataset, which is a public dataset hosted on HuggingFace.
 
- To run the leaderboard locally, you can use the following command:
-
+ To run the leaderboard locally, first make sure to set this env variable:
+ ```bash
+ export IS_INTERNAL=true
+ ```
+ You can then start it up with the following command:
  ```bash
  python app.py
  ```
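For reference, the two setup steps amount to reading one flag before launch. The snippet below is a hypothetical sketch of how `IS_INTERNAL` would typically be consumed in the app; the variable name comes from the README above, but the parsing logic is an assumption, not code from this commit:

```python
import os

# Hypothetical sketch: parse the IS_INTERNAL flag set in the README step.
# The exact truthy values accepted by the real app are an assumption.
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() in ("1", "true", "yes")

if IS_INTERNAL:
    print("Running against the internal results dataset.")
```

Equivalently, both steps can be collapsed into a single shell invocation: `IS_INTERNAL=true python app.py`.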
app.py CHANGED
@@ -1,6 +1,5 @@
  # app.py
  import gradio as gr
- import os
  import urllib.parse
 
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -165,7 +164,7 @@ def restart_space_job():
  if __name__ == "__main__":
      if LOCAL_DEBUG:
          print("Launching in LOCAL_DEBUG mode.")
-         def get_initial_global_tag_choices(): return ["Overall", "TagA"]
+         def get_initial_global_tag_choices(): return ["Overall"]
          demo.launch(debug=True, allowed_paths=["assets"])
      else:
          print("Launching in Space mode.")
content.py CHANGED
@@ -445,6 +445,88 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
  #main-header h2 {
      color: #f0529c;
  }
+
+ /* --- New HTML-Based Tooltip Styles --- */
+ .tooltip-icon-legend {
+     position: relative;
+     cursor: help;
+     display: inline-block;
+ }
+
+ /* The HTML pop-up card. */
+ .tooltip-card {
+     /* Hiding mechanism */
+     opacity: 0;
+     visibility: hidden;
+     transition: opacity 0.2s;
+     pointer-events: none;
+
+     /* Card appearance */
+     position: absolute;
+     bottom: 125%;
+     left: 50%;
+     transform: translateX(-50%);
+     z-index: 1000;
+     background-color: #083c40;
+     color: #e5e7eb;
+     border-radius: 12px;
+     padding: 15px;
+     width: max-content;
+     max-width: 400px;
+     text-align: left;
+ }
+
+ .tooltip-icon-legend:hover .tooltip-card {
+     opacity: 1;
+     visibility: visible;
+ }
+
+ .tooltip-card h3 {
+     font-size: 18px;
+     color: #fff;
+     margin-top: 0;
+     margin-bottom: 12px;
+ }
+ .tooltip-card .tooltip-description {
+     margin-bottom: 20px;
+     line-height: 1.3;
+ }
+ .tooltip-card .tooltip-items-container {
+     display: flex;
+     flex-direction: column;
+     gap: 10px;
+ }
+ .tooltip-card .tooltip-legend-item {
+     display: flex;
+     align-items: flex-start;
+     gap: 10px;
+ }
+ .tooltip-card .tooltip-legend-item img {
+     width: 20px;
+     height: 20px;
+     margin-top: 2px;
+ }
+ .tooltip-card .tooltip-legend-item div {
+     display: flex;
+     flex-direction: column;
+ }
+ .tooltip-card .tooltip-legend-item strong {
+     font-weight: 600;
+     color: #fff;
+ }
+ .tooltip-card .tooltip-legend-item span {
+     font-size: 13px;
+     line-height: 1.3;
+ }
+ .tooltip-sub-list {
+     list-style-type: 'β€’ ';
+     padding-left: 18px;
+     font-size: 13px;
+     line-height: 1.3;
+     display: flex;
+     flex-direction: column;
+ }
+
+ /* About Page CSS */
  #about-page-content-wrapper {
      margin-left: auto;
      margin-right: auto;
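These rules implement a pure-CSS tooltip: the card is hidden by default (`opacity: 0; visibility: hidden`) and revealed by the `:hover` rule on `.tooltip-icon-legend`, with no JavaScript involved. The markup shape they expect, as generated by `ui_components.py` further down, is roughly:

```python
# Minimal markup pair the tooltip CSS targets: a hover trigger wrapping a hidden card.
tooltip_html = """
<span class="tooltip-icon-legend">
    β“˜
    <span class="tooltip-card">
        <h3>Example Tooltip</h3>
        <p class="tooltip-description">Becomes visible while the β“˜ icon is hovered.</p>
    </span>
</span>
"""
```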
ui_components.py CHANGED
@@ -41,36 +41,36 @@ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
  # Global variables
  COMBINED_ICON_MAP = {
      "Open Source + Open Weights": {
-         "Standard": "assets/os-ow-standard.svg",  # Bright pink star
-         "Custom with Standard Search": "assets/os-ow-equivalent.svg",  # Bright pink diamond
-         "Custom": "assets/os-ow-custom.svg",  # Bright pink triangle
+         "Standard": "assets/os-ow-standard.svg",
+         "Custom with Standard Search": "assets/os-ow-equivalent.svg",
+         "Custom": "assets/os-ow-custom.svg",
      },
      "Open Source": {
-         "Standard": "assets/os-standard.svg",  # Orange/pink star
-         "Custom with Standard Search": "assets/os-equivalent.svg",  # Orange/pink diamond
-         "Fully Custom": "assets/os-custom.svg",  # Orange/pink triangle
+         "Standard": "assets/os-standard.svg",
+         "Custom with Standard Search": "assets/os-equivalent.svg",
+         "Fully Custom": "assets/os-custom.svg",
      },
      "API Available": {
-         "Standard": "assets/api-standard.svg",  # Yellow/pink star
-         "Custom with Standard Search": "assets/api-equivalent.svg",  # Yellow/pink diamond
-         "Fully Custom": "assets/api-custom.svg",  # Yellow/pink triangle
+         "Standard": "assets/api-standard.svg",
+         "Custom with Standard Search": "assets/api-equivalent.svg",
+         "Fully Custom": "assets/api-custom.svg",
      },
      "Closed": {
-         "Standard": "assets/c-standard.svg",  # Hollow pink star
-         "Equivalent": "assets/c-equivalent.svg",  # Hollow pink diamond
-         "Fully Custom": "assets/c-custom.svg",  # Hollow pink triangle
+         "Standard": "assets/c-standard.svg",
+         "Equivalent": "assets/c-equivalent.svg",
+         "Fully Custom": "assets/c-custom.svg",
      }
  }
  OPENNESS_SVG_MAP = {
-     "Open Source + Open Weights": "assets/os-ow-standard.svg",
-     "Open Source": "assets/os-standard.svg",
-     "API Available": "assets/api-standard.svg",
-     "Closed": "assets/c-standard.svg",
+     "Open Source + Open Weights": "assets/os-ow-legend.svg",
+     "Open Source": "assets/os-legend.svg",
+     "API Available": "assets/api-legend.svg",
+     "Closed": "assets/c-legend.svg",
  }
  TOOLING_SVG_MAP = {
-     "Standard": "assets/os-ow-standard.svg",
-     "Custom with Standard Search": "assets/os-ow-equivalent.svg",
-     "Fully Custom": "assets/os-ow-custom.svg",
+     "Standard": "assets/standard-legend.svg",
+     "Custom with Standard Search": "assets/equivalent-legend.svg",
+     "Fully Custom": "assets/custom-legend.svg",
  }
 
  def get_svg_as_data_uri(path: str) -> str:
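All of these asset paths are converted to inline `data:` URIs by `get_svg_as_data_uri`, whose body sits outside the diff context. A minimal sketch of such a helper, assuming a straightforward base64 encoding of the file contents:

```python
import base64
from pathlib import Path

def get_svg_as_data_uri(path: str) -> str:
    """Read an SVG file and return it as a base64 data URI for use in <img src>."""
    svg_bytes = Path(path).read_bytes()
    encoded = base64.b64encode(svg_bytes).decode("utf-8")
    return f"data:image/svg+xml;base64,{encoded}"
```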
@@ -122,6 +122,126 @@ def create_svg_html(value, svg_map):
          return f'<img src="{src}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
      return ""
 
+ def build_openness_tooltip_content() -> str:
+     """Generates the inner HTML for the Agent Openness tooltip card."""
+     descriptions = {
+         "Open Source + Open Weights": "Both code and ML models are open",
+         "Open Source": "Code is open but uses an ML model with closed weights",
+         "API Available": "No access to code; API access only",
+         "Closed": "No access to code or API; UI access only",
+     }
+     html_items = []
+     for name, path in OPENNESS_SVG_MAP.items():
+         uri = get_svg_as_data_uri(path)
+         desc = descriptions.get(name, "")
+
+         # Create the HTML for a single row in the tooltip legend
+         html_items.append(f"""
+             <div class="tooltip-legend-item">
+                 <img src="{uri}" alt="{name}">
+                 <div>
+                     <strong>{name}</strong>
+                     <span>{desc}</span>
+                 </div>
+             </div>
+         """)
+
+     return "".join(html_items)
+
+ def build_pareto_tooltip_content() -> str:
+     """Generates the inner HTML for the Pareto tooltip card with final copy."""
+     return """
+         <h3>On Pareto Frontier</h3>
+         <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p>
+         <p class="tooltip-description">Agents on the frontier either:</p>
+         <ul class="tooltip-sub-list">
+             <li>Offer the lowest cost for a given performance, or</li>
+             <li>Deliver the best performance at a given cost.</li>
+         </ul>
+         <p class="tooltip-description" style="margin-top: 12px;">These agents are marked with this icon: πŸ†</p>
+     """
+
+ def build_tooling_tooltip_content() -> str:
+     """Generates the inner HTML for the Agent Tooling tooltip card."""
+     descriptions = {
+         "Standard": "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
+         "Custom with Standard Search": "Custom tools for accessing an equivalent underlying environment:",
+         "Fully Custom": "Uses tools beyond the constraints of the Standard or Custom interface.",
+     }
+     custom_interface_sub_list = """
+         <ul class="tooltip-sub-list">
+             <li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
+             <li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
+         </ul>
+     """
+     html_items = []
+     for name, path in TOOLING_SVG_MAP.items():
+         uri = get_svg_as_data_uri(path)
+         desc = descriptions.get(name, "")
+
+         # Check if this is the special case that needs a sub-list
+         sub_list_html = custom_interface_sub_list if name == "Custom with Standard Search" else ""
+
+         html_items.append(f"""
+             <div class="tooltip-legend-item">
+                 <img src="{uri}" alt="{name}">
+                 <div>
+                     <strong>{name}</strong>
+                     <span>{desc}</span>
+                     {sub_list_html}
+                 </div>
+             </div>
+         """)
+
+     return "".join(html_items)
+
+
+ def build_descriptions_tooltip_content(table) -> str:
+     """Generates the inner HTML for the Column Descriptions tooltip card, depending on the kind of table."""
+     if table == "Overall":
+         return """
+             <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
+             <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
+             <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over β“˜ to view all.</div>
+             <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
+             <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Based on submission-time values. Each category contributes equally.</div>
+             <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
+             <div class="tooltip-description-item"><b>Literature Understanding Cost:</b> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</div>
+             <div class="tooltip-description-item"><b>Code Execution Score:</b> Macro-average score across Code & Execution benchmarks.</div>
+             <div class="tooltip-description-item"><b>Code Execution Cost:</b> Macro-average cost per problem (USD) across Code & Execution benchmarks.</div>
+             <div class="tooltip-description-item"><b>Data Analysis Score:</b> Macro-average score across Data Analysis benchmarks.</div>
+             <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
+             <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
+             <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD) across End-to-End Discovery benchmarks.</div>
+             <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</div>
+             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
+         """
+     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
+         return f"""
+             <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
+             <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
+             <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over β“˜ to view all.</div>
+             <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
+             <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
+             <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
+             <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
+             <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
+             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
+         """
+     else:
+         # Fallback for any other table type, e.g., individual benchmarks
+         return f"""
+             <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
+             <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
+             <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over β“˜ to view all.</div>
+             <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
+             <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
+             <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
+             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
+         """
+
  # Dynamically generate the correct HTML for the legend parts
  openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
  tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
@@ -150,47 +270,67 @@ for name, path in TOOLING_SVG_MAP.items():
      )
  tooling_html = " ".join(tooling_html_items)
 
- # Your final legend_markdown string (the structure of this does not change)
- legend_markdown = f"""
- <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 24px; font-size: 14px; padding-bottom: 8px;">
-
-     <div> <!-- Container for the Pareto section -->
-         <b>Pareto</b><span class="tooltip-icon" data-tooltip="Indicates if agent is on the Pareto frontier
- ">β“˜</span>
-         <div style="padding-top: 4px;"><span>πŸ† On frontier</span></div>
-     </div>
-
-     <div> <!-- Container for the Openness section -->
-         <b>Agent Openness</b><span class="tooltip-icon" data-tooltip="β€’Closed: No API or code available
- β€’API Available: API available, but no code
- β€’Open Source: Code available, but no weights
- β€’Open Source + Open Weights: Code and weights available
- ">β“˜</span>
-         <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{openness_html}</div>
-     </div>
-
-     <div> <!-- Container for the Tooling section -->
-         <b>Agent Tooling</b><span class="tooltip-icon" data-tooltip="β€’ Standard: Standard Approach used by the agent
- β€’ Custom with Standard Search: Standard search used by the agent
- β€’ Fully Custom: Fully custom tools used by the agent
- ">β“˜</span>
-         <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{tooling_html}</div>
-     </div>
-
-     <div><b>Column Descriptions</b><span class="tooltip-icon" data-tooltip="β€’ Overall Score: Performance across all benchmarks
- β€’ Overall Cost: Cost per task in USD
- β€’ Literature Understanding Score: Performance on scientific literature tasks
- β€’ Literature Understanding Cost: Cost per literature understanding task in USD
- β€’ Data Analysis Score: Performance on data analysis tasks
- β€’ Code Execution Score: Performance on coding tasks
- β€’ Code Execution Cost: Cost per code execution task in USD
- β€’ Discovery Score: Performance on information discovery tasks
- β€’ Discovery Cost: Cost per discovery task in USD
- β€’ Categories Attempted: Number of benchmark categories the agent participated in
- β€’ Logs: Link to detailed evaluation logs">β“˜</span></div>
- </div>
- """
+ pareto_tooltip_content = build_pareto_tooltip_content()
+ openness_tooltip_content = build_openness_tooltip_content()
+ tooling_tooltip_content = build_tooling_tooltip_content()
+
+ def create_legend_markdown(which_table: str) -> str:
+     """
+     Generates the complete HTML for the legend section, including tooltips.
+     This is used in the main leaderboard display.
+     """
+     descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
+     legend_markdown = f"""
+     <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 10px; font-size: 14px; padding-bottom: 8px;">
+
+         <div> <!-- Container for the Pareto section -->
+             <b>Pareto</b>
+             <span class="tooltip-icon-legend">
+                 β“˜
+                 <span class="tooltip-card">{pareto_tooltip_content}</span>
+             </span>
+             <div style="margin-top: 8px;"><span>πŸ† On frontier</span></div>
+         </div>
+
+         <div> <!-- Container for the Openness section -->
+             <b>Agent Openness</b>
+             <span class="tooltip-icon-legend">
+                 β“˜
+                 <span class="tooltip-card">
+                     <h3>Agent Openness</h3>
+                     <p class="tooltip-description">Indicates how transparent and reproducible an agent is.</p>
+                     <div class="tooltip-items-container">{openness_tooltip_content}</div>
+                 </span>
+             </span>
+             <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 8px;">{openness_html}</div>
+         </div>
+
+         <div> <!-- Container for the Tooling section -->
+             <b>Agent Tooling</b>
+             <span class="tooltip-icon-legend">
+                 β“˜
+                 <span class="tooltip-card">
+                     <h3>Agent Tooling</h3>
+                     <p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
+                     <div class="tooltip-items-container">{tooling_tooltip_content}</div>
+                 </span>
+             </span>
+             <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 8px;">{tooling_html}</div>
+         </div>
+
+         <div><!-- Container for the Column Descriptions section -->
+             <b>Column Descriptions</b>
+             <span class="tooltip-icon-legend">
+                 β“˜
+                 <span class="tooltip-card">
+                     <h3>Column Descriptions</h3>
+                     <div class="tooltip-items-container">{descriptions_tooltip_content}</div>
+                 </span>
+             </span>
+         </div>
+     </div>
+     """
+     return legend_markdown
 
  # --- Global State for Viewers (simple caching) ---
  CACHED_VIEWERS = {}
@@ -330,7 +470,6 @@ def create_leaderboard_display(
      gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
      # Put table and key into an accordion
      with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
-         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
          dataframe_component = gr.DataFrame(
              headers=df_headers,
              value=df_view,
@@ -340,6 +479,8 @@
              column_widths=[40, 40, 200, 200],
              elem_classes=["wrap-header-df"]
          )
+         legend_markdown = create_legend_markdown(category_name)
+         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
 
      # Return the components so they can be referenced elsewhere.
      return plot_component, dataframe_component
@@ -487,7 +628,6 @@ def create_benchmark_details_display(
      gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
      # Put table and key into an accordion
      with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
-         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
          gr.DataFrame(
              headers=df_headers,
              value=benchmark_table_df,
@@ -497,6 +637,8 @@
              column_widths=[40, 40, 200, 150, 175, 85],
              elem_classes=["wrap-header-df"]
          )
+         legend_markdown = create_legend_markdown(benchmark_name)
+         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
 
  def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
      """
 