Amber Tanaka committed on
Commit
b48ebfb
·
unverified ·
1 Parent(s): ae05bbd

Add llm base (#14)

Browse files
Files changed (3) hide show
  1. content.py +6 -2
  2. leaderboard_transformer.py +5 -1
  3. ui_components.py +176 -139
content.py CHANGED
@@ -319,10 +319,14 @@ html:not(.dark) #legend-markdown .light-mode-icon,
319
  pointer-events: none;
320
  left: 50%;
321
  transform: translateX(-50%);
322
- z-index: 1000; /* Ensure it appears above other elements */
323
  }
324
-
325
  .tooltip-icon:hover::after {
326
  opacity: 1;
327
  }
 
 
 
 
 
328
  """
 
319
  pointer-events: none;
320
  left: 50%;
321
  transform: translateX(-50%);
322
+ z-index: 1000;
323
  }
 
324
  .tooltip-icon:hover::after {
325
  opacity: 1;
326
  }
327
+ /* --- inside table tooltips --- */
328
+ .native-tooltip-icon {
329
+ cursor: help;
330
+ text-decoration: underline dotted 1px;
331
+ }
332
  """
leaderboard_transformer.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  import logging
5
  from typing import Optional
6
  import base64
 
7
 
8
  logger = logging.getLogger(__name__)
9
 
@@ -68,6 +69,7 @@ def _pretty_column_name(raw_col: str) -> str:
68
  'Logs': 'Logs',
69
  'Openness': 'Openness',
70
  'Agent tooling': 'Agent Tooling',
 
71
  }
72
 
73
  if raw_col in fixed_mappings:
@@ -233,7 +235,7 @@ class DataTransformer:
233
  df_view = df_view.drop(columns=['Submitter'])
234
 
235
  # 4. Build the List of Columns to Display (now simplified)
236
- base_cols = ["id","Agent","agent_for_hover"]
237
  new_cols = ["Openness", "Agent Tooling"]
238
  ending_cols = ["Logs"]
239
 
@@ -585,3 +587,5 @@ def svg_to_data_uri(path: str) -> str:
585
  except FileNotFoundError:
586
  logger.warning(f"SVG file not found at: {path}")
587
  return None
 
 
 
4
  import logging
5
  from typing import Optional
6
  import base64
7
+ import html
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
69
  'Logs': 'Logs',
70
  'Openness': 'Openness',
71
  'Agent tooling': 'Agent Tooling',
72
+ 'LLM base': 'LLM Base',
73
  }
74
 
75
  if raw_col in fixed_mappings:
 
235
  df_view = df_view.drop(columns=['Submitter'])
236
 
237
  # 4. Build the List of Columns to Display (now simplified)
238
+ base_cols = ["id","Agent","LLM Base", "agent_for_hover"]
239
  new_cols = ["Openness", "Agent Tooling"]
240
  ending_cols = ["Logs"]
241
 
 
587
  except FileNotFoundError:
588
  logger.warning(f"SVG file not found at: {path}")
589
  return None
590
+
591
+
ui_components.py CHANGED
@@ -301,7 +301,9 @@ def create_leaderboard_display(
301
  if "Score" in col:
302
  df_view = format_score_column(df_view, col)
303
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
304
-
 
 
305
 
306
  all_cols = df_view.columns.tolist()
307
  # Remove 'Pareto' from the list and insert it at the beginning
@@ -316,7 +318,7 @@ def create_leaderboard_display(
316
  for col in df_headers:
317
  if col in ["Logs", "Agent"] or "Cost" in col or "Score" in col:
318
  df_datatypes.append("markdown")
319
- elif col in ["Openness", "Agent Tooling"]:
320
  df_datatypes.append("html")
321
  else:
322
  df_datatypes.append("str")
@@ -344,13 +346,157 @@ def create_leaderboard_display(
344
  datatype=df_datatypes,
345
  interactive=False,
346
  wrap=True,
347
- column_widths=[30, 30, 30, 200],
348
  elem_classes=["wrap-header-df"]
349
  )
350
 
351
  # Return the components so they can be referenced elsewhere.
352
  return plot_component, dataframe_component
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
355
  """
356
  Loads and transforms the complete dataset for a given split.
@@ -427,143 +573,34 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str):
427
  # Return the entire navigation bar as one single Gradio HTML component
428
  return gr.HTML(full_html)
429
 
430
- # # --- Detailed Benchmark Display ---
431
- def create_benchmark_details_display(
432
- full_df: pd.DataFrame,
433
- tag_map: dict,
434
- category_name: str
435
- ):
436
  """
437
- Generates a detailed breakdown for each benchmark within a given category.
438
- For each benchmark, it creates a title, a filtered table, and a scatter plot.
439
- Args:
440
- full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
441
- tag_map (dict): The "pretty" tag map to find the list of benchmarks.
442
- category_name (str): The main category to display details for (e.g., "Literature Understanding").
443
  """
444
- # 1. Get the list of benchmarks for the selected category
445
- benchmark_names = tag_map.get(category_name, [])
446
-
447
- if not benchmark_names:
448
- gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
449
- return
450
-
451
- gr.Markdown("---")
452
- gr.Markdown("## Detailed Benchmark Results")
453
-
454
- # 2. Loop through each benchmark and create its UI components
455
- for benchmark_name in benchmark_names:
456
- gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
457
-
458
- # 3. Prepare the data for this specific benchmark's table and plot
459
- benchmark_score_col = f"{benchmark_name} Score"
460
- benchmark_cost_col = f"{benchmark_name} Cost"
461
-
462
- # Define the columns needed for the detailed table
463
- table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id']
464
-
465
- # Filter to only columns that actually exist in the full dataframe
466
- existing_table_cols = [col for col in table_cols if col in full_df.columns]
467
-
468
- if benchmark_score_col not in existing_table_cols:
469
- gr.Markdown(f"Score data for {benchmark_name} not available.")
470
- continue # Skip to the next benchmark if score is missing
471
-
472
- # Create a specific DataFrame for the table view
473
- benchmark_table_df = full_df[existing_table_cols].copy()
474
- pareto_df = get_pareto_df(benchmark_table_df)
475
- # Get the list of agents on the frontier. We'll use this list later.
476
- if not pareto_df.empty and 'id' in pareto_df.columns:
477
- pareto_agent_names = pareto_df['id'].tolist()
478
- else:
479
- pareto_agent_names = []
480
- benchmark_table_df['Pareto'] = benchmark_table_df.apply(
481
- lambda row: '📈' if row['id'] in pareto_agent_names else '',
482
- axis=1
483
- )
484
-
485
- benchmark_table_df['Openness'] = benchmark_table_df['Openness'].apply(lambda x: create_svg_html(x, OPENNESS_SVG_MAP))
486
- benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(lambda x: create_svg_html(x, TOOLING_SVG_MAP))
487
-
488
- # Calculated and add "Benchmark Attempted" column
489
- def check_benchmark_status(row):
490
- has_score = pd.notna(row.get(benchmark_score_col))
491
- has_cost = pd.notna(row.get(benchmark_cost_col))
492
- if has_score and has_cost:
493
- return "✅"
494
- if has_score or has_cost:
495
- return "⚠️"
496
- return "🚫 "
497
-
498
- # Apply the function to create the new column
499
- benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
500
- # Sort the DataFrame
501
- if benchmark_score_col in benchmark_table_df.columns:
502
- benchmark_table_df = benchmark_table_df.sort_values(
503
- by=benchmark_score_col, ascending=False, na_position='last'
504
- )
505
- # 1. Format the cost and score columns
506
- benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
507
- benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
508
- desired_cols_in_order = [
509
- 'Pareto',
510
- 'Openness',
511
- 'Agent Tooling',
512
- 'Agent',
513
- 'Submitter',
514
- 'Attempted Benchmark',
515
- benchmark_score_col,
516
- benchmark_cost_col,
517
- 'Logs'
518
- ]
519
- for col in desired_cols_in_order:
520
- if col not in benchmark_table_df.columns:
521
- benchmark_table_df[col] = pd.NA # Add as an empty column
522
- benchmark_table_df = benchmark_table_df[desired_cols_in_order]
523
- # Rename columns for a cleaner table display, as requested
524
- benchmark_table_df.rename({
525
- benchmark_score_col: 'Score',
526
- benchmark_cost_col: 'Cost',
527
- }, inplace=True)
528
- # Ensure the 'Logs' column is formatted correctly
529
- df_headers = benchmark_table_df.columns.tolist()
530
- df_datatypes = []
531
- for col in df_headers:
532
- if "Logs" in col or "Cost" in col or "Score" in col:
533
- df_datatypes.append("markdown")
534
- elif col in ["Openness", "Agent Tooling"]:
535
- df_datatypes.append("html")
536
- else:
537
- df_datatypes.append("str")
538
- # Remove Pareto, Openness, and Agent Tooling from the headers
539
- header_rename_map = {
540
- "Pareto": "",
541
- "Openness": "",
542
- "Agent Tooling": ""
543
- }
544
- # 2. Create the final list of headers for display.
545
- benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
546
- # Create the scatter plot using the full data for context, but plotting benchmark metrics
547
- # This shows all agents on the same axis for better comparison.
548
- benchmark_plot = _plot_scatter_plotly(
549
- data=full_df,
550
- x=benchmark_cost_col,
551
- y=benchmark_score_col,
552
- agent_col="Agent"
553
- )
554
- gr.Plot(value=benchmark_plot)
555
- gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
556
- # Put table and key into an accordion
557
- with gr.Accordion("Details", open=True, elem_id="leaderboard-accordion"):
558
- gr.HTML(value=legend_markdown, elem_id="legend-markdown")
559
- gr.DataFrame(
560
- headers=df_headers,
561
- value=benchmark_table_df,
562
- datatype=df_datatypes,
563
- interactive=False,
564
- wrap=True,
565
- column_widths=[40, 40, 40, 350],
566
- elem_classes=["wrap-header-df"]
567
- )
568
 
 
 
569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  if "Score" in col:
302
  df_view = format_score_column(df_view, col)
303
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
304
+ #Make pretty and format the LLM Base column
305
+ df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
306
+ df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
307
 
308
  all_cols = df_view.columns.tolist()
309
  # Remove 'Pareto' from the list and insert it at the beginning
 
318
  for col in df_headers:
319
  if col in ["Logs", "Agent"] or "Cost" in col or "Score" in col:
320
  df_datatypes.append("markdown")
321
+ elif col in ["Openness", "Agent Tooling","LLM Base"]:
322
  df_datatypes.append("html")
323
  else:
324
  df_datatypes.append("str")
 
346
  datatype=df_datatypes,
347
  interactive=False,
348
  wrap=True,
349
+ column_widths=[30, 30, 30, 200, 200],
350
  elem_classes=["wrap-header-df"]
351
  )
352
 
353
  # Return the components so they can be referenced elsewhere.
354
  return plot_component, dataframe_component
355
 
356
+ # # --- Detailed Benchmark Display ---
357
+ def create_benchmark_details_display(
358
+ full_df: pd.DataFrame,
359
+ tag_map: dict,
360
+ category_name: str
361
+ ):
362
+ """
363
+ Generates a detailed breakdown for each benchmark within a given category.
364
+ For each benchmark, it creates a title, a filtered table, and a scatter plot.
365
+ Args:
366
+ full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
367
+ tag_map (dict): The "pretty" tag map to find the list of benchmarks.
368
+ category_name (str): The main category to display details for (e.g., "Literature Understanding").
369
+ """
370
+ # 1. Get the list of benchmarks for the selected category
371
+ benchmark_names = tag_map.get(category_name, [])
372
+
373
+ if not benchmark_names:
374
+ gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
375
+ return
376
+
377
+ gr.Markdown("---")
378
+ gr.Markdown("## Detailed Benchmark Results")
379
+
380
+ # 2. Loop through each benchmark and create its UI components
381
+ for benchmark_name in benchmark_names:
382
+ gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
383
+
384
+ # 3. Prepare the data for this specific benchmark's table and plot
385
+ benchmark_score_col = f"{benchmark_name} Score"
386
+ benchmark_cost_col = f"{benchmark_name} Cost"
387
+
388
+ # Define the columns needed for the detailed table
389
+ table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
390
+
391
+ # Filter to only columns that actually exist in the full dataframe
392
+ existing_table_cols = [col for col in table_cols if col in full_df.columns]
393
+
394
+ if benchmark_score_col not in existing_table_cols:
395
+ gr.Markdown(f"Score data for {benchmark_name} not available.")
396
+ continue # Skip to the next benchmark if score is missing
397
+
398
+ # Create a specific DataFrame for the table view
399
+ benchmark_table_df = full_df[existing_table_cols].copy()
400
+ pareto_df = get_pareto_df(benchmark_table_df)
401
+ # Get the list of agents on the frontier. We'll use this list later.
402
+ if not pareto_df.empty and 'id' in pareto_df.columns:
403
+ pareto_agent_names = pareto_df['id'].tolist()
404
+ else:
405
+ pareto_agent_names = []
406
+ benchmark_table_df['Pareto'] = benchmark_table_df.apply(
407
+ lambda row: '📈' if row['id'] in pareto_agent_names else '',
408
+ axis=1
409
+ )
410
+
411
+ benchmark_table_df['Openness'] = benchmark_table_df['Openness'].apply(lambda x: create_svg_html(x, OPENNESS_SVG_MAP))
412
+ benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(lambda x: create_svg_html(x, TOOLING_SVG_MAP))
413
+
414
+ #Make pretty and format the LLM Base column
415
+ benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
416
+ benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
417
+
418
+ # Calculated and add "Benchmark Attempted" column
419
+ def check_benchmark_status(row):
420
+ has_score = pd.notna(row.get(benchmark_score_col))
421
+ has_cost = pd.notna(row.get(benchmark_cost_col))
422
+ if has_score and has_cost:
423
+ return "✅"
424
+ if has_score or has_cost:
425
+ return "⚠️"
426
+ return "🚫 "
427
+
428
+ # Apply the function to create the new column
429
+ benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
430
+ # Sort the DataFrame
431
+ if benchmark_score_col in benchmark_table_df.columns:
432
+ benchmark_table_df = benchmark_table_df.sort_values(
433
+ by=benchmark_score_col, ascending=False, na_position='last'
434
+ )
435
+ # 1. Format the cost and score columns
436
+ benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
437
+ benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
438
+ desired_cols_in_order = [
439
+ 'Pareto',
440
+ 'Openness',
441
+ 'Agent Tooling',
442
+ 'Agent',
443
+ 'Submitter',
444
+ 'LLM Base',
445
+ 'Attempted Benchmark',
446
+ benchmark_score_col,
447
+ benchmark_cost_col,
448
+ 'Logs'
449
+ ]
450
+ for col in desired_cols_in_order:
451
+ if col not in benchmark_table_df.columns:
452
+ benchmark_table_df[col] = pd.NA # Add as an empty column
453
+ benchmark_table_df = benchmark_table_df[desired_cols_in_order]
454
+ # Rename columns for a cleaner table display, as requested
455
+ benchmark_table_df.rename({
456
+ benchmark_score_col: 'Score',
457
+ benchmark_cost_col: 'Cost',
458
+ }, inplace=True)
459
+ # Ensure the 'Logs' column is formatted correctly
460
+ df_headers = benchmark_table_df.columns.tolist()
461
+ df_datatypes = []
462
+ for col in df_headers:
463
+ if "Logs" in col or "Cost" in col or "Score" in col:
464
+ df_datatypes.append("markdown")
465
+ elif col in ["Openness", "Agent Tooling", "LLM Base"]:
466
+ df_datatypes.append("html")
467
+ else:
468
+ df_datatypes.append("str")
469
+ # Remove Pareto, Openness, and Agent Tooling from the headers
470
+ header_rename_map = {
471
+ "Pareto": "",
472
+ "Openness": "",
473
+ "Agent Tooling": ""
474
+ }
475
+ # 2. Create the final list of headers for display.
476
+ benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
477
+ # Create the scatter plot using the full data for context, but plotting benchmark metrics
478
+ # This shows all agents on the same axis for better comparison.
479
+ benchmark_plot = _plot_scatter_plotly(
480
+ data=full_df,
481
+ x=benchmark_cost_col,
482
+ y=benchmark_score_col,
483
+ agent_col="Agent"
484
+ )
485
+ gr.Plot(value=benchmark_plot)
486
+ gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
487
+ # Put table and key into an accordion
488
+ with gr.Accordion("Details", open=True, elem_id="leaderboard-accordion"):
489
+ gr.HTML(value=legend_markdown, elem_id="legend-markdown")
490
+ gr.DataFrame(
491
+ headers=df_headers,
492
+ value=benchmark_table_df,
493
+ datatype=df_datatypes,
494
+ interactive=False,
495
+ wrap=True,
496
+ column_widths=[40, 40, 40, 200, 150, 175, 85],
497
+ elem_classes=["wrap-header-df"]
498
+ )
499
+
500
  def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
501
  """
502
  Loads and transforms the complete dataset for a given split.
 
573
  # Return the entire navigation bar as one single Gradio HTML component
574
  return gr.HTML(full_html)
575
 
576
+ def clean_llm_base_list(model_list):
 
 
 
 
 
577
  """
578
+ Cleans a list of model strings by keeping only the text after the last '/'.
579
+ For example: "models/gemini-2.5-flash-preview-05-20" becomes "gemini-2.5-flash-preview-05-20".
 
 
 
 
580
  """
581
+ # Return the original value if it's not a list, to avoid errors.
582
+ if not isinstance(model_list, list):
583
+ return model_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
 
585
+ # Use a list comprehension for a clean and efficient transformation.
586
+ return [str(item).split('/')[-1] for item in model_list]
587
 
588
+ def format_llm_base_with_html(value):
589
+ """
590
+ Formats the 'LLM Base' cell value.
591
+ If the value is a list with more than 1 element, it returns an
592
+ HTML <span> with the full list in a hover-over tooltip.
593
+ If it's a single-element list, it returns just that element.
594
+ Otherwise, it returns the original value.
595
+ """
596
+ if isinstance(value, list):
597
+ if len(value) > 1:
598
+ # Join the list items with a newline character for a clean tooltip
599
+ tooltip_text = "\n".join(map(str, value))
600
+ # Return an HTML span with the title attribute for the tooltip
601
+ return f'<span style="cursor: help;" title="{tooltip_text}">Multiple ⓘ</span>'
602
+ if len(value) == 1:
603
+ # If only one item, just return that item
604
+ return value[0]
605
+ # Return the value as-is if it's not a list or is an empty list
606
+ return value