Amber Tanaka committed · Commit ee1b999 · unverified · 1 Parent(s): 5ef407f

Asta Leaderboard First Draft (#3)

Ai2_logo_pink_padding_RGB.png ADDED
app.py CHANGED
@@ -1,644 +1,104 @@
1
- """app.py: Gradio app for the AstaBench leaderboard.
2
-
3
- Modeled after the GAIA huggingface leaderboard app.
4
-
5
- """
6
-
7
- import json
8
  import os
9
- import shutil
10
- import tarfile
11
- import tempfile
12
- from datetime import datetime, timedelta, timezone
13
- from email.utils import parseaddr
14
- from pathlib import Path
15
- from zoneinfo import ZoneInfo
16
 
17
- import gradio as gr
18
- import numpy as np
19
- import pandas as pd
20
- import requests
21
- from agenteval import (
22
- compute_summary_statistics,
23
- process_eval_logs,
24
- upload_folder_to_hf,
25
- upload_summary_to_hf,
26
- )
27
- from agenteval.models import EvalResult
28
- from agenteval.upload import sanitize_path_component
29
  from apscheduler.schedulers.background import BackgroundScheduler
30
- from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
31
- from datasets.data_files import EmptyDatasetError
32
  from huggingface_hub import HfApi
 
33
 
34
- from content import (
35
- CITATION_BUTTON_LABEL,
36
- CITATION_BUTTON_TEXT,
37
- INTRODUCTION_TEXT,
38
- SUBMISSION_TEXT,
39
- TITLE,
40
- format_error,
41
- format_log,
42
- format_warning,
43
- hf_uri_to_web_url,
44
- hyperlink,
45
- )
46
 
47
- # Should be False on spaces and True outside
48
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
49
-
50
-
51
- CONFIG_NAME = "1.0.0-dev1"
52
-
53
  IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
54
-
55
  OWNER = "allenai"
56
  PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
57
- SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions" # all raw and scored submissions (val and test)
58
- SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public" # copy scored val submissions (public for transparency - not used for rendering leaderboard)
59
- CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
60
- RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # just the summary score statistics (val and test), to be displayed on the leaderboard
61
  LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
62
-
63
- if LOCAL_DEBUG:
64
- DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
65
- else:
66
- DATA_DIR = "/home/user/data/" + CONFIG_NAME
67
- EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
68
-
69
  api = HfApi()
70
-
71
- # max upload size of 100MB
72
- MAX_UPLOAD_BYTES = 100 * 1024**2
73
-
74
- AGENTEVAL_MANIFEST_NAME = "agenteval.json"
75
-
76
-
77
- os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
78
-
79
-
80
- def try_load_dataset(*args, **kwargs) -> DatasetDict:
81
- try:
82
- return load_dataset(*args, **kwargs)
83
- except EmptyDatasetError:
84
- return DatasetDict()
85
- except ValueError:
86
- return DatasetDict()
87
-
88
-
89
- def pretty_column_name(col: str) -> str:
90
- """Map any raw column name to its display name."""
91
- # text columns
92
- if col == "submit_time":
93
- return "Submission date"
94
- elif col == "agent_name":
95
- return "Agent"
96
- elif col == "agent_description":
97
- return "Agent description"
98
- elif col == "username":
99
- return "User/organization"
100
- elif col == "logs_url":
101
- return "Logs"
102
- # cost → $
103
- if col.endswith("/cost"):
104
- return "$"
105
- # stderr → CI
106
- elif col.endswith("/cost_stderr") or col.endswith("/score_stderr"):
107
- return "CI"
108
- # overall score
109
- elif col == "overall/score":
110
- return "Overall"
111
- # any other score → its tag/task name
112
- elif col.endswith("/score"):
113
- return col.split("/")[1]
114
- # fallback to unchanged
115
- return col
116
-
117
-
118
- def get_dataframe_from_results(eval_results: DatasetDict, split: str):
119
- local_df = eval_results.get(split)
120
- # return default if split is missing or contains no records
121
- if local_df is None or len(local_df) == 0:
122
- default_raw_cols = [
123
- "agent_name",
124
- "agent_description",
125
- "username",
126
- "submit_time",
127
- ]
128
- pretty_cols = [pretty_column_name(c) for c in default_raw_cols]
129
- return pd.DataFrame({col: ["No data"] for col in pretty_cols})
130
-
131
- # Use the first suite_config for all rows
132
- # because the suite_config should not change given a single CONFIG_NAME
133
- first_suite_config = None
134
- if len(local_df) > 0:
135
- first_suite_config = EvalResult.model_validate(local_df[0]).suite_config
136
-
137
- def extract_scores(eval_res: EvalResult) -> dict[str, float | None]:
138
- summary_stats = compute_summary_statistics(
139
- suite_config=first_suite_config,
140
- split=split,
141
- results=eval_res.results,
142
- )
143
-
144
- values: dict[str, float | None] = {}
145
- for key in summary_stats:
146
- if key == "overall":
147
- values["overall/score"] = summary_stats[key].score
148
- values["overall/cost"] = summary_stats[key].cost
149
- elif key.startswith("tag/"):
150
- tag = key.split("/")[1]
151
- values[f"tag/{tag}/score"] = summary_stats[key].score
152
- values[f"tag/{tag}/cost"] = summary_stats[key].cost
153
- elif key.startswith("task/"):
154
- task = key.split("/")[1]
155
- values[f"task/{task}/score"] = summary_stats[key].score
156
- values[f"task/{task}/score_stderr"] = summary_stats[key].score_stderr
157
- values[f"task/{task}/cost"] = summary_stats[key].cost
158
- values[f"task/{task}/cost_stderr"] = summary_stats[key].cost_stderr
159
- return values
160
-
161
- def format_row(row) -> dict[str, float | str | None]:
162
- eval_res = EvalResult.model_validate(row)
163
- sub = eval_res.submission
164
- sub.submit_time = sub.submit_time or datetime(1970, 1, 1, 0, 0, 0)
165
- data = {
166
- "submit_time": sub.submit_time.astimezone(ZoneInfo("US/Pacific")).strftime(
167
- "%Y-%m-%d"
168
- ),
169
- "agent_name": (
170
- hyperlink(sub.agent_url, sub.agent_name)
171
- if sub.agent_url
172
- else sub.agent_name
173
- ),
174
- "agent_description": sub.agent_description or "",
175
- "username": sub.username or "",
176
- **extract_scores(eval_res),
177
- "logs_url": (
178
- hyperlink(
179
- hf_uri_to_web_url(
180
- sub.logs_url if IS_INTERNAL else sub.logs_url_public
181
- ),
182
- "🔗",
183
- )
184
- if (sub.logs_url or sub.logs_url_public)
185
- else ""
186
- ),
187
- }
188
- return data
189
-
190
- local_df = local_df.map(format_row)
191
-
192
- df = pd.DataFrame(local_df)
193
-
194
- # Multiply score, cost, and stderr values by 100 and round to 1 decimal
195
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
196
- df[numeric_cols] = df[numeric_cols].multiply(100).round(1)
197
-
198
- # Build column order on raw names, then rename via pretty_column_name
199
- all_cols = df.columns.tolist()
200
- base = ["agent_name", "agent_description", "username"]
201
- overall = ["overall/score", "overall/cost"]
202
- tags = sorted({c.split("/")[1] for c in all_cols if c.startswith("tag/")})
203
- tasks = sorted({c.split("/")[1] for c in all_cols if c.startswith("task/")})
204
- rest = ["submit_time", "logs_url"]
205
- column_order = (
206
- base
207
- + overall
208
- + [col for tag in tags for col in (f"tag/{tag}/score", f"tag/{tag}/cost")]
209
- + [
210
- col
211
- for t in tasks
212
- for col in (
213
- f"task/{t}/score",
214
- f"task/{t}/score_stderr",
215
- f"task/{t}/cost",
216
- f"task/{t}/cost_stderr",
217
- )
218
- ]
219
- + rest
220
- )
221
- df = df.reindex(columns=[c for c in column_order if c in all_cols])
222
- # sort by overall score (descending)
223
- df = df.sort_values(by=["overall/score"], ascending=False)
224
- # apply all renames via pretty_column_name
225
- orig_cols = df.columns.tolist()
226
- df.columns = [pretty_column_name(col) for col in orig_cols]
227
-
228
- # blank out any null/NaN cells
229
- df = df.fillna("")
230
-
231
- return df
232
-
233
-
234
- def load_and_format_dataframes():
235
- eval_results = try_load_dataset(
236
- RESULTS_DATASET,
237
- CONFIG_NAME,
238
- download_mode="force_redownload",
239
- verification_mode=VerificationMode.NO_CHECKS,
240
- trust_remote_code=True,
241
- )
242
- eval_dataframe_val = get_dataframe_from_results(
243
- eval_results=eval_results, split="validation"
244
- )
245
- eval_dataframe_test = get_dataframe_from_results(
246
- eval_results=eval_results, split="test"
247
- )
248
- return eval_results, eval_dataframe_val, eval_dataframe_test
249
-
250
-
251
- # Display the results
252
- eval_results, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
253
-
254
-
255
- def restart_space():
256
- api.restart_space(repo_id=LEADERBOARD_PATH)
257
-
258
-
259
- def checked_upload_folder(
260
- api,
261
- folder_path: str,
262
- repo_id: str,
263
- config_name: str,
264
- split: str,
265
- submission_name: str,
266
- ) -> str:
267
- """Upload with inline size check; raises ValueError if too large."""
268
- total = 0
269
- for root, _, files in os.walk(folder_path):
270
- for f in files:
271
- total += os.path.getsize(os.path.join(root, f))
272
- if total > MAX_UPLOAD_BYTES:
273
- raise ValueError(
274
- f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
275
- )
276
- # NOTE: This function raises ValueError if unsafe characters are found in the path.
277
- return upload_folder_to_hf(
278
- api=api,
279
- folder_path=folder_path,
280
- repo_id=repo_id,
281
- config_name=config_name,
282
- split=split,
283
- submission_name=submission_name,
284
- )
285
-
286
-
287
- def add_new_eval(
288
- val_or_test: str,
289
- agent_name: str | None,
290
- agent_description: str,
291
- agent_url: str,
292
- path_to_file: tempfile._TemporaryFileWrapper | None,
293
- username: str,
294
- mail: str,
295
- profile: gr.OAuthProfile,
296
- ):
297
- # default username if none provided
298
- if not username or username.strip() == "":
299
- username = profile.username
300
-
301
- if not agent_name:
302
- return format_warning("Please provide an agent name.")
303
-
304
- submission_time = datetime.now(timezone.utc)
305
-
306
- # Was the profile created less than 2 month ago?
307
- user_data = requests.get(
308
- f"https://huggingface.co/api/users/{profile.username}/overview"
309
- )
310
- creation_date = json.loads(user_data.content)["createdAt"]
311
-
312
- created_at = datetime.strptime(creation_date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
313
- tzinfo=timezone.utc
314
- )
315
- if submission_time - created_at < timedelta(days=60):
316
- return format_error("This account is not authorized to submit here.")
317
-
318
- contact_infos = try_load_dataset(
319
- CONTACT_DATASET,
320
- CONFIG_NAME,
321
- download_mode="force_redownload",
322
- verification_mode=VerificationMode.NO_CHECKS,
323
- trust_remote_code=True,
324
- )
325
- user_submission_dates = sorted(
326
- datetime.fromisoformat(row["submit_time"])
327
- for row in contact_infos.get(val_or_test, [])
328
- if row["username_auth"] == profile.username
329
- )
330
- if len(user_submission_dates) > 0 and abs(
331
- submission_time - user_submission_dates[-1]
332
- ) < timedelta(seconds=24 * 60 * 60):
333
- return format_error(
334
- "You already submitted once in the last 24h; please try again later."
335
- )
336
-
337
- is_validation = val_or_test == "validation"
338
-
339
- # Very basic email parsing
340
- _, parsed_mail = parseaddr(mail)
341
- if "@" not in parsed_mail:
342
- return format_warning("Please provide a valid email adress.")
343
-
344
- # Check duplicate submissions by inspecting the nested "submission" dicts
345
- if val_or_test in eval_results and len(eval_results[val_or_test]) > 0:
346
- existing = eval_results[val_or_test]
347
- subs = existing.to_dict().get("submission", [])
348
- names = {item.get("agent_name", "").lower() for item in subs}
349
- users = {item.get("username", "").lower() for item in subs}
350
- if agent_name.lower() in names and username.lower() in users:
351
- return format_warning("This agent has been already submitted.")
352
-
353
- if path_to_file is None:
354
- return format_warning("Please attach a file.")
355
-
356
- # sanitize username and agent_name for filesystem
357
- safe_username = sanitize_path_component(username)
358
- safe_agent_name = sanitize_path_component(agent_name)
359
-
360
- extracted_dir = os.path.join(
361
- EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}"
362
- )
363
-
364
- if LOCAL_DEBUG:
365
- print("mock extracted file", flush=True)
366
- else:
367
- try:
368
- # 1) remove old extraction if present
369
- if os.path.exists(extracted_dir):
370
- shutil.rmtree(extracted_dir)
371
- os.makedirs(extracted_dir, exist_ok=True)
372
-
373
- # 2) securely extract only regular files, flatten structure
374
- # Flatten structure to aid finding the manifest agenteval.json file
375
- # and because hierarchical structure is not needed
376
- with tarfile.open(path_to_file.name, "r:gz") as tar:
377
- for member in tar.getmembers():
378
- if not member.isreg():
379
- continue
380
- fname = os.path.basename(member.name)
381
- # skip empty or hidden
382
- if not fname or fname.startswith("."):
383
- continue
384
- fobj = tar.extractfile(member)
385
- if not fobj:
386
- continue
387
- target = os.path.join(extracted_dir, fname)
388
- with open(target, "wb") as out:
389
- out.write(fobj.read())
390
-
391
- # 3) ensure something was extracted
392
- if not os.listdir(extracted_dir):
393
- return format_error("Submission tarball is empty or invalid.")
394
-
395
- except Exception as e:
396
- return format_error(
397
- f"Error while extracting the file: {e}. Be sure to upload a valid .tar.gz file."
398
- )
399
-
400
- submission_name = (
401
- f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d')}"
402
  )
 
403
 
404
- # SAVE UNSCORED SUBMISSION
405
- if LOCAL_DEBUG:
406
- print("mock uploaded submission", flush=True)
407
- else:
408
- try:
409
- checked_upload_folder(
410
- api=api,
411
- folder_path=extracted_dir,
412
- repo_id=SUBMISSION_DATASET,
413
- config_name=CONFIG_NAME,
414
- split=val_or_test,
415
- submission_name=submission_name,
416
- )
417
- except ValueError as e:
418
- return format_error(str(e))
419
-
420
- # SAVE CONTACT
421
- contact_info = {
422
- "agent_name": agent_name,
423
- "agent_description": agent_description,
424
- "url": agent_url,
425
- "username": username,
426
- "username_auth": profile.username,
427
- "mail": mail,
428
- "submit_time": submission_time.isoformat(),
429
- }
430
- # add or init contact dataset for this split
431
- if val_or_test in contact_infos:
432
- contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
433
- else:
434
- contact_infos[val_or_test] = Dataset.from_list([contact_info])
435
- if LOCAL_DEBUG:
436
- print("mock uploaded contact info", flush=True)
437
- else:
438
- contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
439
-
440
  try:
441
- json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
442
- if not json_path.exists():
443
- return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME}")
444
- raw = json_path.read_text(encoding="utf-8")
445
- eval_result = EvalResult.model_validate_json(raw)
446
- if eval_result.suite_config.version != CONFIG_NAME:
447
- return format_error(
448
- f"Error: submitted suite version {eval_result.suite_config.version} "
449
- f"does not match currently accepted version {CONFIG_NAME}"
450
- )
451
- if eval_result.split != val_or_test:
452
- return format_error(
453
- f"Error: uploaded split {eval_result.split} does not match selected split {val_or_test}"
454
- )
455
-
456
- # NOTE: Trusting user-computed scores, but re-computing the derived results based on the log files
457
- eval_result.results = process_eval_logs(extracted_dir)[0]
458
- eval_result.save_json(str(json_path))
459
-
460
  except Exception as e:
461
- return format_error(
462
- f"Error while scoring the submission: {e}. Be sure to upload a valid submission."
463
- )
 
464
 
465
- # # SAVE SCORED SUBMISSION
466
- if LOCAL_DEBUG:
467
- print("mock uploaded scored submission")
468
- else:
469
- try:
470
- logs_url_private = checked_upload_folder(
471
- api=api,
472
- folder_path=extracted_dir,
473
- repo_id=SUBMISSION_DATASET,
474
- config_name=CONFIG_NAME,
475
- split=val_or_test,
476
- submission_name=f"{submission_name}_scored",
477
- )
478
- except ValueError as e:
479
- return format_error(str(e))
480
-
481
- # Validation submissions are public for public leaderboard
482
- if is_validation and not IS_INTERNAL:
483
- try:
484
- logs_url_public = checked_upload_folder(
485
- api=api,
486
- folder_path=extracted_dir,
487
- repo_id=SUBMISSION_DATASET_PUBLIC,
488
- config_name=CONFIG_NAME,
489
- split=val_or_test,
490
- submission_name=f"{submission_name}_scored",
491
- )
492
- except ValueError as e:
493
- return format_error(str(e))
494
- else:
495
- logs_url_public = None
496
-
497
- eval_result.submission.agent_name = agent_name
498
- eval_result.submission.agent_description = agent_description
499
- eval_result.submission.agent_url = agent_url
500
- eval_result.submission.username = username
501
- eval_result.submission.submit_time = submission_time
502
- eval_result.submission.logs_url = logs_url_private
503
- eval_result.submission.logs_url_public = logs_url_public
504
 
 
 
505
  if LOCAL_DEBUG:
506
- print("mock uploaded results to lb")
 
 
507
  else:
508
- upload_summary_to_hf(
509
- api=api,
510
- eval_result=eval_result,
511
- repo_id=RESULTS_DATASET,
512
- config_name=CONFIG_NAME,
513
- split=val_or_test,
514
- submission_name=f"{submission_name}_scored",
515
- )
516
-
517
- return format_log(
518
- f"Agent {agent_name} submitted by {username} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed."
519
- )
520
-
521
-
522
- def refresh():
523
- _, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
524
- return eval_dataframe_val, eval_dataframe_test
525
-
526
-
527
- # Determine column types dynamically based on dataframe columns
528
- def compute_column_types(df):
529
- col_types = []
530
- for col in df.columns:
531
- if col == "Agent":
532
- col_types.append("markdown")
533
- elif col in ["Agent description", "User/organization", "Submission date"]:
534
- col_types.append("str")
535
- elif col == "Logs":
536
- col_types.append("markdown")
537
- else:
538
- col_types.append("number")
539
- return col_types
540
-
541
-
542
- test_col_types = compute_column_types(eval_dataframe_test)
543
- val_col_types = compute_column_types(eval_dataframe_val)
544
-
545
- demo = gr.Blocks()
546
- with demo:
547
- gr.HTML(TITLE)
548
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
549
-
550
- with gr.Row():
551
- with gr.Accordion("📙 Citation", open=False):
552
- citation_button = gr.Textbox(
553
- value=CITATION_BUTTON_TEXT,
554
- label=CITATION_BUTTON_LABEL,
555
- elem_id="citation-button",
556
- ) # .style(show_copy_button=True)
557
-
558
- leaderboard_table_test = gr.Dataframe(
559
- value=eval_dataframe_test,
560
- headers=list(eval_dataframe_test.columns),
561
- datatype=test_col_types,
562
- interactive=False,
563
- column_widths=["20%"],
564
- render=False,
565
- )
566
-
567
- leaderboard_table_val = gr.Dataframe(
568
- value=eval_dataframe_val,
569
- headers=list(eval_dataframe_val.columns),
570
- datatype=val_col_types,
571
- interactive=False,
572
- column_widths=["20%"],
573
- render=False,
574
- )
575
-
576
- # Build tab layout list based on desired order
577
- tabs = [
578
- ("Results: Test", leaderboard_table_test),
579
- ("Results: Validation", leaderboard_table_val),
580
- ]
581
-
582
- if IS_INTERNAL:
583
- tabs = [tabs[1], tabs[0]] # Validation first for internal users
584
-
585
- # Render the tabs in desired order
586
- for label, component in tabs:
587
- with gr.Tab(label):
588
- component.render()
589
-
590
- refresh_button = gr.Button("Refresh")
591
- refresh_button.click(
592
- refresh,
593
- inputs=[],
594
- outputs=[
595
- leaderboard_table_val,
596
- leaderboard_table_test,
597
- ],
598
- )
599
- with gr.Accordion("Submit a new agent for evaluation"):
600
- with gr.Row():
601
- gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
602
- with gr.Row():
603
- with gr.Column():
604
- level_of_test = gr.Radio(
605
- ["validation", "test"], value="validation", label="Split"
606
- )
607
- agent_name_textbox = gr.Textbox(label="Agent name")
608
- agent_description_textbox = gr.Textbox(label="Agent description")
609
- agent_url_textbox = gr.Textbox(label="Url to agent information")
610
- with gr.Column():
611
- username = gr.Textbox(
612
- label="Organization or user name (defaults to your HF username)",
613
- placeholder="Leave blank to use your HF username",
614
- )
615
- mail = gr.Textbox(
616
- label="Contact email (will be stored privately, & used if there is an issue with your submission)"
617
- )
618
- file_output = gr.File()
619
-
620
- with gr.Row():
621
- gr.LoginButton()
622
- submit_button = gr.Button("Submit Eval")
623
- submission_result = gr.Markdown()
624
- submit_button.click(
625
- add_new_eval,
626
- [
627
- level_of_test,
628
- agent_name_textbox,
629
- agent_description_textbox,
630
- agent_url_textbox,
631
- file_output,
632
- username,
633
- mail,
634
- ],
635
- submission_result,
636
- )
637
 
638
- scheduler = BackgroundScheduler()
639
- scheduler.add_job(restart_space, "interval", seconds=3600)
640
- scheduler.start()
641
- if LOCAL_DEBUG:
642
- demo.launch(debug=True)
643
- else:
644
- demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
 
1
+ # app.py
2
+ import gradio as gr
 
 
 
 
 
3
  import os
4
5
  from apscheduler.schedulers.background import BackgroundScheduler
 
 
6
  from huggingface_hub import HfApi
7
+ import literature_understanding, main_page, c_and_e, data_analysis, e2e
8
 
9
+ from content import TITLE, css
10
 
11
+ # --- Constants and Configuration ---
12
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
 
 
 
 
13
  IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
 
14
  OWNER = "allenai"
15
  PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
 
 
 
 
16
  LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
17
  api = HfApi()
18
+ LOGO_PATH = "Ai2_logo_pink_padding_RGB.png"
19
+
20
+
21
+
22
+ # --- Theme Definition ---
23
+ theme = gr.themes.Base(
24
+ primary_hue=gr.themes.Color(c100="#CFF5E8", c200="#B7EFDD", c300="#9FEAD1", c400="#87E5C5", c50="#E7FAF3", c500="#6FE0BA", c600="#57DBAF", c700="#3FD5A3", c800="#27D09C", c900="#0FCB8C", c950="#0fcb8c"),
25
+ secondary_hue=gr.themes.Color(c100="#FCDCEB", c200="#FBCBE1", c300="#F9BAD7", c400="#F7A8CD", c50="#FDEEF5", c500="#F697C4", c600="#F586BA", c700="#F375B0", c800="#F263A6", c900="#F0529C", c950="#F0529C"),
26
+ neutral_hue=gr.themes.Color(c100="#FDF9F4", c200="#C9C9C3", c300="#B0B5AF", c400="#97A09C", c50="#FAF2E9", c500="#7F8C89", c600="#667876", c700="#344F4F", c800="#1C3A3C", c900="#032629", c950="#032629"),
27
+ font=[gr.themes.GoogleFont('Manrope'), 'ui-sans-serif', 'sans-serif', 'sans-serif'],
28
+ font_mono=[gr.themes.GoogleFont('Roboto Mono'), 'ui-monospace', 'monospace', 'monospace'],
29
+ ).set(
30
+ body_text_color='*neutral_950',
31
+ body_text_color_dark='*neutral_50',
32
+ background_fill_primary='*neutral_50',
33
+ background_fill_primary_dark='*neutral_900',
34
+ background_fill_secondary='*neutral_100',
35
+ background_fill_secondary_dark='*neutral_800',
36
+ border_color_accent='*secondary_900',
37
+ border_color_accent_subdued='*neutral_400',
38
+ border_color_accent_subdued_dark='*neutral_400',
39
+ color_accent='*primary_900',
40
+ color_accent_soft='*neutral_200',
41
+ color_accent_soft_dark='*neutral_800',
42
+ link_text_color='*secondary_900',
43
+ link_text_color_dark='*primary_900',
44
+ link_text_color_active_dark='*primary_600',
45
+ link_text_color_hover_dark='*primary_700',
46
+ link_text_color_visited_dark='*primary_600',
47
+ table_even_background_fill='*neutral_100',
48
+ table_even_background_fill_dark='*neutral_800',
49
+ button_primary_background_fill='*secondary_900',
50
+ button_primary_background_fill_dark='*primary_900',
51
+ button_primary_background_fill_hover='*secondary_600',
52
+ button_primary_background_fill_hover_dark='*primary_600',
53
+ button_primary_text_color='*neutral_900',
54
+ button_primary_text_color_dark='*neutral_900'
55
+ )
56
+ # --- Gradio App Definition ---
57
+ demo = gr.Blocks(theme=theme, css=css)
58
+ with demo:
59
+ gr.Image(
60
+ value=LOGO_PATH,
61
+ show_label=False,
62
+ interactive=False,
63
+ container=False,
64
+ show_download_button=False,
65
+ show_fullscreen_button=False,
66
+ elem_id="logo-image"
67
  )
68
+ gr.HTML(TITLE)
69
 
70
+ main_page.demo.render()
71
+ with demo.route("Literature Understanding"):
72
+ literature_understanding.demo.render()
73
+ with demo.route("Code & Execution"):
74
+ c_and_e.demo.render()
75
+ with demo.route("Data Analysis"):
76
+ data_analysis.demo.render()
77
+ with demo.route("Discovery"):
78
+ e2e.demo.render()
79
+
80
+ # --- Scheduler and Launch ---
81
+ def restart_space_job():
82
+ print("Scheduler: Attempting to restart space.")
83
  try:
84
+ api.restart_space(repo_id=LEADERBOARD_PATH)
85
+ print("Scheduler: Space restart request sent.")
86
  except Exception as e:
87
+ print(f"Scheduler: Error restarting space: {e}")
88
+ scheduler = BackgroundScheduler(timezone="UTC")
89
+ scheduler.add_job(restart_space_job, "interval", hours=1)
90
+ scheduler.start()
91
92
 
93
+ # Launch the Gradio app
94
+ if __name__ == "__main__":
95
  if LOCAL_DEBUG:
96
+ print("Launching in LOCAL_DEBUG mode.")
97
+ def get_initial_global_tag_choices(): return ["Overall", "TagA"]
98
+ demo.launch(debug=True)
99
  else:
100
+ print("Launching in Space mode.")
101
+ # For Spaces, share=False is typical unless specific tunneling is needed.
102
+ # debug=True can be set to False for a "production" Space.
103
+ demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False)
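Each page module imported at the top of the new app.py (main_page, literature_understanding, c_and_e, data_analysis, e2e) is expected to expose a module-level gr.Blocks object named demo that app.py renders into its own route. A minimal sketch of such a page module is shown below; the module name and table contents are illustrative only and are not part of this commit:

# example_page.py -- hypothetical page module following the same pattern (not in this commit)
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## Example Leaderboard Page")
    # Placeholder table; the real pages build their tables via ui_components helpers.
    gr.Dataframe(
        value=[["agent-a", 62.2], ["agent-b", 48.9]],
        headers=["Agent", "Overall Score"],
        interactive=False,
    )

# app.py would then mount it with:
#   with demo.route("Example"):
#       example_page.demo.render()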
c_and_e.py ADDED
@@ -0,0 +1,54 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
+
7
+ # Define the category for this page
8
+ CATEGORY_NAME = "Code Execution"
9
+
10
+ with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
12
+
13
+ # --- This page now has two main sections: Validation and Test ---
14
+ with gr.Tabs():
15
+ with gr.Tab("Results: Validation"):
16
+ # 1. Load all necessary data for the "validation" split ONCE.
17
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
18
+
19
+ if not validation_df.empty:
20
+ # 2. Render the main category display using the loaded data.
21
+ create_leaderboard_display(
22
+ full_df=validation_df,
23
+ tag_map=validation_tag_map,
24
+ category_name=CATEGORY_NAME,
25
+ split_name="validation"
26
+ )
27
+
28
+ # 3. Render the detailed breakdown for each benchmark in the category.
29
+ create_benchmark_details_display(
30
+ full_df=validation_df,
31
+ tag_map=validation_tag_map,
32
+ category_name=CATEGORY_NAME
33
+ )
34
+ else:
35
+ gr.Markdown("No data available for validation split.")
36
+
37
+ with gr.Tab("Results: Test"):
38
+ # Repeat the process for the "test" split
39
+ test_df, test_tag_map = get_full_leaderboard_data("test")
40
+
41
+ if not test_df.empty:
42
+ create_leaderboard_display(
43
+ full_df=test_df,
44
+ tag_map=test_tag_map,
45
+ category_name=CATEGORY_NAME,
46
+ split_name="test"
47
+ )
48
+ create_benchmark_details_display(
49
+ full_df=test_df,
50
+ tag_map=test_tag_map,
51
+ category_name=CATEGORY_NAME
52
+ )
53
+ else:
54
+ gr.Markdown("No data available for test split.")
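The category pages added in this commit (c_and_e.py, data_analysis.py, e2e.py) all import create_leaderboard_display, create_benchmark_details_display, and get_full_leaderboard_data from a ui_components module that is not included in this diff. The stub below sketches the interface those pages assume; the signatures and behavior are inferred from the call sites, not taken from the actual module:

# ui_components.py -- assumed interface only (the real module is not part of this commit)
import gradio as gr
import pandas as pd


def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """Return the full results table for a split plus a mapping of category tag -> task names."""
    # Stub: the real loader presumably reads the results dataset or local manifest for the split.
    return pd.DataFrame(), {}


def create_leaderboard_display(full_df: pd.DataFrame, tag_map: dict, category_name: str, split_name: str) -> None:
    """Render the category-level leaderboard table for one split."""
    gr.Dataframe(value=full_df, interactive=False)


def create_benchmark_details_display(full_df: pd.DataFrame, tag_map: dict, category_name: str) -> None:
    """Render a per-benchmark breakdown for the tasks grouped under the category."""
    gr.Markdown(f"Per-benchmark details for {category_name}")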
content.py CHANGED
@@ -3,6 +3,14 @@ TITLE = """<h1 align="center" id="space-title">AstaBench Leaderboard</h1>"""
3
  INTRODUCTION_TEXT = """
4
  ## Introduction
5
  """
6
 
7
  SUBMISSION_TEXT = """
8
  ## Submissions
@@ -32,8 +40,11 @@ def format_log(msg):
32
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
33
 
34
 
35
- def hyperlink(link, text):
36
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{text}</a>'
 
 
 
37
 
38
 
39
  def hf_uri_to_web_url(uri: str) -> str:
@@ -53,3 +64,56 @@ def hf_uri_to_web_url(uri: str) -> str:
53
 
54
  namespace, repo, path = parts
55
  return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
3
  INTRODUCTION_TEXT = """
4
  ## Introduction
5
  """
6
+ INTRO_PARAGRAPH = """
7
+ AI agents are on the rise, promising everything from travel planning to scientific discovery. But evaluating them—especially for real-world research tasks—remains a messy, inconsistent process. Metrics vary, cost is often ignored, and scientific use cases are rarely the focus. <br>
8
+ <br>
9
+ Enter AstaBench, a grand challenge benchmark developed by Ai2 to test how well agentic AI systems perform on scientific tasks that actually matter. As part of the Asta initiative, AstaBench spans ten multi-step benchmarks covering literature review, data analysis, code execution, and complex decision-making. It brings standardization and transparency to agent evaluation, with statistical confidence reporting and a leaderboard that highlights tradeoffs between accuracy and computational cost.
10
+ """
11
+ SCATTER_DISCLAIMER = """
12
+ Only agents that have cost data available will be shown in the scatter plot. If you don't see your agent, please ensure that you have provided cost data in your submission.
13
+ """
14
 
15
  SUBMISSION_TEXT = """
16
  ## Submissions
 
40
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
41
 
42
 
43
+ def hyperlink(link_url: str, text: str = "🔗") -> str:
44
+ if not link_url or not isinstance(link_url, str):
45
+ return str(text) # Or simply "" if link_url is bad
46
+ # Using a simpler style here for broad compatibility, your original style is fine too.
47
+ return f'<a target="_blank" href="{link_url}">{text}</a>'
48
 
49
 
50
  def hf_uri_to_web_url(uri: str) -> str:
 
64
 
65
  namespace, repo, path = parts
66
  return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
67
+
68
+ css = """
69
+ .submission-accordion {
70
+ border-style: solid;
71
+ border-width: 3px !important;
72
+ border-color: #ec4899;
73
+ }
74
+ .submission-accordion span.svelte-1w6vloh {
75
+ font-weight: bold !important;
76
+ font-size: 1.2em !important;
77
+ }
78
+ #logo-image {
79
+ margin: auto;
80
+ max-width: 250px;
81
+ height: auto;
82
+ }
83
+ .table-component{
84
+ height: auto !important;
85
+ max-height: none !important;
86
+ }
87
+
88
+ .table-wrap {
89
+ max-height: none !important;
90
+ height: auto !important;
91
+ overflow-y: visible !important;
92
+ }
93
+ /* --- New Rules for Table Density --- */
94
+ table.gr-table th, table.gr-table td {
95
+ padding: 4px 4px !important;
96
+ width: 1%;
97
+ white-space: nowrap;
98
+ }
99
+
100
+ table.gr-table {
101
+ font-size: 14px !important;
102
+ }
103
+
104
+ /* Example of making the "Agent" column (the 1st column) a bit wider if needed */
105
+ table.gr-table th:nth-child(1),
106
+ table.gr-table td:nth-child(1) {
107
+ min-width: 150px !important;
108
+ white-space: normal !important; /* Allow agent names to wrap if long */
109
+ }
110
+ .html-container {
111
+ padding-top: 0 !important;
112
+ }
113
+ #scatter-disclaimer {
114
+ color: #f0529c !important;
115
+ }
116
+ thead.svelte-1e98i6s th {
117
+ background: white !important;
118
+ }
119
+ """
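For reference, the two helpers touched in this file compose as follows when building a Logs cell. The dataset path below is illustrative, and the expected output assumes the hf:// URI splits cleanly into namespace, repo, and path:

from content import hf_uri_to_web_url, hyperlink

logs_uri = "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/example_run"
web_url = hf_uri_to_web_url(logs_uri)
# roughly: https://huggingface.co/datasets/allenai/asta-bench-submissions/tree/main/1.0.0-dev1/validation/example_run
cell = hyperlink(web_url)  # default link text is the 🔗 emoji
# -> '<a target="_blank" href="...">🔗</a>'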
data/1.0.0-dev1/agenteval.json ADDED
@@ -0,0 +1,332 @@
1
+ {
2
+ "suite_config": {
3
+ "name": "asta-bench",
4
+ "version": "1.0.0-dev1",
5
+ "splits": [
6
+ {
7
+ "name": "validation",
8
+ "tasks": [
9
+ {
10
+ "name": "arxivdigestables_validation",
11
+ "path": "astabench/arxivdigestables_validation",
12
+ "primary_metric": "score_tables/mean",
13
+ "tags": [
14
+ "lit"
15
+ ]
16
+ },
17
+ {
18
+ "name": "sqa_dev",
19
+ "path": "astabench/sqa_dev",
20
+ "primary_metric": "global_avg/mean",
21
+ "tags": [
22
+ "lit"
23
+ ]
24
+ },
25
+ {
26
+ "name": "litqa2_validation",
27
+ "path": "astabench/litqa2_validation",
28
+ "primary_metric": "is_correct/accuracy",
29
+ "tags": [
30
+ "lit"
31
+ ]
32
+ },
33
+ {
34
+ "name": "paper_finder_validation",
35
+ "path": "astabench/paper_finder_validation",
36
+ "primary_metric": "score_paper_finder/macro_avg",
37
+ "tags": [
38
+ "lit"
39
+ ]
40
+ },
41
+ {
42
+ "name": "discoverybench_validation",
43
+ "path": "astabench/discoverybench_validation",
44
+ "primary_metric": "score_discoverybench/mean",
45
+ "tags": [
46
+ "data"
47
+ ]
48
+ },
49
+ {
50
+ "name": "core_bench_validation",
51
+ "path": "astabench/core_bench_validation",
52
+ "primary_metric": "evaluate_task_questions/accuracy",
53
+ "tags": [
54
+ "code"
55
+ ]
56
+ },
57
+ {
58
+ "name": "ds1000_validation",
59
+ "path": "astabench/ds1000_validation",
60
+ "primary_metric": "ds1000_scorer/accuracy",
61
+ "tags": [
62
+ "code"
63
+ ]
64
+ },
65
+ {
66
+ "name": "e2e_discovery_validation",
67
+ "path": "astabench/e2e_discovery_validation",
68
+ "primary_metric": "score_rubric/accuracy",
69
+ "tags": [
70
+ "discovery"
71
+ ]
72
+ },
73
+ {
74
+ "name": "super_validation",
75
+ "path": "astabench/super_validation",
76
+ "primary_metric": "check_super_execution/entrypoints",
77
+ "tags": [
78
+ "code"
79
+ ]
80
+ }
81
+ ]
82
+ },
83
+ {
84
+ "name": "test",
85
+ "tasks": [
86
+ {
87
+ "name": "paper_finder_test",
88
+ "path": "astabench/paper_finder_test",
89
+ "primary_metric": "score_paper_finder/macro_avg",
90
+ "tags": [
91
+ "lit"
92
+ ]
93
+ },
94
+ {
95
+ "name": "sqa_test",
96
+ "path": "astabench/sqa_test",
97
+ "primary_metric": "global_avg/mean",
98
+ "tags": [
99
+ "lit"
100
+ ]
101
+ },
102
+ {
103
+ "name": "arxivdigestables_test",
104
+ "path": "astabench/arxivdigestables_test",
105
+ "primary_metric": "score_tables/mean",
106
+ "tags": [
107
+ "lit"
108
+ ]
109
+ },
110
+ {
111
+ "name": "litqa2_test",
112
+ "path": "astabench/litqa2_test",
113
+ "primary_metric": "is_correct/accuracy",
114
+ "tags": [
115
+ "lit"
116
+ ]
117
+ },
118
+ {
119
+ "name": "discoverybench_test",
120
+ "path": "astabench/discoverybench_test",
121
+ "primary_metric": "score_discoverybench/mean",
122
+ "tags": [
123
+ "data"
124
+ ]
125
+ },
126
+ {
127
+ "name": "core_bench_test",
128
+ "path": "astabench/core_bench_test",
129
+ "primary_metric": "evaluate_task_questions/accuracy",
130
+ "tags": [
131
+ "code"
132
+ ]
133
+ },
134
+ {
135
+ "name": "ds1000_test",
136
+ "path": "astabench/ds1000_test",
137
+ "primary_metric": "ds1000_scorer/accuracy",
138
+ "tags": [
139
+ "code"
140
+ ]
141
+ },
142
+ {
143
+ "name": "e2e_discovery_test",
144
+ "path": "astabench/e2e_discovery_test",
145
+ "primary_metric": "score_rubric/accuracy",
146
+ "tags": [
147
+ "discovery"
148
+ ]
149
+ },
150
+ {
151
+ "name": "super_test",
152
+ "path": "astabench/super_test",
153
+ "primary_metric": "check_super_execution/entrypoints",
154
+ "tags": [
155
+ "code"
156
+ ]
157
+ }
158
+ ]
159
+ }
160
+ ]
161
+ },
162
+ "split": "validation",
163
+ "results": [
164
+ {
165
+ "task_name": "sqa_dev",
166
+ "metrics": [
167
+ {
168
+ "name": "global_avg/mean",
169
+ "value": 0.6215245045241414
170
+ },
171
+ {
172
+ "name": "global_avg/stderr",
173
+ "value": 0.02088486499225903
174
+ },
175
+ {
176
+ "name": "ingredient_recall/mean",
177
+ "value": 0.6029178145087237
178
+ },
179
+ {
180
+ "name": "ingredient_recall/stderr",
181
+ "value": 0.026215888361291618
182
+ },
183
+ {
184
+ "name": "answer_precision/mean",
185
+ "value": 0.7960436785436785
186
+ },
187
+ {
188
+ "name": "answer_precision/stderr",
189
+ "value": 0.027692773517249983
190
+ },
191
+ {
192
+ "name": "citation_precision/mean",
193
+ "value": 0.697849041353826
194
+ },
195
+ {
196
+ "name": "citation_precision/stderr",
197
+ "value": 0.026784164936602798
198
+ },
199
+ {
200
+ "name": "citation_recall/mean",
201
+ "value": 0.3892874836903378
202
+ },
203
+ {
204
+ "name": "citation_recall/stderr",
205
+ "value": 0.015094770200171756
206
+ }
207
+ ],
208
+ "model_costs": [
209
+ 1.3829150000000001,
210
+ 0.9759700000000001,
211
+ 2.2324650000000004,
212
+ 0.76631,
213
+ 0.9277900000000001,
214
+ 2.6388600000000006,
215
+ 0.8114100000000002,
216
+ 2.3263174999999996,
217
+ 2.5423725,
218
+ 1.2398675000000001,
219
+ 1.7387300000000003,
220
+ 1.2176599999999997,
221
+ 0.564655,
222
+ 0.9726750000000001,
223
+ 0.7675700000000001,
224
+ 1.5198850000000002,
225
+ 1.4726625000000002,
226
+ 2.1937650000000004,
227
+ 0.6907700000000001,
228
+ 1.39835,
229
+ 1.2598175,
230
+ 2.5373550000000002,
231
+ 2.19239,
232
+ 1.2508875000000006,
233
+ 2.2650550000000007,
234
+ 1.6047725,
235
+ 0.6525125000000003,
236
+ 1.4262200000000003,
237
+ 1.0533299999999999,
238
+ 1.7252375,
239
+ 1.407145,
240
+ 1.5408700000000004,
241
+ 2.8073224999999993,
242
+ 1.0448125000000006,
243
+ 1.7037300000000004,
244
+ 0.8650500000000001,
245
+ 1.0171225000000002,
246
+ 0.5697925000000001,
247
+ 2.7851025,
248
+ 1.0551425,
249
+ 2.9213775,
250
+ 1.7772975000000004,
251
+ 1.2753225000000001,
252
+ 0.8108325000000001,
253
+ 0.6958375000000001,
254
+ 0.8840950000000003,
255
+ 1.2028724999999998,
256
+ 1.2490475000000003,
257
+ 2.4272,
258
+ 1.95026,
259
+ 1.5352475,
260
+ 2.11181,
261
+ 2.3612249999999997,
262
+ 1.8619225000000004,
263
+ 0.7431075000000001,
264
+ 1.5189675000000002,
265
+ 1.089575,
266
+ 1.6103700000000003,
267
+ 1.4201450000000002,
268
+ 2.397835,
269
+ 1.469175,
270
+ 1.0723550000000004,
271
+ 0.7964050000000003,
272
+ 3.3733175,
273
+ 4.197085,
274
+ 4.2637675,
275
+ 1.2982124999999998,
276
+ 0.66146,
277
+ 1.1130475000000002,
278
+ 2.4393974999999997,
279
+ 2.582,
280
+ 1.7381725000000001,
281
+ 0.415025,
282
+ 1.6777325,
283
+ 1.0507825000000002,
284
+ 2.4627125000000003,
285
+ 1.017005,
286
+ 1.9210250000000002,
287
+ 1.5009025000000003,
288
+ 0.8283125000000001,
289
+ 2.9854425,
290
+ 0.4633375000000001,
291
+ 0.397685,
292
+ 1.2803425,
293
+ 3.0388200000000003,
294
+ 1.2610875000000004,
295
+ 1.798365,
296
+ 3.427287500000001,
297
+ 0.29307750000000005,
298
+ 0.37101249999999997,
299
+ 2.8046925000000003,
300
+ 0.35557000000000005,
301
+ 3.5481700000000007,
302
+ 1.1073975,
303
+ 1.5280825,
304
+ 1.1714900000000001,
305
+ 3.1791275000000003,
306
+ 3.8214725000000005,
307
+ 1.8440275,
308
+ 1.730515,
309
+ 1.9350675000000002,
310
+ 1.6592125000000002,
311
+ 1.9227124999999998,
312
+ 1.202885,
313
+ 1.2688150000000002,
314
+ 0.8819875000000001,
315
+ 0.6989325,
316
+ 1.965635,
317
+ 1.7467800000000002,
318
+ 1.6940625000000002
319
+ ]
320
+ }
321
+ ],
322
+ "submission": {
323
+ "submit_time": "2025-06-09T20:55:35.869831Z",
324
+ "username": "miked-ai",
325
+ "agent_name": "Basic ReAct",
326
+ "agent_description": null,
327
+ "agent_url": null,
328
+ "logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35",
329
+ "logs_url_public": null,
330
+ "summary_url": null
331
+ }
332
+ }
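This manifest is the local sample that json_leaderboard.py's DataTransformer reads. A minimal sketch of loading and validating it, assuming the path is relative to the repository root:

import json

from agenteval.models import EvalResult

with open("data/1.0.0-dev1/agenteval.json", encoding="utf-8") as f:
    eval_result = EvalResult.model_validate(json.load(f))

print(eval_result.suite_config.version)    # "1.0.0-dev1"
print(eval_result.split)                   # "validation"
print(eval_result.submission.agent_name)   # "Basic ReAct"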
data_analysis.py ADDED
@@ -0,0 +1,54 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
+
7
+ # Define the category for this page
8
+ CATEGORY_NAME = "Data Analysis"
9
+
10
+ with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
12
+
13
+ # --- This page now has two main sections: Validation and Test ---
14
+ with gr.Tabs():
15
+ with gr.Tab("Results: Validation"):
16
+ # 1. Load all necessary data for the "validation" split ONCE.
17
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
18
+
19
+ if not validation_df.empty:
20
+ # 2. Render the main category display using the loaded data.
21
+ create_leaderboard_display(
22
+ full_df=validation_df,
23
+ tag_map=validation_tag_map,
24
+ category_name=CATEGORY_NAME,
25
+ split_name="validation"
26
+ )
27
+
28
+ # 3. Render the detailed breakdown for each benchmark in the category.
29
+ create_benchmark_details_display(
30
+ full_df=validation_df,
31
+ tag_map=validation_tag_map,
32
+ category_name=CATEGORY_NAME
33
+ )
34
+ else:
35
+ gr.Markdown("No data available for validation split.")
36
+
37
+ with gr.Tab("Results: Test"):
38
+ # Repeat the process for the "test" split
39
+ test_df, test_tag_map = get_full_leaderboard_data("test")
40
+
41
+ if not test_df.empty:
42
+ create_leaderboard_display(
43
+ full_df=test_df,
44
+ tag_map=test_tag_map,
45
+ category_name=CATEGORY_NAME,
46
+ split_name="test"
47
+ )
48
+ create_benchmark_details_display(
49
+ full_df=test_df,
50
+ tag_map=test_tag_map,
51
+ category_name=CATEGORY_NAME
52
+ )
53
+ else:
54
+ gr.Markdown("No data available for test split.")
e2e.py ADDED
@@ -0,0 +1,54 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
+
7
+ # Define the category for this page
8
+ CATEGORY_NAME = "Discovery"
9
+
10
+ with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
12
+
13
+ # --- This page now has two main sections: Validation and Test ---
14
+ with gr.Tabs():
15
+ with gr.Tab("Results: Validation"):
16
+ # 1. Load all necessary data for the "validation" split ONCE.
17
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
18
+
19
+ if not validation_df.empty:
20
+ # 2. Render the main category display using the loaded data.
21
+ create_leaderboard_display(
22
+ full_df=validation_df,
23
+ tag_map=validation_tag_map,
24
+ category_name=CATEGORY_NAME,
25
+ split_name="validation"
26
+ )
27
+
28
+ # 3. Render the detailed breakdown for each benchmark in the category.
29
+ create_benchmark_details_display(
30
+ full_df=validation_df,
31
+ tag_map=validation_tag_map,
32
+ category_name=CATEGORY_NAME
33
+ )
34
+ else:
35
+ gr.Markdown("No data available for validation split.")
36
+
37
+ with gr.Tab("Results: Test"):
38
+ # Repeat the process for the "test" split
39
+ test_df, test_tag_map = get_full_leaderboard_data("test")
40
+
41
+ if not test_df.empty:
42
+ create_leaderboard_display(
43
+ full_df=test_df,
44
+ tag_map=test_tag_map,
45
+ category_name=CATEGORY_NAME,
46
+ split_name="test"
47
+ )
48
+ create_benchmark_details_display(
49
+ full_df=test_df,
50
+ tag_map=test_tag_map,
51
+ category_name=CATEGORY_NAME
52
+ )
53
+ else:
54
+ gr.Markdown("No data available for test split.")
json_leaderboard.py ADDED
@@ -0,0 +1,485 @@
1
+ import logging
2
+ from typing import Optional, Any, Dict # Added Dict
3
+ from zoneinfo import ZoneInfo
4
+
5
+ # datasets import might not be strictly needed by LeaderboardViewer itself anymore,
6
+ # but _get_dataframe might still use types from it if EvalResult refers to them.
7
+ # For now, let's keep it if your EvalResult or SuiteConfig models have dependencies.
8
+ # If not, it can be removed from here.
9
+ import datasets # Potentially removable from this file
10
+ import matplotlib.pyplot as plt
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ import numpy as np
14
+ import pandas as pd
15
+ import seaborn as sns
16
+ import json # For loading the local JSON file
17
+ import os # For checking file existence
18
+
19
+ from agenteval import compute_summary_statistics
20
+ from agenteval.config import SuiteConfig
21
+ from agenteval.models import EvalResult
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ import logging
26
+ from typing import Optional, Any, Dict, List # Added List
27
+ from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
28
+ import json
29
+ import os
30
+
31
+ # Assuming these are correctly imported from your project
32
+ from agenteval.config import SuiteConfig
33
+ from agenteval.models import EvalResult
34
+ # from agenteval import compute_summary_statistics # Used by _get_dataframe
35
+
36
+
37
+ class DataTransformer:
38
+ """
39
+ Load and visualize leaderboard from a single, local JSON result file.
40
+ """
41
+ _INFORMAL_TO_FORMAL_NAME_MAP = {
42
+ "lit": "Literature Understanding",
43
+ "data": "Data Analysis",
44
+ "code": "Code Execution",
45
+ "discovery": "Discovery",
46
+ "arxivdigestables_validation": "Arxivdigestables Validation",
47
+ "sqa_dev": "Sqa Dev",
48
+ "litqa2_validation": "Litqa2 Validation",
49
+ "paper_finder_validation": "Paper Finder Validation",
50
+ "discoverybench_validation": "Discoverybench Validation",
51
+ "core_bench_validation": "Core Bench Validation",
52
+ "ds1000_validation": "DS1000 Validation",
53
+ "e2e_discovery_validation": "E2E Discovery Validation",
54
+ "super_validation": "Super Validation",
55
+ # Add any other raw names that can appear in task.name or task.tags
56
+ }
57
+
58
+ def __init__(
59
+ self,
60
+ json_file_path: str, # Mandatory: path to the local JSON file
61
+ split: str, # Still needed for context within the JSON's suite_config
62
+ is_internal: bool = False
63
+ ):
64
+ self._json_file_path = json_file_path
65
+ self._split = split
66
+ self._internal = is_internal
67
+ self._loaded_json_data: Optional[Dict[str, Any]] = None
68
+ self._cfg: Optional[SuiteConfig] = None
69
+
70
+ logger.info(f"Initializing DataTransformer with local JSON file: {self._json_file_path}")
71
+
72
+ # --- Load and Validate JSON data ---
73
+ if not os.path.exists(self._json_file_path):
74
+ raise FileNotFoundError(f"JSON file not found at path: {self._json_file_path}")
75
+ try:
76
+ with open(self._json_file_path, 'r', encoding='utf-8') as f:
77
+ self._loaded_json_data = json.load(f)
78
+ except json.JSONDecodeError as e:
79
+ raise ValueError(f"Failed to parse JSON from local file {self._json_file_path}: {e}")
80
+ except Exception as e:
81
+ raise ValueError(f"Error reading local file {self._json_file_path}: {e}")
82
+
83
+ if not self._loaded_json_data:
84
+ raise ValueError(f"No data loaded from JSON file {self._json_file_path}.")
85
+
86
+ try:
87
+ eval_result = EvalResult.model_validate(self._loaded_json_data)
88
+ except Exception as e:
89
+ raise ValueError(f"Failed to validate JSON data from file '{self._json_file_path}' against EvalResult model: {e}")
90
+
91
+ self._cfg = eval_result.suite_config
92
+ if not isinstance(self._cfg, SuiteConfig):
93
+ raise TypeError(f"self._cfg is not a SuiteConfig object after loading from '{self._json_file_path}', got {type(self._cfg)}.")
94
+
95
+ # --- Populate Tag Map (Corrected Placement and Helper Function Access) ---
96
+ self.tag_map: dict[str, list[str]] = {}
97
+
98
+ # Access tasks from the loaded config
99
+ tasks_for_split: List[Any] = self._cfg.get_tasks(self._split) # Assuming get_tasks returns a list of task-like objects
100
+
101
+ for task in tasks_for_split:
102
+ # Ensure task object has 'name' and 'tags' attributes
103
+ if not hasattr(task, 'name') or not hasattr(task, 'tags'):
104
+ logger.warning(f"Task object {task} is missing 'name' or 'tags' attribute. Skipping.")
105
+ continue
106
+
107
+ formal_task_display_name = self._get_formal_display_name_static(task.name) # Use the helper method
108
+
109
+ if not (task.tags or []):
110
+ continue
111
+
112
+ for raw_tag_name in task.tags:
113
+ formal_tag_display_name_key = self._get_formal_display_name_static(raw_tag_name)
114
+
115
+ self.tag_map.setdefault(formal_tag_display_name_key, []).append(formal_task_display_name)
116
+
117
+ for key in self.tag_map:
118
+ self.tag_map[key] = sorted(list(set(self.tag_map[key])))
119
+
120
+ # --- Helper function defined as a static method or regular method ---
121
+ # Option 1: Static method (doesn't need 'self', uses the class attribute)
122
+ @staticmethod
123
+ def _get_formal_display_name_static(raw_name: str) -> str:
124
+ """
125
+ Helper function to get the formal display name for a raw tag or task name.
126
+ Uses the class's map and provides a fallback.
127
+ """
128
+ return DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP.get(raw_name, raw_name.replace("_", " ").title())
129
+
130
+ def _load(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
131
+ """
132
+ Prepares the DataFrame from the loaded JSON data.
133
+ The JSON data is already loaded and validated in __init__.
134
+ """
135
+ if self._loaded_json_data is None or self._cfg is None:
136
+ # This should not happen if __init__ completed successfully
137
+ raise RuntimeError("DataTransformer not properly initialized. JSON data or SuiteConfig is missing.")
138
+
139
+ # The _get_dataframe function expects a list of records.
140
+ # Since we have a single JSON file representing one result, wrap it in a list.
141
+ records_list: list[dict] = [self._loaded_json_data]
142
+
143
+ overview_df = _get_dataframe(
144
+ records_list=records_list,
145
+ split=self._split,
146
+ is_internal=self._internal,
147
+ suite_config=self._cfg, # Pass the SuiteConfig loaded in __init__
148
+ )
149
+ return overview_df, self.tag_map
150
+
151
+ # --- view method remains the same as your last version ---
152
+ def view(
153
+ self,
154
+ tag: Optional[str] = None,
155
+ with_plots: bool = False,
156
+ use_plotly: bool = False,
157
+ ) -> tuple[pd.DataFrame, dict[str, Any]]:
158
+ data, tag_map = self._load() # tag_map is also returned by _load now
159
+ logger.debug(f"Loaded tag_map: {tag_map}")
160
+ logger.debug(f"Leaderboard data columns: {data.columns}")
161
+ if data.empty or (len(data) == 1 and data.iloc[0].get("Agent") == "No data"):
162
+ logger.warning("No data available to view. Returning empty DataFrame and plots.")
163
+ return data, {}
164
+
165
+ base_cols = ["Agent", "Submitter", "Date", "Logs"]
166
+ existing_cols = [col for col in base_cols if col in data.columns]
167
+
168
+ primary_score_col: str
169
+ group_metric_names: list[str]
170
+
171
+ if tag is None:
172
+ primary = "Overall"
173
+ group = list(tag_map.keys())
174
+ else:
175
+ primary = tag
176
+ group = tag_map.get(tag, [])
177
+
178
+ if f"{primary} Score" in data.columns:
179
+ data = data.sort_values(f"{primary} Score", ascending=False)
180
+ else:
181
+ logger.warning(f"Primary metric '{primary}' for sorting not found. Data will not be sorted by it.")
182
+
183
+ metrics_to_display = []
184
+ if f"{primary} Cost" in data.columns:
185
+ metrics_to_display.append(f"{primary} Cost")
186
+ if f"{primary} Score" in data.columns:
187
+ metrics_to_display.append(f"{primary} Score")
188
+
189
+ for g_item in group:
190
+ if g_item in data.columns:
191
+ metrics_to_display.append(g_item)
192
+ if f"{g_item} Cost" in data.columns:
193
+ metrics_to_display.append(f"{g_item} Cost")
194
+ if f"{g_item} Score" in data.columns:
195
+ metrics_to_display.append(f"{g_item} Score")
196
+
197
+
198
+ final_cols_to_display = existing_cols + [m for m in metrics_to_display if m in data.columns]
199
+ final_cols_to_display = sorted(list(set(final_cols_to_display)), key=final_cols_to_display.index)
200
+
201
+ df_view = data.loc[:, final_cols_to_display].reset_index(drop=True)
202
+
203
+ plots: dict[str, Any] = {}
204
+ if with_plots:
205
+ plot_metric_names = [primary] + [g_item for g_item in group if g_item in data.columns]
206
+ for metric_name in plot_metric_names:
207
+ score_col = f"{metric_name} Score"
208
+ cost_col = f"{metric_name} Cost"
209
+ if score_col in df_view.columns and cost_col in df_view.columns:
210
+ if use_plotly:
211
+ fig = _plot_scatter_plotly(df_view, x=cost_col, y=score_col, agent_col="Agent")
212
+ plots[f"scatter_{metric_name}"] = fig
213
+ else:
214
+ logger.warning(
215
+ f"Skipping plot for '{metric_name}': score column '{score_col}' or cost column '{cost_col}' not found."
216
+ )
217
+ return df_view, plots
218
+
219
+
220
+ def _safe_round(value, digits=2):
221
+ return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
222
+
223
+ def _get_dataframe(
224
+ records_list: list[dict],
225
+ split: str,
226
+ is_internal: bool,
227
+ suite_config: SuiteConfig,
228
+ timezone: str = "US/Pacific",
229
+ ) -> pd.DataFrame:
230
+ # This function remains the same as in the previous version you provided.
231
+ # It takes a list of records (which will be a list containing one item
232
+ # from the loaded JSON file) and processes it.
233
+ if not records_list:
234
+ logger.warning(f"No records provided to _get_dataframe for split '{split}'. Returning empty DataFrame with placeholder.")
235
+ expected_pretty_cols = ["Agent Name", "Submitter", "Date", "Overall Score", "Logs"]
236
+ empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
237
+ return empty_df
238
+
239
+ cfg = suite_config
240
+
241
+ rows = []
242
+ for itm_idx, itm in enumerate(records_list):
243
+ if not isinstance(itm, dict):
244
+ logger.warning(f"Item {itm_idx} in records_list is not a dict, skipping.")
245
+ continue
246
+ try:
247
+ ev = EvalResult.model_validate(itm)
248
+ except Exception as e:
249
+ logger.error(f"Failed to validate item {itm_idx} with EvalResult: {itm}. Error: {e}")
250
+ continue
251
+
252
+ sub = ev.submission
253
+ date_str = None
254
+ if sub.submit_time is not None:
255
+ submit_dt = sub.submit_time
256
+ if not isinstance(submit_dt, pd.Timestamp):
257
+ if submit_dt.tzinfo is None:
258
+ logger.debug(f"Submission time for {sub.agent_name} is timezone-naive, assuming UTC.")
259
+ submit_dt = submit_dt.replace(tzinfo=ZoneInfo("UTC"))
260
+ date_str = pd.Timestamp(submit_dt).tz_convert(ZoneInfo(timezone)).strftime("%Y-%m-%d")
261
+ else:
262
+ date_str = None
263
+
264
+ if not ev.results:
265
+ logger.warning(
266
+ f"Skipping submission {sub.agent_name} ({sub.username or 'N/A'}) "
267
+ f"({sub.submit_time or 'N/A'}) due to no results."
268
+ )
269
+ continue
270
+ stats = compute_summary_statistics(
271
+ suite_config=cfg, split=split, results=ev.results
272
+ )
273
+ flat = {}
274
+ logger.debug(f"Summary statistics for {sub.agent_name}: {stats}")
275
+ for key, s_obj in stats.items():
276
+ parts = key.split("/")
277
+ if parts[0] == "overall":
278
+ flat["overall/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
279
+ flat["overall/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
280
+ elif parts[0] == "tag" and len(parts) > 1:
281
+ tag_name = parts[1]
282
+ flat[f"tag/{tag_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
283
+ flat[f"tag/{tag_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
284
+ elif parts[0] == "task" and len(parts) > 1:
285
+ task_name = parts[1]
286
+ score = getattr(s_obj, 'score', np.nan)
287
+ cost = getattr(s_obj, 'cost', np.nan)
288
+ score_stderr = getattr(s_obj, 'score_stderr', np.nan)
289
+ cost_stderr = getattr(s_obj, 'cost_stderr', np.nan)
290
+
291
+ flat[f"task/{task_name}/score"] = _safe_round(score)
292
+ flat[f"task/{task_name}/score_ci"] = _safe_round(score_stderr * 1.96 if pd.notna(score_stderr) else np.nan)
293
+ flat[f"task/{task_name}/cost"] = _safe_round(cost)
294
+ flat[f"task/{task_name}/cost_ci"] = _safe_round(cost_stderr * 1.96 if pd.notna(cost_stderr) else np.nan)
295
+ else:
296
+ logger.debug(f"Uncommon key structure from compute_summary_statistics: '{key}'. Attempting generic add.")
297
+ if hasattr(s_obj, 'score'):
298
+ flat[f"{key}/score"] = _safe_round(s_obj.score)
299
+ if hasattr(s_obj, 'cost'):
300
+ flat[f"{key}/cost"] = _safe_round(s_obj.cost)
301
+
302
+ current_logs_url = None
303
+ if is_internal and sub.logs_url:
304
+ current_logs_url = str(sub.logs_url)
305
+ elif not is_internal and sub.logs_url_public:
306
+ current_logs_url = str(sub.logs_url_public)
307
+
308
+ rows.append(
309
+ {
310
+ "agent_name": sub.agent_name or "N/A",
311
+ "username": sub.username or "N/A",
312
+ "submit_time": date_str,
313
+ **flat,
314
+ "logs_url": current_logs_url,
315
+ }
316
+ )
317
+
318
+ if not rows:
319
+ logger.warning(f"No valid rows generated from records_list for split '{split}'. Returning empty DataFrame with placeholder.")
320
+ expected_pretty_cols = ["Agent", "Submitter", "Date", "Overall Score", "Overall Cost", "Logs"]
321
+ empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
322
+ return empty_df
323
+
324
+ df = pd.DataFrame(rows)
325
+ pretty_cols = {c: _pretty_column_name(c) for c in df.columns if c in df.columns}
326
+ overview = df.rename(columns=pretty_cols)
327
+ return overview
328
+
329
+ def _pretty_column_name(col: str) -> str:
330
+ """Map raw column name to display name."""
331
+ # --- Step 1: Fixed, direct mappings ---
332
+ fixed_mappings = {
333
+ "submit_time": "Date",
334
+ "agent_name": "Agent",
335
+ "username": "Submitter",
336
+ "logs_url": "Logs",
337
+ "overall/score": "Overall Score",
338
+ "overall/cost": "Overall Cost",
339
+ }
340
+ if col in fixed_mappings:
341
+ return fixed_mappings[col]
342
+
343
+ # --- Step 2: Define your mapping for informal names to descriptive names ---
344
+ informal_map = DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP
345
+
346
+ # --- Step 3: Dynamic mappings for task or tag columns using the informal_to_formal_name_map ---
347
+ parts = col.split("/")
348
+ if len(parts) == 3:
349
+ item_type, informal_name, metric_suffix = parts
350
+
351
+ formal_name = informal_map.get(informal_name)
352
+ if formal_name is None:
353
+ formal_name = informal_name.replace("_", " ").title()
354
+ logger.debug(f"Informal name '{informal_name}' not in map, using fallback: '{formal_name}'")
355
+
356
+ if metric_suffix == "score":
357
+ return f"{formal_name} Score"
358
+ if metric_suffix == "cost":
359
+ return f"{formal_name} Cost"
360
+ if metric_suffix == "score_ci":
361
+ return f"{formal_name} Score 95% CI"
362
+ if metric_suffix == "cost_ci":
363
+ return f"{formal_name} Cost 95% CI"
364
+
365
+ # --- Step 4: Fallback for columns that don't match the "type/name/metric" pattern ---
366
+ if "/" not in col:
367
+ return col.replace("_", " ").title()
368
+ else:
369
+ return parts[-1].replace("_", " ").title()
370
+
371
+ DEFAULT_Y_COLUMN = "Overall Score"
372
+ DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 # Value to use if x-axis data (costs) is missing
373
+
374
+ def _plot_scatter_plotly(
375
+ data: pd.DataFrame,
376
+ x: Optional[str],
377
+ y: str,
378
+ agent_col: str = "Agent"
379
+ ) -> go.Figure:
380
+
381
+ x_col_to_use = x
382
+ y_col_to_use = y
383
+
384
+ # 1. Check if y-column exists
385
+ if y_col_to_use not in data.columns:
386
+ logger.error(
387
+ f"y-axis column '{y_col_to_use}' MUST exist in DataFrame. "
388
+ f"Cannot generate plot. Available columns: {data.columns.tolist()}"
389
+ )
390
+ return go.Figure()
391
+
392
+ # 2. Check if agent_col exists
393
+ if agent_col not in data.columns:
394
+ logger.warning(
395
+ f"Agent column '{agent_col}' not found in DataFrame. "
396
+ f"Available columns: {data.columns.tolist()}. Returning empty figure."
397
+ )
398
+ return go.Figure()
399
+
400
+ # 3. Prepare data (make a copy, handle numeric conversion for y)
401
+ data_plot = data.copy()
402
+ try:
403
+ data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
404
+ except Exception as e:
405
+ logger.error(f"Error converting y-column '{y_col_to_use}' to numeric: {e}. Returning empty figure.")
406
+ return go.Figure()
407
+
408
+ # 4. Handle x-column (costs)
409
+ x_axis_label = x_col_to_use if x_col_to_use else "Cost (Data N/A)" # Label for the x-axis
410
+ x_data_is_valid = False
411
+
412
+ if x_col_to_use and x_col_to_use in data_plot.columns:
413
+ try:
414
+ data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
415
+ # Check if there's any non-NaN data after coercion for x
416
+ if data_plot[x_col_to_use].notna().any():
417
+ x_data_is_valid = True
418
+ else:
419
+ logger.info(f"x-axis column '{x_col_to_use}' exists but contains all NaN/None values after numeric conversion.")
420
+ except Exception as e:
421
+ logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}. Will use dummy x-values.")
422
+ # x_data_is_valid remains False
423
+ else:
424
+ if x_col_to_use: # Name was provided but column doesn't exist
425
+ logger.warning(f"x-axis column '{x_col_to_use}' not found in DataFrame.")
426
+ else: # x (column name) was None
427
+ logger.info("x-axis column name was not provided (is None).")
428
+
429
+ if not x_data_is_valid:
430
+ logger.info(f"Using dummy x-value '{DUMMY_X_VALUE_FOR_MISSING_COSTS}' for all data points as x-data is missing or invalid.")
431
+ # Create a new column with the dummy x-value for all rows
432
+ # Use a unique name for this dummy column to avoid potential clashes
433
+ dummy_x_col_name = "__dummy_x_for_plotting__"
434
+ data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
435
+ x_col_to_use = dummy_x_col_name # Update x_col_to_use to point to our dummy data
436
+ x_axis_label = x if x else "Cost (Data N/A)" # Use original x name for label if provided
437
+ # or a generic label if x was None.
438
+ # Could also be f"Cost (Fixed at {DUMMY_X_VALUE_FOR_MISSING_COSTS})"
439
+
440
+
441
+ # 5. Drop rows where y is NaN (x is now guaranteed to have values, either real or dummy)
442
+ data_plot.dropna(subset=[y_col_to_use], inplace=True)
443
+
444
+ fig = go.Figure()
445
+
446
+ if data_plot.empty:
447
+ logger.warning(f"No valid data to plot for y='{y_col_to_use}' (and x='{x_col_to_use}') after cleaning NaNs from y.")
448
+ # Still return a figure object, but it will be empty. Update layout for clarity.
449
+ fig.update_layout(
450
+ title=f"{y_col_to_use} vs. {x_axis_label} (No Data)",
451
+ xaxis=dict(title=x_axis_label, range=[DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1] if not x_data_is_valid else None),
452
+ yaxis=dict(title=y_col_to_use)
453
+ )
454
+ return fig
455
+
456
+
457
+ for agent, group in data_plot.groupby(agent_col):
458
+ hover_x_display = "%{x:.2f}" if x_data_is_valid else str(DUMMY_X_VALUE_FOR_MISSING_COSTS) + " (fixed)"
459
+ fig.add_trace(go.Scatter(
460
+ x=group[x_col_to_use],
461
+ y=group[y_col_to_use],
462
+ mode='markers',
463
+ name=str(agent),
464
+ hovertemplate=f"{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}<extra>{str(agent)}</extra>",
465
+ marker=dict(size=10)
466
+ ))
467
+
468
+ # Configure layout
469
+ xaxis_config = dict(title=x_axis_label)
470
+ if not x_data_is_valid: # If using dummy x, set a tighter, fixed range for x-axis
471
+ xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
472
+ xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS] # Show only one tick at the dummy value
473
+ xaxis_config['ticktext'] = [str(DUMMY_X_VALUE_FOR_MISSING_COSTS)]
474
+ else: # Real x-data
475
+ xaxis_config['rangemode'] = "tozero"
476
+
477
+
478
+ fig.update_layout(
479
+ title=f"{y_col_to_use} vs. {x_axis_label}",
480
+ xaxis=xaxis_config,
481
+ yaxis=dict(title=y_col_to_use, rangemode="tozero"),
482
+ legend_title_text=agent_col
483
+ )
484
+
485
+ return fig
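
To make the dummy-x fallback above concrete, here is a minimal sketch (not part of the commit) that calls the plotting helper with no usable cost data; the column names are illustrative:

import pandas as pd

demo = pd.DataFrame({
    "Agent": ["agent-a", "agent-b"],
    "Overall Score": [0.42, 0.35],
    # no "Overall Cost" column, so x-data is missing
})
fig = _plot_scatter_plotly(demo, x="Overall Cost", y="Overall Score", agent_col="Agent")
# Every marker is drawn at the fixed dummy x-value (0) and the x-axis collapses to a single tick.
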
leaderboard_transformer.py ADDED
@@ -0,0 +1,436 @@
1
+ import plotly.graph_objects as go
2
+ import numpy as np
3
+ import pandas as pd
4
+ import logging
5
+ from typing import Optional, Any, Dict, List # Added List
6
+ from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
7
+ import json
8
+ import os
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ INFORMAL_TO_FORMAL_NAME_MAP = {
13
+ # Short Names
14
+ "lit": "Literature Understanding",
15
+ "data": "Data Analysis",
16
+ "code": "Code Execution",
17
+ "discovery": "Discovery",
18
+
19
+ # Long Raw Names
20
+ "arxivdigestables_validation": "Arxivdigestables Validation",
21
+ "sqa_dev": "Sqa Dev",
22
+ "litqa2_validation": "Litqa2 Validation",
23
+ "paper_finder_validation": "Paper Finder Validation",
24
+ "discoverybench_validation": "Discoverybench Validation",
25
+ "core_bench_validation": "Core Bench Validation",
26
+ "ds1000_validation": "DS1000 Validation",
27
+ "e2e_discovery_validation": "E2E Discovery Validation",
28
+ "super_validation": "Super Validation",
29
+ }
30
+
31
+
32
+ ### 2. The Updated Helper Functions ###
33
+
34
+ def _safe_round(value, digits=2):
35
+ """Rounds a number if it's a valid float/int, otherwise returns it as is."""
36
+ return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
37
+
38
+
39
+ def _pretty_column_name(raw_col: str) -> str:
40
+ """
41
+ Takes a raw column name from the DataFrame and returns a "pretty" version.
42
+ Handles three cases:
43
+ 1. Fixed names (e.g., 'User/organization' -> 'Submitter').
44
+ 2. Dynamic names (e.g., 'ds1000_validation score' -> 'DS1000 Validation Score').
45
+ 3. Fallback for any other names.
46
+ """
47
+ # Case 1: Handle fixed, special-case mappings first.
48
+ fixed_mappings = {
49
+ 'Agent': 'Agent',
50
+ 'Agent description': 'Agent Description',
51
+ 'User/organization': 'Submitter',
52
+ 'Submission date': 'Date',
53
+ 'Overall': 'Overall Score',
54
+ 'Overall cost': 'Overall Cost',
55
+ 'Logs': 'Logs'
56
+ }
57
+ if raw_col in fixed_mappings:
58
+ return fixed_mappings[raw_col]
59
+
60
+ # Case 2: Handle dynamic names by finding the longest matching base name.
61
+ # We sort by length (desc) to match 'core_bench_validation' before 'core_bench'.
62
+ sorted_base_names = sorted(INFORMAL_TO_FORMAL_NAME_MAP.keys(), key=len, reverse=True)
63
+
64
+ for base_name in sorted_base_names:
65
+ if raw_col.startswith(base_name):
66
+ formal_name = INFORMAL_TO_FORMAL_NAME_MAP[base_name]
67
+
68
+ # Get the metric part (e.g., ' score' or ' cost 95% CI')
69
+ metric_part = raw_col[len(base_name):].strip()
70
+
71
+ # Capitalize the metric part correctly (e.g., 'score' -> 'Score')
72
+ pretty_metric = metric_part.capitalize()
73
+
74
+ return f"{formal_name} {pretty_metric}"
75
+
76
+ # Case 3: If no specific rule applies, just make it title case.
77
+ return raw_col.title()
78
+
79
+
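
A few illustrative mappings (a sketch, not part of the commit), assuming the helper and name map defined above:

assert _pretty_column_name("User/organization") == "Submitter"
assert _pretty_column_name("ds1000_validation score") == "DS1000 Validation Score"
assert _pretty_column_name("lit cost") == "Literature Understanding Cost"
assert _pretty_column_name("some_other_field") == "Some_Other_Field"  # fallback: title case
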
80
+ def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
81
+ """
82
+ Converts a tag map with raw names into a tag map with pretty, formal names.
83
+
84
+ Args:
85
+ raw_tag_map: The map with raw keys and values (e.g., {'lit': ['litqa2_validation']}).
86
+ name_map: The INFORMAL_TO_FORMAL_NAME_MAP used for translation.
87
+
88
+ Returns:
89
+ A new dictionary with pretty names (e.g., {'Literature Understanding': ['Litqa2 Validation']}).
90
+ """
91
+ pretty_map = {}
92
+ # A reverse map to find raw keys from formal names if needed, though not used here
93
+ # This is just for understanding; the main logic uses the forward map.
94
+
95
+ # Helper to get pretty name with a fallback
96
+ def get_pretty(raw_name):
97
+ return name_map.get(raw_name, raw_name.replace("_", " ").title())
98
+
99
+ for raw_key, raw_value_list in raw_tag_map.items():
100
+ pretty_key = get_pretty(raw_key)
101
+ pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
102
+ pretty_map[pretty_key] = sorted(list(set(pretty_value_list)))
103
+
104
+ return pretty_map
105
+
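
For example (a sketch, not part of the commit), using the module-level name map:

raw_tags = {"lit": ["litqa2_validation", "sqa_dev"], "code": ["ds1000_validation"]}
create_pretty_tag_map(raw_tags, INFORMAL_TO_FORMAL_NAME_MAP)
# {'Literature Understanding': ['Litqa2 Validation', 'Sqa Dev'],
#  'Code Execution': ['DS1000 Validation']}
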
106
+
107
+ def transform_raw_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
108
+ """
109
+ Transforms a raw leaderboard DataFrame into a presentation-ready format.
110
+
111
+ This function performs two main actions:
112
+ 1. Rounds all numeric metric values (columns containing 'score' or 'cost').
113
+ 2. Renames all columns to a "pretty", human-readable format.
114
+ Args:
115
+ raw_df (pd.DataFrame): The DataFrame with raw data and column names
116
+ like 'agent_name', 'overall/score', 'tag/code/cost'.
117
+ Returns:
118
+ pd.DataFrame: A new DataFrame ready for display.
119
+ """
120
+ if not isinstance(raw_df, pd.DataFrame):
121
+ raise TypeError("Input 'raw_df' must be a pandas DataFrame.")
122
+
123
+ df = raw_df.copy()
124
+
125
+ # Create the mapping for pretty column names
126
+ pretty_cols_map = {col: _pretty_column_name(col) for col in df.columns}
127
+
128
+ # Rename the columns and return the new DataFrame
129
+ transformed_df = df.rename(columns=pretty_cols_map)
130
+ # Apply safe rounding to all metric columns
131
+ for col in transformed_df.columns:
132
+ if 'Score' in col or 'Cost' in col:
133
+ transformed_df[col] = transformed_df[col].apply(_safe_round)
134
+
135
+ logger.info("Raw DataFrame transformed: numbers rounded and columns renamed.")
136
+ return transformed_df
137
+
138
+
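
A small sketch of the transformation (not part of the commit), with an illustrative one-row frame:

raw = pd.DataFrame({"Agent": ["my-agent"], "sqa_dev score": [0.5123], "sqa_dev cost": [1.2345]})
transform_raw_dataframe(raw)
# Columns become "Agent", "Sqa Dev Score", "Sqa Dev Cost"; the metric values round to 0.51 and 1.23.
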
139
+ class DataTransformer:
140
+ """
141
+ Visualizes a pre-processed leaderboard DataFrame.
142
+
143
+ This class takes a "pretty" DataFrame and a tag map, and provides
144
+ methods to view filtered versions of the data and generate plots.
145
+ """
146
+ def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
147
+ """
148
+ Initializes the viewer.
149
+
150
+ Args:
151
+ dataframe (pd.DataFrame): The presentation-ready leaderboard data.
152
+ tag_map (dict): A map of formal tag names to formal task names.
153
+ """
154
+ if not isinstance(dataframe, pd.DataFrame):
155
+ raise TypeError("Input 'dataframe' must be a pandas DataFrame.")
156
+ if not isinstance(tag_map, dict):
157
+ raise TypeError("Input 'tag_map' must be a dictionary.")
158
+
159
+ self.data = dataframe
160
+ self.tag_map = tag_map
161
+ logger.info(f"DataTransformer initialized with a DataFrame of shape {self.data.shape}.")
162
+
163
+
164
+ def view(
165
+ self,
166
+ tag: Optional[str] = "Overall", # Default to "Overall" for clarity
167
+ use_plotly: bool = False,
168
+ ) -> tuple[pd.DataFrame, dict[str, go.Figure]]:
169
+ """
170
+ Generates a filtered view of the DataFrame and a corresponding scatter plot.
171
+ """
172
+ if self.data.empty:
173
+ logger.warning("No data available to view.")
174
+ return self.data, {}
175
+
176
+ # --- 1. Determine Primary and Group Metrics Based on the Tag ---
177
+ if tag is None or tag == "Overall":
178
+ primary_metric = "Overall"
179
+ group_metrics = list(self.tag_map.keys())
180
+ else:
181
+ primary_metric = tag
182
+ # For a specific tag, the group is its list of sub-tasks.
183
+ group_metrics = self.tag_map.get(tag, [])
184
+
185
+ # --- 2. Sort the DataFrame by the Primary Score ---
186
+ primary_score_col = f"{primary_metric} Score"
187
+ df_sorted = self.data
188
+ if primary_score_col in self.data.columns:
189
+ df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')
190
+
191
+ # --- 3. Build the List of Columns to Display ---
192
+ base_cols = ["Agent", "Submitter"]
193
+ new_cols = ["Openness", "Degree of Control"]
194
+ ending_cols = ["Date", "Logs"]
195
+
196
+ # Start with the primary metric score and cost
197
+ metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
198
+
199
+ # Add the score and cost for each item in our group
200
+ for item in group_metrics:
201
+ metrics_to_display.append(f"{item} Score")
202
+ metrics_to_display.append(f"{item} Cost")
203
+
204
+ # Combine base columns with metric columns, ensuring uniqueness and order
205
+ final_cols_ordered = base_cols + list(dict.fromkeys(metrics_to_display)) + new_cols + ending_cols
206
+
207
+ # Filter to only include columns that actually exist in our DataFrame
208
+ df_view = df_sorted.copy()
209
+ for col in final_cols_ordered:
210
+ if col not in df_view.columns:
211
+ df_view[col] = pd.NA
212
+
213
+ df_view = df_view[final_cols_ordered].reset_index(drop=True)
214
+
215
+ # Calculated and add "Categories Attempted" column
216
+ if primary_metric == "Overall":
217
+ def calculate_attempted(row):
218
+ main_categories = ['Literature Understanding', 'Data Analysis', 'Code Execution', 'Discovery']
219
+ count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
220
+
221
+ # Return the formatted string with the correct emoji
222
+ if count == 4:
223
+ return "4/4 ✅"
224
+ if count == 0:
225
+ return "0/4 🚫"
226
+ return f"{count}/4 ⚠️"
227
+
228
+ # Apply the function row-wise to create the new column
229
+ attempted_column = df_view.apply(calculate_attempted, axis=1)
230
+ # Insert the new column at a nice position (e.g., after "Date")
231
+ df_view.insert(2, "Categories Attempted", attempted_column)
232
+ else:
233
+ total_benchmarks = len(group_metrics)
234
+ def calculate_benchmarks_attempted(row):
235
+ # Count how many benchmarks in this category have COST data reported
236
+ count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
237
+ if count == total_benchmarks:
238
+ return f"{count}/{total_benchmarks} ✅"
239
+ elif count == 0:
240
+ return f"{count}/{total_benchmarks} 🚫"
241
+ else:
242
+ return f"{count}/{total_benchmarks} ⚠️"
243
+ # Insert the new column near the front of the table (after Agent and Submitter)
244
+ df_view.insert(2, "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
245
+
246
+
247
+ # --- 4. Generate the Scatter Plot for the Primary Metric ---
248
+ plots: dict[str, go.Figure] = {}
249
+ if use_plotly:
250
+ primary_cost_col = f"{primary_metric} Cost"
251
+ # Check if the primary score and cost columns exist in the FINAL view
252
+ if primary_score_col in df_view.columns and primary_cost_col in df_view.columns:
253
+ fig = _plot_scatter_plotly(
254
+ data=df_view,
255
+ x=primary_cost_col,
256
+ y=primary_score_col,
257
+ agent_col="Agent"
258
+ )
259
+ # Use a consistent key for easy retrieval later
260
+ plots['scatter_plot'] = fig
261
+ else:
262
+ logger.warning(
263
+ f"Skipping plot for '{primary_metric}': score column '{primary_score_col}' "
264
+ f"or cost column '{primary_cost_col}' not found."
265
+ )
266
+ # Add an empty figure to avoid downstream errors
267
+ plots['scatter_plot'] = go.Figure()
268
+ return df_view, plots
269
+
270
+ DEFAULT_Y_COLUMN = "Overall Score"
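
A usage sketch (not part of the commit); raw_df and raw_tag_map stand in for data loaded elsewhere, e.g. via LeaderboardViewer:

pretty_df = transform_raw_dataframe(raw_df)
pretty_tags = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
transformer = DataTransformer(pretty_df, pretty_tags)
table, plots = transformer.view(tag="Literature Understanding", use_plotly=True)
# 'table' is sorted by "Literature Understanding Score" and gains a "Benchmarks Attempted"
# column; plots["scatter_plot"] holds the score-vs-cost Plotly figure for that category.
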
271
+ DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
272
+
273
+ def _plot_scatter_plotly(
274
+ data: pd.DataFrame,
275
+ x: Optional[str],
276
+ y: str,
277
+ agent_col: str = "Agent"
278
+ ) -> go.Figure:
279
+
280
+ # --- Steps 1-4: Data Validation and Preparation ---
281
+ x_col_to_use = x
282
+ y_col_to_use = y
283
+
284
+ if y_col_to_use not in data.columns:
285
+ logger.error(f"y-axis column '{y_col_to_use}' not found.")
286
+ return go.Figure()
287
+ if agent_col not in data.columns:
288
+ logger.warning(f"Agent column '{agent_col}' not found.")
289
+ return go.Figure()
290
+
291
+ data_plot = data.copy()
292
+ data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
293
+
294
+ x_axis_label = x if x else "Cost (Data N/A)"
295
+ x_data_is_valid = False
296
+ if x and x in data_plot.columns:
297
+ try:
298
+ data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
299
+ if data_plot[x_col_to_use].notna().any():
300
+ x_data_is_valid = True
301
+ except Exception as e:
302
+ logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}")
303
+
304
+ if not x_data_is_valid:
305
+ dummy_x_col_name = "__dummy_x_for_plotting__"
306
+ data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
307
+ x_col_to_use = dummy_x_col_name
308
+ logger.info("Using dummy x-values for plotting.")
309
+
310
+ # --- Step 5: Clean Data and Initialize Figure ---
311
+ data_plot.dropna(subset=[y_col_to_use, x_col_to_use], inplace=True)
312
+ fig = go.Figure()
313
+ if data_plot.empty:
314
+ logger.warning(f"No valid data to plot for y='{y_col_to_use}' and x='{x_col_to_use}'.")
315
+ return fig
316
+
317
+ # --- Step 6: Calculate and Draw the Efficiency Frontier Line ---
318
+ if x_data_is_valid:
319
+ # Sort by cost (ascending), then by score (descending) to break ties
320
+ sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
321
+
322
+ frontier_points = []
323
+ max_score_so_far = float('-inf')
324
+
325
+ for index, row in sorted_data.iterrows():
326
+ score = row[y_col_to_use]
327
+ # If this point offers a better score than any we've seen before,
328
+ # it's part of the frontier.
329
+ if score > max_score_so_far:
330
+ frontier_points.append({'x': row[x_col_to_use], 'y': score})
331
+ max_score_so_far = score
332
+
333
+ # Add the frontier line trace to the plot if we found any points
334
+ if frontier_points:
335
+ frontier_df = pd.DataFrame(frontier_points)
336
+ fig.add_trace(go.Scatter(
337
+ x=frontier_df['x'],
338
+ y=frontier_df['y'],
339
+ mode='lines',
340
+ name='Efficiency Frontier',
341
+ line=dict(color='firebrick', width=2, dash='dash'),
342
+ hoverinfo='skip' # The line doesn't need a hover tooltip
343
+ ))
344
+
345
+ # --- Step 7: Plot Individual Agent Markers ---
346
+ for agent, group in data_plot.groupby(agent_col):
347
+ hover_x_display = "%{x:.2f}" if x_data_is_valid else "N/A"
348
+ fig.add_trace(go.Scatter(
349
+ x=group[x_col_to_use],
350
+ y=group[y_col_to_use],
351
+ mode='markers',
352
+ name=str(agent),
353
+ hovertemplate=f"<b>{str(agent)}</b><br>{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}<extra></extra>",
354
+ marker=dict(size=10, opacity=0.8)
355
+ ))
356
+
357
+ # --- Step 8: Configure Layout ---
358
+ xaxis_config = dict(title=x_axis_label)
359
+ if not x_data_is_valid:
360
+ xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
361
+ xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS]
362
+ else:
363
+ xaxis_config['rangemode'] = "tozero"
364
+
365
+ fig.update_layout(
366
+ title=f"{y_col_to_use} vs. {x_axis_label}",
367
+ xaxis=xaxis_config,
368
+ yaxis=dict(title=y_col_to_use, rangemode="tozero"),
369
+ legend_title_text=agent_col
370
+ )
371
+
372
+ return fig
373
+
374
+ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
375
+ """
376
+ Applies custom formatting to a cost column based on its corresponding score column.
377
+ - If cost is not null, it remains unchanged.
378
+ - If cost is null but score is not, it becomes "Missing Cost".
379
+ - If both cost and score are null, it becomes "Not Attempted".
380
+ Args:
381
+ df: The DataFrame to modify.
382
+ cost_col_name: The name of the cost column to format (e.g., "Overall Cost").
383
+ Returns:
384
+ The DataFrame with the formatted cost column.
385
+ """
386
+ # Find the corresponding score column by replacing "Cost" with "Score"
387
+ score_col_name = cost_col_name.replace("Cost", "Score")
388
+
389
+ # Ensure the score column actually exists to avoid errors
390
+ if score_col_name not in df.columns:
391
+ return df # Return the DataFrame unmodified if there's no matching score
392
+
393
+ def apply_formatting_logic(row):
394
+ cost_value = row[cost_col_name]
395
+ score_value = row[score_col_name]
396
+ status_color = "#ec4899"
397
+
398
+ if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
399
+ return f"${cost_value:.2f}"
400
+ elif pd.notna(score_value):
401
+ return f'<span style="color: {status_color};">Missing Cost</span>' # Score exists, but cost is missing
402
+ else:
403
+ return f'<span style="color: {status_color};">Not Attempted</span>' # Neither score nor cost exists
404
+
405
+ # Apply the logic to the specified cost column and update the DataFrame
406
+ df[cost_col_name] = df.apply(apply_formatting_logic, axis=1)
407
+
408
+ return df
409
+
410
+ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
411
+ """
412
+ Applies custom formatting to a score column for display.
413
+ - If a score is 0 or NaN, it's displayed as a colored "0".
414
+ - Other scores are formatted to two decimal places.
415
+ """
416
+ status_color = "#ec4899" # The same color as your other status text
417
+
418
+ # First, fill any NaN values with 0 so we only have one case to handle.
419
+ # We must use reassignment to avoid the SettingWithCopyWarning.
420
+ df[score_col_name] = df[score_col_name].fillna(0)
421
+
422
+ def apply_formatting(score_value):
423
+ # Now, we just check if the value is 0.
424
+ if score_value == 0:
425
+ return f'<span style="color: {status_color};">0.0</span>'
426
+
427
+ # For all other numbers, format them for consistency.
428
+ if isinstance(score_value, (int, float)):
429
+ return f"{score_value:.2f}"
430
+
431
+ # Fallback for any unexpected non-numeric data
432
+ return score_value
433
+
434
+ # Apply the formatting and return the updated DataFrame
435
+ return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
436
+
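
A short sketch (not part of the commit) of the two formatters together; the cost column is formatted first because format_score_column replaces missing scores with a displayable 0:

df = pd.DataFrame({"Overall Score": [0.71, None], "Overall Cost": [None, None]})
df = format_cost_column(df, "Overall Cost")    # row 0 -> "Missing Cost", row 1 -> "Not Attempted"
df = format_score_column(df, "Overall Score")  # 0.71 -> "0.71", missing score -> highlighted "0.0"
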
leaderboard_viewer.py ADDED
@@ -0,0 +1,319 @@
1
+ """
2
+ View and plot leaderboard results.
3
+ """
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from zoneinfo import ZoneInfo
8
+
9
+ import datasets
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import pandas as pd
13
+ import seaborn as sns
14
+
15
+ from agenteval import compute_summary_statistics
16
+ from agenteval.config import SuiteConfig
17
+ from agenteval.models import EvalResult
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class LeaderboardViewer:
23
+ """
24
+ Load and visualize leaderboard for a given HF dataset split.
25
+ """
26
+
27
+ def __init__(
28
+ self, repo_id: str, config: str, split: str, is_internal: bool = False
29
+ ):
30
+ self._repo_id = repo_id
31
+ self._config = config
32
+ self._split = split
33
+ self._internal = is_internal
34
+
35
+ # build suite_config and mapping from tags to tasks from the first result
36
+ # TODO: Verify the sort order
37
+ ds = datasets.load_dataset(repo_id, name=config).get(split)
38
+ if not ds:
39
+ raise ValueError(f"Split '{split}' not found in dataset results")
40
+ suite = EvalResult.model_validate(ds[0]).suite_config
41
+ self._cfg = suite
42
+ self.tag_map: dict[str, list[str]] = {}
43
+ for task in suite.get_tasks(split):
44
+ for t in task.tags or []:
45
+ self.tag_map.setdefault(t, []).append(task.name)
46
+
47
+ def _load(self):
48
+ results = datasets.load_dataset(self._repo_id, name=self._config)
49
+ overview = _get_dataframe(
50
+ eval_results=results,
51
+ split=self._split,
52
+ is_internal=self._internal,
53
+ suite_config=self._cfg,
54
+ )
55
+ return overview, self.tag_map
56
+
57
+ def view(
58
+ self, tag: Optional[str] = None, with_plots: bool = False
59
+ ) -> tuple[pd.DataFrame, dict[str, plt.Figure]]:
60
+ """
61
+ If tag is None, primary="Overall" and group=all tags.
62
+ Otherwise primary=tag and group=tasks under that tag.
63
+ """
64
+ data, tag_map = self._load()
65
+ cols = [
66
+ "Agent",
67
+ "Submitter",
68
+ "Completeness",
69
+ "LLM Base",
70
+ "Openness",
71
+ "Date",
72
+ "Logs",
73
+ ]
74
+
75
+ # choose primary metric and its sub‐group
76
+ if tag is None:
77
+ primary = "Overall"
78
+ group = list(tag_map.keys())
79
+ else:
80
+ primary = tag
81
+ group = tag_map.get(tag, [])
82
+ data = data.sort_values(primary, ascending=False)
83
+
84
+ # build full metric list: primary + its cost + each member and its cost
85
+ metrics = [primary, f"{primary} cost"] + [
86
+ m for t in group for m in (t, f"{t} cost")
87
+ ]
88
+
89
+ # filter to relevant columns
90
+ ci_cols = [f"{m} 95% CI" for m in metrics if f"{m} 95% CI" in data.columns]
91
+ df = data.loc[
92
+ :,
93
+ cols + [c for c in metrics if c in data.columns] + ci_cols,
94
+ ].reset_index(drop=True)
95
+
96
+ plots: dict[str, plt.Figure] = {}
97
+ if with_plots:
98
+ avail = [c for c in metrics if c in df.columns]
99
+ for m in [primary] + group:
100
+ x, y = f"{m} cost", m
101
+ if x in df.columns and y in df.columns:
102
+ plots[f"scatter_{m}"] = _plot_scatter(
103
+ df, x=x, y=y, agent_col="Agent"
104
+ )
105
+
106
+ return df, plots
107
+
108
+
109
+ def _get_dataframe(
110
+ eval_results: datasets.DatasetDict,
111
+ split: str,
112
+ is_internal: bool,
113
+ suite_config: SuiteConfig,
114
+ timezone: str = "US/Pacific",
115
+ ) -> pd.DataFrame:
116
+ """
117
+ Load leaderboard results from the given dataset split and return a DataFrame.
118
+ """
119
+ ds = eval_results.get(split)
120
+ if not ds:
121
+ cols = ["agent_name", "agent_description", "username", "submit_time"]
122
+ pretty = [_pretty_column_name(c) for c in cols]
123
+ empty = pd.DataFrame({c: ["No data"] for c in pretty})
124
+ return empty
125
+
126
+ cfg = suite_config
127
+
128
+ rows = []
129
+ for itm in ds:
130
+ ev = EvalResult.model_validate(itm)
131
+ sub = ev.submission
132
+ # only format if submit_time present, else leave as None
133
+ ts = sub.submit_time
134
+ if ts is not None:
135
+ date = ts.astimezone(ZoneInfo(timezone)).strftime("%Y-%m-%d")
136
+ else:
137
+ date = None
138
+
139
+ if not ev.results:
140
+ logger.warning(
141
+ f"Skipping submission {sub.agent_name} ({sub.username}) "
142
+ f"({sub.submit_time}) with no results"
143
+ )
144
+ continue
145
+ stats = compute_summary_statistics(
146
+ suite_config=cfg, split=split, results=ev.results
147
+ )
148
+ flat = {}
149
+ for key, s in stats.items():
150
+ parts = key.split("/")
151
+ if parts[0] == "overall":
152
+ flat["overall/score"], flat["overall/cost"] = s.score, s.cost
153
+ elif parts[0] == "tag":
154
+ flat[f"tag/{parts[1]}/score"], flat[f"tag/{parts[1]}/cost"] = (
155
+ s.score,
156
+ s.cost,
157
+ )
158
+ else: # task
159
+ t0 = parts[1]
160
+ # compute 95% CI half-width from stderr
161
+ flat.update(
162
+ {
163
+ f"task/{t0}/score": s.score,
164
+ f"task/{t0}/score_ci": (
165
+ (s.score_stderr * 1.96)
166
+ if s.score_stderr is not None
167
+ else np.nan
168
+ ),
169
+ f"task/{t0}/cost": s.cost,
170
+ f"task/{t0}/cost_ci": (
171
+ (s.cost_stderr * 1.96)
172
+ if s.cost_stderr is not None
173
+ else np.nan
174
+ ),
175
+ }
176
+ )
177
+
178
+ rows.append(
179
+ {
180
+ "agent_name": sub.agent_name,
181
+ "username": sub.username or "",
182
+ "submit_time": date,
183
+ **flat,
184
+ "logs_url": sub.logs_url if is_internal else sub.logs_url_public,
185
+ }
186
+ )
187
+
188
+ df = pd.DataFrame(rows)
189
+
190
+ # prepare pretty column mapping
191
+ pretty_cols = {c: _pretty_column_name(c) for c in df.columns}
192
+
193
+ # construct overview table with human-friendly names
194
+ overview = df.rename(columns=pretty_cols)
195
+
196
+ return overview
197
+
198
+
199
+ def _pretty_column_name(col: str) -> str:
200
+ """Map raw column name to display name."""
201
+ # fixed mappings
202
+ mapping = {
203
+ "submit_time": "Date",
204
+ "agent_name": "Agent",
205
+ "username": "User/organization",
206
+ "logs_url": "Logs",
207
+ "overall/score": "Score",
208
+ "overall/cost": "Cost (USD)",
209
+ }
210
+ if col in mapping:
211
+ return mapping[col]
212
+ # dynamic: task/{name}/{metric} or tag/{name}/{metric}
213
+ parts = col.split("/")
214
+ if len(parts) == 3:
215
+ _, name, metric = parts
216
+ if metric == "score":
217
+ return name
218
+ if metric == "cost":
219
+ return f"{name} cost"
220
+ if metric == "score_ci":
221
+ return f"{name} 95% CI"
222
+ if metric == "cost_ci":
223
+ return f"{name} cost 95% CI"
224
+ # fallback to last segment
225
+ return parts[-1]
226
+
227
+
228
+
229
+ def _plot_scatter(
230
+ data: pd.DataFrame,
231
+ x: str, # Cost column name (e.g., "Overall cost")
232
+ y: str, # Score column name (e.g., "Overall score")
233
+ agent_col: str,
234
+ ) -> plt.Figure:
235
+ """Scatter plot of agent results, showing score vs cost with Pareto frontier."""
236
+ fig, ax = plt.subplots(figsize=(20,7))
237
+
238
+ # Make a copy for manipulation to find frontier without affecting original data
239
+ plot_data = data.copy()
240
+
241
+ # Ensure score (y) and cost (x) are numeric and drop NaNs for frontier calculation
242
+ plot_data[y] = pd.to_numeric(plot_data[y], errors='coerce')
243
+ plot_data[x] = pd.to_numeric(plot_data[x], errors='coerce')
244
+ frontier_data = plot_data.dropna(subset=[y, x])
245
+
+ pareto_points = []  # defined up front so the legend check near the end of this function never hits an unbound name
246
+ if not frontier_data.empty:
247
+ # Sort by cost (x) ascending, then by score (y) descending for tie-breaking
248
+ frontier_data = frontier_data.sort_values(by=[x, y], ascending=[True, False])
249
+
250
+ pareto_points = []
251
+ max_score_at_cost = -np.inf # Initialize with negative infinity
252
+
253
+ for index, row in frontier_data.iterrows():
254
+ current_score = row[y]
255
+ current_cost = row[x]
256
+ # Only add point if it offers a higher score than any previous point
257
+ # on the frontier with less or equal cost (implicit by sorting).
258
+ # More strictly, for a point to be on the frontier here, it must improve the score.
259
+ if current_score > max_score_at_cost:
260
+ # Optional: If allowing same score but lower cost (already handled by sort somewhat)
261
+ # you might need to check if a point with same score but lower cost exists
262
+ # For this algorithm, we simply take points that strictly increase score.
263
+ pareto_points.append(row)
264
+ max_score_at_cost = current_score
265
+
266
+ if pareto_points:
267
+ pareto_df = pd.DataFrame(pareto_points)
268
+ # Sort pareto_df by cost again just to be sure for plotting line
269
+ pareto_df = pareto_df.sort_values(by=x)
270
+ # Plot the Pareto frontier line
271
+ ax.plot(pareto_df[x], pareto_df[y], marker='o', linestyle='-', color='red', alpha=0.7, linewidth=2, markersize=5, label='Pareto Frontier')
272
+
273
+ # Plot all data points
274
+ sns.scatterplot(data=data, x=x, y=y, hue=agent_col, s=100, ax=ax, legend="auto")
275
+
276
+ # Error bars (if they exist)
277
+ x_ci_col = f"{x} 95% CI"
278
+ y_ci_col = f"{y} 95% CI"
279
+ if x_ci_col in data.columns or y_ci_col in data.columns:
280
+ # Filter data for error bars to only include rows present in the original 'data'
281
+ # This is important if 'frontier_data' subset was used for some logic but error bars are for all.
282
+ error_bar_data = data.copy() # Use original data for error bars
283
+ error_bar_data[x_ci_col] = pd.to_numeric(error_bar_data.get(x_ci_col), errors='coerce')
284
+ error_bar_data[y_ci_col] = pd.to_numeric(error_bar_data.get(y_ci_col), errors='coerce')
285
+
286
+ ax.errorbar(
287
+ x=error_bar_data[x], # Use original data's x
288
+ y=error_bar_data[y], # Use original data's y
289
+ xerr=error_bar_data.get(x_ci_col),
290
+ yerr=error_bar_data.get(y_ci_col),
291
+ fmt="none",
292
+ ecolor="gray",
293
+ alpha=0.5,
294
+ capsize=3,
295
+ zorder=0 # Draw error bars behind scatter points
296
+ )
297
+
298
+ ax.set_xlim(left=0)
299
+ ax.set_ylim(bottom=0) # Scores and costs are typically non-negative
300
+ ax.set_xlabel(x) # x is cost
301
+ ax.set_ylabel(y) # y is score
302
+
303
+ # Adjust legend: Get handles and labels from seaborn plot, then add frontier's
304
+ handles, labels = ax.get_legend_handles_labels()
305
+ # Check if "Pareto Frontier" was actually plotted and add its handle/label if so
306
+ if pareto_points and "Pareto Frontier" not in labels: # Avoid duplicate legend items
307
+ # Find the frontier line object to get its handle
308
+ frontier_line = next((line for line in ax.get_lines() if line.get_label() == 'Pareto Frontier'), None)
309
+ if frontier_line:
310
+ handles.append(frontier_line)
311
+ labels.append('Pareto Frontier')
312
+
313
+ ax.legend(handles=handles, labels=labels, title=agent_col, bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
314
+
315
+ plt.tight_layout(rect=[0, 0, 0.85, 1])
316
+ return fig
317
+
318
+
319
+ __all__ = ["LeaderboardViewer"]
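
A minimal sketch (not part of the commit) of the Pareto-frontier helper on a hand-built frame; the column names follow this module's conventions:

demo = pd.DataFrame({
    "Agent": ["a1", "a2", "a3"],
    "Overall": [0.30, 0.55, 0.25],
    "Overall cost": [1.0, 3.0, 2.0],
})
fig = _plot_scatter(demo, x="Overall cost", y="Overall", agent_col="Agent")
# a1 and a2 form the Pareto frontier; a3 is dominated (it costs more than a1 for a lower score).
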
literature_understanding.py ADDED
@@ -0,0 +1,54 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
+
7
+ # Define the category for this page
8
+ CATEGORY_NAME = "Literature Understanding"
9
+
10
+ with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
12
+
13
+ # --- This page now has two main sections: Validation and Test ---
14
+ with gr.Tabs():
15
+ with gr.Tab("Results: Validation"):
16
+ # 1. Load all necessary data for the "validation" split ONCE.
17
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
18
+
19
+ if not validation_df.empty:
20
+ # 2. Render the main category display using the loaded data.
21
+ create_leaderboard_display(
22
+ full_df=validation_df,
23
+ tag_map=validation_tag_map,
24
+ category_name=CATEGORY_NAME,
25
+ split_name="validation"
26
+ )
27
+
28
+ # 3. Render the detailed breakdown for each benchmark in the category.
29
+ create_benchmark_details_display(
30
+ full_df=validation_df,
31
+ tag_map=validation_tag_map,
32
+ category_name=CATEGORY_NAME
33
+ )
34
+ else:
35
+ gr.Markdown("No data available for validation split.")
36
+
37
+ with gr.Tab("Results: Test"):
38
+ # Repeat the process for the "test" split
39
+ test_df, test_tag_map = get_full_leaderboard_data("test")
40
+
41
+ if not test_df.empty:
42
+ create_leaderboard_display(
43
+ full_df=test_df,
44
+ tag_map=test_tag_map,
45
+ category_name=CATEGORY_NAME,
46
+ split_name="test"
47
+ )
48
+ create_benchmark_details_display(
49
+ full_df=test_df,
50
+ tag_map=test_tag_map,
51
+ category_name=CATEGORY_NAME
52
+ )
53
+ else:
54
+ gr.Markdown("No data available for test split.")
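
The other category pages can presumably follow the same two-tab pattern; a hypothetical sketch for another category (the file name and category are illustrative, not part of this commit):

# data_analysis.py (hypothetical)
import gradio as gr
from ui_components import (
    create_leaderboard_display,
    create_benchmark_details_display,
    get_full_leaderboard_data,
)

CATEGORY_NAME = "Data Analysis"

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
    with gr.Tabs():
        for split in ("validation", "test"):
            with gr.Tab(f"Results: {split.capitalize()}"):
                df, tag_map = get_full_leaderboard_data(split)
                if not df.empty:
                    create_leaderboard_display(
                        full_df=df, tag_map=tag_map,
                        category_name=CATEGORY_NAME, split_name=split,
                    )
                    create_benchmark_details_display(
                        full_df=df, tag_map=tag_map, category_name=CATEGORY_NAME,
                    )
                else:
                    gr.Markdown(f"No data available for {split} split.")
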
main_page.py ADDED
@@ -0,0 +1,375 @@
1
+ import matplotlib
2
+ matplotlib.use('Agg')
3
+
4
+ import os
5
+ import shutil
6
+ import tarfile
7
+ import tempfile
8
+ from datetime import datetime, timedelta, timezone
9
+ from email.utils import parseaddr
10
+ from pathlib import Path
11
+ # from zoneinfo import ZoneInfo # LeaderboardViewer uses this, ensure it's available
12
+
13
+ import gradio as gr
14
+ import requests
15
+ from agenteval import (
16
+ # compute_summary_statistics, # This will now be used by LeaderboardViewer
17
+ process_eval_logs,
18
+ upload_folder_to_hf,
19
+ upload_summary_to_hf,
20
+ )
21
+ from agenteval.models import EvalResult # Used by submission and LeaderboardViewer (implicitly)
22
+ from agenteval.leaderboard.upload import sanitize_path_component
23
+ from datasets import Dataset, DatasetDict, VerificationMode, load_dataset # load_dataset used by LV
24
+ from datasets.data_files import EmptyDatasetError
25
+ from huggingface_hub import HfApi
26
+
27
+ from ui_components import create_leaderboard_display, get_full_leaderboard_data
28
+
29
+ from content import (
30
+ CITATION_BUTTON_LABEL,
31
+ CITATION_BUTTON_TEXT,
32
+ INTRODUCTION_TEXT,
33
+ SUBMISSION_TEXT,
34
+ INTRO_PARAGRAPH,
35
+ SCATTER_DISCLAIMER,
36
+ format_error,
37
+ format_log,
38
+ format_warning,
39
+ )
40
+
41
+ # --- Constants and Configuration ---
42
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
43
+ CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
44
+ IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
45
+
46
+ OWNER = "allenai"
47
+ PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
48
+ SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
49
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
50
+ CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
51
+ RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
52
+ LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
53
+
54
+ if LOCAL_DEBUG:
55
+ DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
56
+ else:
57
+ DATA_DIR = "/home/user/data/" + CONFIG_NAME
58
+ EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
59
+
60
+ api = HfApi()
61
+ MAX_UPLOAD_BYTES = 100 * 1024**2
62
+ AGENTEVAL_MANIFEST_NAME = "agenteval.json"
63
+ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
64
+
65
+ # --- Global State for Viewers (simple caching) ---
66
+ CACHED_VIEWERS = {}
67
+ CACHED_TAG_MAPS = {}
68
+
69
+ # --- Submission Logic (largely unchanged from the original app.py) ---
70
+ def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
71
+ try:
72
+ return load_dataset(*args, **kwargs)
73
+ except EmptyDatasetError:
74
+ return DatasetDict()
75
+ except ValueError: # Handles cases where dataset is empty or ill-formed
76
+ return DatasetDict()
77
+
78
+ def checked_upload_folder(
79
+ api_hf: HfApi, # Renamed to avoid conflict with global api
80
+ folder_path: str,
81
+ repo_id: str,
82
+ config_name_ul: str, # Renamed
83
+ split_ul: str, # Renamed
84
+ submission_name_ul: str, # Renamed
85
+ ) -> str:
86
+ total = 0
87
+ for root, _, files in os.walk(folder_path):
88
+ for f_ul in files: # Renamed
89
+ total += os.path.getsize(os.path.join(root, f_ul))
90
+ if total > MAX_UPLOAD_BYTES:
91
+ raise ValueError(
92
+ f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
93
+ )
94
+ return upload_folder_to_hf(
95
+ api=api_hf, # Use renamed parameter
96
+ folder_path=folder_path,
97
+ repo_id=repo_id,
98
+ config_name=config_name_ul,
99
+ split=split_ul,
100
+ submission_name=submission_name_ul,
101
+ )
102
+
103
+ def add_new_eval(
104
+ val_or_test: str,
105
+ agent_name: str | None,
106
+ agent_description: str,
107
+ agent_url: str,
108
+ openness: str | None,
109
+ degree_of_control: str | None,
110
+ path_to_file: tempfile._TemporaryFileWrapper | None,
111
+ username: str,
112
+ mail: str,
113
+ profile: gr.OAuthProfile,
114
+ # We need global eval_results for checks; this might need rethinking if it's purely display driven now
115
+ # For now, let's assume we still load it for submission checks
116
+ ):
117
+ # Load current eval_results for submission checks
118
+ # This is a bit redundant if display part reloads it, but submission needs its own consistent view
119
+ current_eval_results_for_submission = try_load_dataset_submission(
120
+ RESULTS_DATASET,
121
+ CONFIG_NAME,
122
+ download_mode="force_redownload", # Or a less aggressive mode
123
+ verification_mode=VerificationMode.NO_CHECKS,
124
+ trust_remote_code=True,
125
+ )
126
+ if not agent_name:
127
+ return format_warning("Please provide an agent name.")
128
+
129
+ submission_time = datetime.now(timezone.utc)
130
+ if not username or username.strip() == "":
131
+ username = profile.username # Default to HF username
132
+
133
+ # User account age check
134
+ try:
135
+ user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
136
+ user_data_resp.raise_for_status()
137
+ creation_date_str = user_data_resp.json()["createdAt"]
138
+ created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
139
+ if submission_time - created_at < timedelta(days=60):
140
+ return format_error("This account is not authorized to submit here (account too new).")
141
+ except Exception as e:
142
+ print(f"Error checking user account age: {e}")
143
+ return format_error("Could not verify account age. Please try again later.")
144
+
145
+ # Submission frequency check
146
+ contact_infos = try_load_dataset_submission(
147
+ CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
148
+ verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
149
+ )
150
+ user_submission_dates = sorted(
151
+ datetime.fromisoformat(row["submit_time"])
152
+ for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
153
+ )
154
+ if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
155
+ return format_error("You already submitted once in the last 24h for this split; please try again later.")
156
+
157
+ # Email validation
158
+ _, parsed_mail = parseaddr(mail)
159
+ if "@" not in parsed_mail:
160
+ return format_warning("Please provide a valid email address.")
161
+
162
+ # Duplicate submission check
163
+ if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
164
+ existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
165
+ for sub_item in existing_submissions:
166
+ if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
167
+ sub_item.get("username", "").lower() == username.lower()):
168
+ return format_warning("This agent name by this user has already been submitted to this split.")
169
+
170
+ if path_to_file is None:
171
+ return format_warning("Please attach a .tar.gz file.")
172
+
173
+ safe_username = sanitize_path_component(username)
174
+ safe_agent_name = sanitize_path_component(agent_name)
175
+ extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
176
+
177
+ # File extraction
178
+ if not LOCAL_DEBUG:
179
+ try:
180
+ if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
181
+ os.makedirs(extracted_dir, exist_ok=True)
182
+ with tarfile.open(path_to_file.name, "r:gz") as tar:
183
+ members_extracted = 0
184
+ for member in tar.getmembers():
185
+ if not member.isreg(): continue
186
+ fname = os.path.basename(member.name)
187
+ if not fname or fname.startswith("."): continue
188
+ fobj = tar.extractfile(member)
189
+ if not fobj: continue
190
+ with open(os.path.join(extracted_dir, fname), "wb") as out:
191
+ out.write(fobj.read())
192
+ members_extracted +=1
193
+ if members_extracted == 0:
194
+ return format_error("Submission tarball is empty or contains no valid files.")
195
+ except Exception as e:
196
+ return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
197
+ else: print("mock extracted file", flush=True)
198
+
199
+
200
+ submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
201
+
202
+ # 1. Upload raw (unscored) submission files
203
+ if not LOCAL_DEBUG:
204
+ try:
205
+ checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
206
+ except ValueError as e: return format_error(str(e))
207
+ except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
208
+ else: print("mock uploaded raw submission", flush=True)
209
+
210
+ # 2. Save contact information
211
+ contact_info = {
212
+ "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
213
+ "username": username, "username_auth": profile.username, "mail": mail,
214
+ "submit_time": submission_time.isoformat(),
215
+ }
216
+ if val_or_test in contact_infos:
217
+ contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
218
+ else:
219
+ contact_infos[val_or_test] = Dataset.from_list([contact_info])
220
+
221
+ if not LOCAL_DEBUG:
222
+ try:
223
+ contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
224
+ except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
225
+ else: print("mock uploaded contact info", flush=True)
226
+
227
+
228
+ # 3. Process and score the submission
229
+ eval_result_obj = None # Define to avoid NameError
230
+ try:
231
+ json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
232
+ if not json_path.exists():
233
+ return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
234
+
235
+ eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
236
+ if eval_result_obj.suite_config.version != CONFIG_NAME:
237
+ return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
238
+ if eval_result_obj.split != val_or_test:
239
+ return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
240
+
241
+ # Re-compute results from logs for integrity
242
+ eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
243
+ eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
244
+
245
+ except Exception as e:
246
+ return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
247
+
248
+ # 4. Upload scored submission files
249
+ logs_url_private_val, logs_url_public_val = None, None
250
+ scored_submission_name = f"{submission_name}_scored"
251
+ if not LOCAL_DEBUG:
252
+ try:
253
+ logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
254
+ if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
255
+ logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
256
+ except ValueError as e: return format_error(str(e))
257
+ except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
258
+ else: print("mock uploaded scored submission", flush=True)
259
+
260
+
261
+ # Update EvalResult with submission details
262
+ eval_result_obj.submission.agent_name = agent_name
263
+ eval_result_obj.submission.agent_description = agent_description
264
+ eval_result_obj.submission.agent_url = agent_url
265
+ eval_result_obj.submission.openness = openness
266
+ eval_result_obj.submission.degree_of_control = degree_of_control
267
+ eval_result_obj.submission.username = username
268
+ eval_result_obj.submission.submit_time = submission_time
269
+ eval_result_obj.submission.logs_url = logs_url_private_val
270
+ eval_result_obj.submission.logs_url_public = logs_url_public_val
271
+
272
+ # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
273
+ if not LOCAL_DEBUG:
274
+ try:
275
+ upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
276
+ except Exception as e:
277
+ return format_error(f"Failed to upload summary results to leaderboard: {e}")
278
+ else: print("mock uploaded results to lb", flush=True)
279
+
280
+ # Invalidate viewer cache for the split that was updated
281
+ if val_or_test in CACHED_VIEWERS:
282
+ del CACHED_VIEWERS[val_or_test]
283
+ if val_or_test in CACHED_TAG_MAPS:
284
+ del CACHED_TAG_MAPS[val_or_test]
285
+
286
+
287
+ return format_log(
288
+ f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
289
+ "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
290
+ )
291
+
292
+ with gr.Blocks() as demo:
293
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
294
+ gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")
295
+
296
+ # --- Submission Accordion ---
297
+ with gr.Accordion("🚀 Submit a new agent for evaluation", open=False, elem_classes="submission-accordion"):
298
+ gr.Markdown(SUBMISSION_TEXT, elem_id="markdown-text")
299
+ with gr.Row():
300
+ with gr.Column():
301
+ level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
302
+ agent_name_tb = gr.Textbox(label="Agent Name")
303
+ agent_desc_tb = gr.Textbox(label="Agent Description")
304
+ agent_url_tb = gr.Textbox(label="URL to Agent Information")
305
+ openness_radio = gr.Radio(["Open Source", "API", "UI"], value=None, label="Openness of Agent")
306
+ degree_of_control_radio = gr.Radio(["Standard", "Custom"], value=None, label="Degree of Control")
307
+ with gr.Column():
308
+ username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
309
+ mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
310
+ file_upload_comp = gr.File(
311
+ label="Submission File (.tar.gz ...)", # Shortened for brevity
312
+ file_types=[".gz", ".tar.gz"]
313
+ )
314
+ with gr.Row():
315
+ gr.LoginButton()
316
+ submit_eval_button = gr.Button("Submit Evaluation")
317
+ submission_result = gr.Markdown()
318
+
319
+ submit_eval_button.click(
320
+ add_new_eval,
321
+ [
322
+ level_of_test_radio,
323
+ agent_name_tb,
324
+ agent_desc_tb,
325
+ agent_url_tb,
326
+ openness_radio,
327
+ degree_of_control_radio,
328
+ file_upload_comp,
329
+ username_tb,
330
+ mail_tb
331
+ ],
332
+ submission_result,
333
+ )
334
+
335
+ # --- Leaderboard Display Section ---
336
+ gr.Markdown("---")
337
+ CATEGORY_NAME = "Overall"
338
+ gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
339
+
340
+ with gr.Tabs() as tabs:
341
+ with gr.Tab("Results: Validation"):
342
+ # 1. Load all necessary data for the "validation" split ONCE.
343
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
344
+
345
+ # Check if data was loaded successfully before trying to display it
346
+ if not validation_df.empty:
347
+ # 2. Render the display by calling the factory with the loaded data.
348
+ create_leaderboard_display(
349
+ full_df=validation_df,
350
+ tag_map=validation_tag_map,
351
+ category_name=CATEGORY_NAME, # Use our constant
352
+ split_name="validation"
353
+ )
354
+ else:
355
+ # Display a message if no data is available
356
+ gr.Markdown("No data available for validation split.")
357
+
358
+ with gr.Tab("Results: Test"):
359
+ test_df, test_tag_map = get_full_leaderboard_data("test")
360
+ if not test_df.empty:
361
+ create_leaderboard_display(
362
+ full_df=test_df,
363
+ tag_map=test_tag_map,
364
+ category_name=CATEGORY_NAME, # Use our constant
365
+ split_name="test"
366
+ )
367
+ else:
368
+ gr.Markdown("No data available for test split.")
369
+
370
+ with gr.Accordion("📙 Citation", open=False):
371
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
372
+
373
+
374
+ if __name__ == "__main__":
375
+ demo.launch()
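# Editorial note, not part of this commit: this hunk does not show app.py's import block,
# but the calls to get_full_leaderboard_data and create_leaderboard_display above suggest
# the UI helpers are imported from ui_components.py, roughly as in the assumed line below.
#
# from ui_components import get_full_leaderboard_data, create_leaderboard_display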
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  datasets
2
- gradio[oauth]
3
  huggingface-hub
4
  APScheduler
5
- agent-eval @ git+https://github.com/allenai/agent-eval.git@d302cb5d0ba983ae5f0764c53fde4e017118d0df#egg=agent-eval
 
1
  datasets
2
+ gradio[oauth]==5.30.0
3
  huggingface-hub
4
  APScheduler
5
+ agent-eval==0.1.9
ui_components.py ADDED
@@ -0,0 +1,293 @@
1
+ import gradio as gr
2
+ from gradio.events import SelectData
3
+ import pandas as pd
4
+ import plotly.graph_objects as go
5
+ import os
6
+
7
+ from agenteval.leaderboard.view import LeaderboardViewer
8
+ from huggingface_hub import HfApi
9
+
10
+ from leaderboard_transformer import DataTransformer, transform_raw_dataframe, create_pretty_tag_map, INFORMAL_TO_FORMAL_NAME_MAP, _plot_scatter_plotly, format_cost_column, format_score_column
11
+ from content import (
12
+ SCATTER_DISCLAIMER,
13
+ format_error,
14
+ format_log,
15
+ format_warning,
16
+ hf_uri_to_web_url,
17
+ hyperlink,
18
+ )
19
+
20
+ # --- Constants and Configuration ---
21
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
22
+ CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
23
+ IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
24
+
25
+ OWNER = "allenai"
26
+ PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
27
+ SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
28
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
29
+ CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
30
+ RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
31
+ LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
32
+
33
+ if LOCAL_DEBUG:
34
+ DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
35
+ else:
36
+ DATA_DIR = "/home/user/data/" + CONFIG_NAME
37
+ EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
38
+
39
+ api = HfApi()
40
+ MAX_UPLOAD_BYTES = 100 * 1024**2
41
+ AGENTEVAL_MANIFEST_NAME = "agenteval.json"
42
+ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
43
+
44
+
45
+ # --- Global State for Viewers (simple caching) ---
46
+ CACHED_VIEWERS = {}
47
+ CACHED_TAG_MAPS = {}
48
+
49
+ # --- Helper class: lets error states expose the same ._load() interface as LeaderboardViewer ---
50
+ class DummyViewer:
51
+ """A mock viewer to be cached on error. It has a ._load() method
52
+ to ensure it behaves like the real LeaderboardViewer."""
53
+ def __init__(self, error_df):
54
+ self._error_df = error_df
55
+
56
+ def _load(self):
57
+ # The _load method returns the error DataFrame and an empty tag map
58
+ return self._error_df, {}
59
+
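# Editorial sketch, not part of this commit: DummyViewer mirrors the `_load()` contract of
# the real LeaderboardViewer, so callers can unpack `(DataFrame, tag_map)` without caring
# which type they received. The error text below is purely illustrative.
def _example_dummy_viewer_contract() -> None:
    fallback = DummyViewer(pd.DataFrame({"Message": ["results dataset unavailable"]}))
    df, tag_map = fallback._load()  # same return shape as LeaderboardViewer._load()
    assert list(df.columns) == ["Message"]
    assert tag_map == {}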
60
+ def get_leaderboard_viewer_instance(split: str):
61
+ """
62
+ Fetches the LeaderboardViewer for a split, using a cache to avoid
63
+ re-downloading data. On error, returns a stable DummyViewer object.
64
+ """
65
+ global CACHED_VIEWERS, CACHED_TAG_MAPS
66
+
67
+ if split in CACHED_VIEWERS:
68
+ # Cache hit: return the cached viewer and tag map
69
+ return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
70
+
71
+ # --- Cache miss: try to load data from the source ---
72
+ try:
73
+ print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
74
+ viewer = LeaderboardViewer(
75
+ repo_id=RESULTS_DATASET,
76
+ config=CONFIG_NAME,
77
+ split=split,
78
+ is_internal=IS_INTERNAL
79
+ )
80
+
81
+ # Simplify tag map creation
82
+ pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
83
+
84
+ # Cache the results for next time
85
+ CACHED_VIEWERS[split] = viewer
86
+ CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
87
+
88
+ return viewer, pretty_tag_map
89
+
90
+ except Exception as e:
91
+ # On ANY error, create a consistent error message and cache a DummyViewer
92
+ error_message = f"Error loading data for split '{split}': {e}"
93
+ print(format_error(error_message))
94
+
95
+ dummy_df = pd.DataFrame({"Message": [error_message]})
96
+ dummy_viewer = DummyViewer(dummy_df)
97
+ dummy_tag_map = {"Overall": []}
98
+
99
+ # Cache the dummy objects so we don't try to fetch again on this run
100
+ CACHED_VIEWERS[split] = dummy_viewer
101
+ CACHED_TAG_MAPS[split] = dummy_tag_map
102
+
103
+ return dummy_viewer, dummy_tag_map
104
+
105
+
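# Editorial sketch, not part of this commit: repeat calls for the same split are served from
# CACHED_VIEWERS, so the results dataset is fetched at most once per process. The split name
# "validation" is assumed to be one of the configured splits.
def _example_viewer_caching(split: str = "validation") -> None:
    first_viewer, first_tag_map = get_leaderboard_viewer_instance(split)
    second_viewer, _ = get_leaderboard_viewer_instance(split)
    assert first_viewer is second_viewer  # cache hit on the second call
    print(f"Categories available for '{split}':", list(first_tag_map.keys()))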
106
+ def create_leaderboard_display(
107
+ full_df: pd.DataFrame,
108
+ tag_map: dict,
109
+ category_name: str,
110
+ split_name: str
111
+ ):
112
+ """
113
+ This UI factory takes pre-loaded data and renders the main DataFrame and Plot
114
+ for a given category (e.g., "Overall" or "Literature Understanding").
115
+ """
116
+ # 1. Instantiate the transformer and get the specific view for this category.
117
+ # The function no longer loads data itself; it filters the data it receives.
118
+ transformer = DataTransformer(full_df, tag_map)
119
+ df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
120
+ # format cost columns
121
+ for col in df_view.columns:
122
+ if "Cost" in col:
123
+ df_view = format_cost_column(df_view, col)
124
+
125
+ # 2. Format the score columns
126
+ for col in df_view.columns:
127
+ if "Score" in col:
128
+ df_view = format_score_column(df_view, col)
129
+ scatter_plot = plots_dict.get('scatter_plot', go.Figure())
130
+
131
+ # 3. Define the UI components with the filtered data.
132
+ df_headers = df_view.columns.tolist()
133
+ df_datatypes = ["markdown" if col == "Logs" or "Cost" in col or "Score" in col else "str" for col in df_headers]
134
+
135
+ dataframe_component = gr.DataFrame(
136
+ headers=df_headers,
137
+ value=df_view,
138
+ datatype=df_datatypes,
139
+ interactive=False,
140
+ wrap=True,
141
+ column_widths=[100, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 75, 75, 50, 50]
142
+ )
143
+
144
+ plot_component = gr.Plot(
145
+ value=scatter_plot,
146
+ label=f"Score vs. Cost ({category_name})"
147
+ )
148
+ gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
149
+
150
+ # Return the components so they can be referenced elsewhere.
151
+ return dataframe_component, plot_component
152
+
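# Editorial sketch, not part of this commit: the same factory can render any category present
# in the tag map; "Literature Understanding" is the example category named in the docstring
# above, and "validation" is assumed to be a configured split.
def _example_render_category(full_df: pd.DataFrame, tag_map: dict) -> tuple:
    with gr.Blocks():
        table, plot = create_leaderboard_display(
            full_df=full_df,
            tag_map=tag_map,
            category_name="Literature Understanding",
            split_name="validation",
        )
    return table, plot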
153
+ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
154
+ """
155
+ Loads and transforms the complete dataset for a given split.
156
+ This function handles caching and returns the final "pretty" DataFrame and tag map.
157
+ """
158
+ # Reuse the cached viewer loading logic above
159
+ viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
160
+
161
+ if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
162
+ raw_df, _ = viewer_or_data._load()
163
+ if raw_df.empty:
164
+ return pd.DataFrame(), {}
165
+
166
+ pretty_df = transform_raw_dataframe(raw_df)
167
+ pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
168
+ if "Logs" in pretty_df.columns:
169
+ def format_log_entry_to_html(raw_uri):
170
+ if pd.isna(raw_uri) or raw_uri == "": return ""
171
+ web_url = hf_uri_to_web_url(str(raw_uri))
172
+ return hyperlink(web_url, "🔗") if web_url else ""
173
+
174
+ # Apply the function to the "Logs" column
175
+ pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
176
+
177
+ return pretty_df, pretty_tag_map
178
+
179
+ # Fallback for unexpected types
180
+ return pd.DataFrame(), {}
181
+
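# Editorial sketch, not part of this commit: downstream code (including the detailed view
# below) assumes the "pretty" DataFrame uses per-benchmark columns named "<Benchmark> Score"
# and "<Benchmark> Cost" alongside identity columns such as "Agent", "Submitter", "Date",
# and "Logs". The benchmark name and values here are purely illustrative.
def _example_pretty_frame() -> pd.DataFrame:
    return pd.DataFrame(
        {
            "Agent": ["example-agent"],
            "Submitter": ["example-org"],
            "Date": ["2025-01-01"],
            "Example Benchmark Score": [0.42],
            "Example Benchmark Cost": [1.23],
            "Logs": [""],
        }
    )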
182
+ # --- Detailed Benchmark Display ---
183
+ def create_benchmark_details_display(
184
+ full_df: pd.DataFrame,
185
+ tag_map: dict,
186
+ category_name: str
187
+ ):
188
+ """
189
+ Generates a detailed breakdown for each benchmark within a given category.
190
+ For each benchmark, it creates a title, a filtered table, and a scatter plot.
191
+ Args:
192
+ full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
193
+ tag_map (dict): The "pretty" tag map to find the list of benchmarks.
194
+ category_name (str): The main category to display details for (e.g., "Literature Understanding").
195
+ """
196
+ # 1. Get the list of benchmarks for the selected category
197
+ benchmark_names = tag_map.get(category_name, [])
198
+
199
+ if not benchmark_names:
200
+ gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
201
+ return
202
+
203
+ gr.Markdown("---")
204
+ gr.Markdown("## Detailed Benchmark Results")
205
+
206
+ # 2. Loop through each benchmark and create its UI components
207
+ for benchmark_name in benchmark_names:
208
+ with gr.Group():
209
+ gr.Markdown(f"### {benchmark_name}")
210
+
211
+ # 3. Prepare the data for this specific benchmark's table and plot
212
+ benchmark_score_col = f"{benchmark_name} Score"
213
+ benchmark_cost_col = f"{benchmark_name} Cost"
214
+
215
+ # Define the columns needed for the detailed table
216
+ table_cols = ['Agent', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col, 'Logs']
217
+
218
+ # Filter to only columns that actually exist in the full dataframe
219
+ existing_table_cols = [col for col in table_cols if col in full_df.columns]
220
+
221
+ if benchmark_score_col not in existing_table_cols:
222
+ gr.Markdown(f"Score data for {benchmark_name} not available.")
223
+ continue # Skip to the next benchmark if score is missing
224
+
225
+ # Create a specific DataFrame for the table view
226
+ benchmark_table_df = full_df[existing_table_cols].copy()
227
+ # Calculate and add the "Attempted Benchmark" column
228
+ def check_benchmark_status(row):
229
+ has_score = pd.notna(row.get(benchmark_score_col))
230
+ has_cost = pd.notna(row.get(benchmark_cost_col))
231
+
232
+ if has_score and has_cost:
233
+ return "✅"
234
+ if has_score or has_cost:
235
+ return "⚠️"
236
+ return "🚫"
237
+
238
+ # Apply the function to create the new column
239
+ benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
240
+ # Sort the DataFrame
241
+ if benchmark_score_col in benchmark_table_df.columns:
242
+ benchmark_table_df = benchmark_table_df.sort_values(
243
+ by=benchmark_score_col, ascending=False, na_position='last'
244
+ )
245
+ # Format the cost and score columns
246
+ benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
247
+ benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
248
+ desired_cols_in_order = [
249
+ 'Agent',
250
+ 'Submitter',
251
+ 'Attempted Benchmark',
252
+ benchmark_score_col,
253
+ benchmark_cost_col,
254
+ 'Openness',
255
+ 'Degree of Control',
256
+ 'Date',
257
+ 'Logs'
258
+ ]
259
+ for col in desired_cols_in_order:
260
+ if col not in benchmark_table_df.columns:
261
+ benchmark_table_df[col] = pd.NA # Add as an empty column
262
+ benchmark_table_df = benchmark_table_df[desired_cols_in_order]
263
+ # Rename columns for a cleaner table display
264
+ benchmark_table_df.rename(columns={
265
+ benchmark_score_col: 'Score',
266
+ benchmark_cost_col: 'Cost'
267
+ }, inplace=True)
268
+ # Map each column to a display datatype so formatted values and links render correctly
269
+ table_headers = benchmark_table_df.columns.tolist()
270
+ # Render the Logs, Cost, and Score columns as markdown; everything else as plain strings.
271
+ df_datatypes = [
272
+ "markdown" if col in ["Logs", "Cost", "Score"] else "str"
273
+ for col in table_headers
274
+ ]
275
+
276
+ # Create the Gradio component, now with the correct datatypes
277
+ gr.DataFrame(
278
+ value=benchmark_table_df,
279
+ datatype=df_datatypes,
280
+ interactive=False,
281
+ wrap=True,
282
+ )
283
+
284
+ # Create the scatter plot using the full data for context, but plotting benchmark metrics
285
+ # This shows all agents on the same axis for better comparison.
286
+ benchmark_plot = _plot_scatter_plotly(
287
+ data=full_df,
288
+ x=benchmark_cost_col,
289
+ y=benchmark_score_col,
290
+ agent_col="Agent"
291
+ )
292
+ gr.Plot(value=benchmark_plot)
293
+ gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
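# Editorial sketch, not part of this commit: a standalone illustration of wiring the overall
# table and the per-benchmark breakdown together for one split; in app.py these calls would
# sit inside the existing gr.Tab context. "Overall" and "validation" follow the names used above.
def _example_split_page(split: str = "validation") -> None:
    full_df, tag_map = get_full_leaderboard_data(split)
    if full_df.empty:
        return
    with gr.Blocks():
        create_leaderboard_display(
            full_df=full_df,
            tag_map=tag_map,
            category_name="Overall",
            split_name=split,
        )
        create_benchmark_details_display(
            full_df=full_df,
            tag_map=tag_map,
            category_name="Overall",
        )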