Amber Tanaka
committed on
Asta Leaderboard First Draft (#3)
- Ai2_logo_pink_padding_RGB.png +0 -0
- app.py +83 -623
- c_and_e.py +54 -0
- content.py +66 -2
- data/1.0.0-dev1/agenteval.json +332 -0
- data_analysis.py +54 -0
- e2e.py +54 -0
- json_leaderboard.py +485 -0
- leaderboard_transformer.py +436 -0
- leaderboard_viewer.py +319 -0
- literature_understanding.py +54 -0
- main_page.py +375 -0
- requirements.txt +2 -2
- ui_components.py +293 -0
Ai2_logo_pink_padding_RGB.png
ADDED
app.py
CHANGED
@@ -1,644 +1,104 @@
-
-
-Modeled after the GAIA huggingface leaderboard app.
-
-"""
-
-import json
 import os
-import shutil
-import tarfile
-import tempfile
-from datetime import datetime, timedelta, timezone
-from email.utils import parseaddr
-from pathlib import Path
-from zoneinfo import ZoneInfo

-import gradio as gr
-import numpy as np
-import pandas as pd
-import requests
-from agenteval import (
-    compute_summary_statistics,
-    process_eval_logs,
-    upload_folder_to_hf,
-    upload_summary_to_hf,
-)
-from agenteval.models import EvalResult
-from agenteval.upload import sanitize_path_component
 from apscheduler.schedulers.background import BackgroundScheduler
-from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
-from datasets.data_files import EmptyDatasetError
 from huggingface_hub import HfApi

-from content import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    INTRODUCTION_TEXT,
-    SUBMISSION_TEXT,
-    TITLE,
-    format_error,
-    format_log,
-    format_warning,
-    hf_uri_to_web_url,
-    hyperlink,
-)

-#
 LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
-
-
-CONFIG_NAME = "1.0.0-dev1"
-
 IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
-
 OWNER = "allenai"
 PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
-SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"  # all raw and scored submissions (val and test)
-SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"  # copy scored val submissions (public for transparency - not used for rendering leaderboard)
-CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
-RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results"  # just the summary score statistics (val and test), to be displayed on the leaderboard
 LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
-
-if LOCAL_DEBUG:
-    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
-else:
-    DATA_DIR = "/home/user/data/" + CONFIG_NAME
-EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
-
 api = HfApi()
…
-    local_df = eval_results.get(split)
-    # return default if split is missing or contains no records
-    if local_df is None or len(local_df) == 0:
-        default_raw_cols = [
-            "agent_name",
-            "agent_description",
-            "username",
-            "submit_time",
-        ]
-        pretty_cols = [pretty_column_name(c) for c in default_raw_cols]
-        return pd.DataFrame({col: ["No data"] for col in pretty_cols})
-
-    # Use the first suite_config for all rows
-    # because the suite_config should not change given a single CONFIG_NAME
-    first_suite_config = None
-    if len(local_df) > 0:
-        first_suite_config = EvalResult.model_validate(local_df[0]).suite_config
-
-    def extract_scores(eval_res: EvalResult) -> dict[str, float | None]:
-        summary_stats = compute_summary_statistics(
-            suite_config=first_suite_config,
-            split=split,
-            results=eval_res.results,
-        )
-
-        values: dict[str, float | None] = {}
-        for key in summary_stats:
-            if key == "overall":
-                values["overall/score"] = summary_stats[key].score
-                values["overall/cost"] = summary_stats[key].cost
-            elif key.startswith("tag/"):
-                tag = key.split("/")[1]
-                values[f"tag/{tag}/score"] = summary_stats[key].score
-                values[f"tag/{tag}/cost"] = summary_stats[key].cost
-            elif key.startswith("task/"):
-                task = key.split("/")[1]
-                values[f"task/{task}/score"] = summary_stats[key].score
-                values[f"task/{task}/score_stderr"] = summary_stats[key].score_stderr
-                values[f"task/{task}/cost"] = summary_stats[key].cost
-                values[f"task/{task}/cost_stderr"] = summary_stats[key].cost_stderr
-        return values
-
-    def format_row(row) -> dict[str, float | str | None]:
-        eval_res = EvalResult.model_validate(row)
-        sub = eval_res.submission
-        sub.submit_time = sub.submit_time or datetime(1970, 1, 1, 0, 0, 0)
-        data = {
-            "submit_time": sub.submit_time.astimezone(ZoneInfo("US/Pacific")).strftime(
-                "%Y-%m-%d"
-            ),
-            "agent_name": (
-                hyperlink(sub.agent_url, sub.agent_name)
-                if sub.agent_url
-                else sub.agent_name
-            ),
-            "agent_description": sub.agent_description or "",
-            "username": sub.username or "",
-            **extract_scores(eval_res),
-            "logs_url": (
-                hyperlink(
-                    hf_uri_to_web_url(
-                        sub.logs_url if IS_INTERNAL else sub.logs_url_public
-                    ),
-                    "🔗",
-                )
-                if (sub.logs_url or sub.logs_url_public)
-                else ""
-            ),
-        }
-        return data
-
-    local_df = local_df.map(format_row)
-
-    df = pd.DataFrame(local_df)
-
-    # Multiply score, cost, and stderr values by 100 and round to 1 decimal
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    df[numeric_cols] = df[numeric_cols].multiply(100).round(1)
-
-    # Build column order on raw names, then rename via pretty_column_name
-    all_cols = df.columns.tolist()
-    base = ["agent_name", "agent_description", "username"]
-    overall = ["overall/score", "overall/cost"]
-    tags = sorted({c.split("/")[1] for c in all_cols if c.startswith("tag/")})
-    tasks = sorted({c.split("/")[1] for c in all_cols if c.startswith("task/")})
-    rest = ["submit_time", "logs_url"]
-    column_order = (
-        base
-        + overall
-        + [col for tag in tags for col in (f"tag/{tag}/score", f"tag/{tag}/cost")]
-        + [
-            col
-            for t in tasks
-            for col in (
-                f"task/{t}/score",
-                f"task/{t}/score_stderr",
-                f"task/{t}/cost",
-                f"task/{t}/cost_stderr",
-            )
-        ]
-        + rest
-    )
-    df = df.reindex(columns=[c for c in column_order if c in all_cols])
-    # sort by overall score (descending)
-    df = df.sort_values(by=["overall/score"], ascending=False)
-    # apply all renames via pretty_column_name
-    orig_cols = df.columns.tolist()
-    df.columns = [pretty_column_name(col) for col in orig_cols]
-
-    # blank out any null/NaN cells
-    df = df.fillna("")
-
-    return df
-
-
-def load_and_format_dataframes():
-    eval_results = try_load_dataset(
-        RESULTS_DATASET,
-        CONFIG_NAME,
-        download_mode="force_redownload",
-        verification_mode=VerificationMode.NO_CHECKS,
-        trust_remote_code=True,
-    )
-    eval_dataframe_val = get_dataframe_from_results(
-        eval_results=eval_results, split="validation"
-    )
-    eval_dataframe_test = get_dataframe_from_results(
-        eval_results=eval_results, split="test"
-    )
-    return eval_results, eval_dataframe_val, eval_dataframe_test
-
-
-# Display the results
-eval_results, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
-
-
-def restart_space():
-    api.restart_space(repo_id=LEADERBOARD_PATH)
-
-
-def checked_upload_folder(
-    api,
-    folder_path: str,
-    repo_id: str,
-    config_name: str,
-    split: str,
-    submission_name: str,
-) -> str:
-    """Upload with inline size check; raises ValueError if too large."""
-    total = 0
-    for root, _, files in os.walk(folder_path):
-        for f in files:
-            total += os.path.getsize(os.path.join(root, f))
-    if total > MAX_UPLOAD_BYTES:
-        raise ValueError(
-            f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
-        )
-    # NOTE: This function raises ValueError if unsafe characters are found in the path.
-    return upload_folder_to_hf(
-        api=api,
-        folder_path=folder_path,
-        repo_id=repo_id,
-        config_name=config_name,
-        split=split,
-        submission_name=submission_name,
-    )
-
-
-def add_new_eval(
-    val_or_test: str,
-    agent_name: str | None,
-    agent_description: str,
-    agent_url: str,
-    path_to_file: tempfile._TemporaryFileWrapper | None,
-    username: str,
-    mail: str,
-    profile: gr.OAuthProfile,
-):
-    # default username if none provided
-    if not username or username.strip() == "":
-        username = profile.username
-
-    if not agent_name:
-        return format_warning("Please provide an agent name.")
-
-    submission_time = datetime.now(timezone.utc)
-
-    # Was the profile created less than 2 month ago?
-    user_data = requests.get(
-        f"https://huggingface.co/api/users/{profile.username}/overview"
-    )
-    creation_date = json.loads(user_data.content)["createdAt"]
-
-    created_at = datetime.strptime(creation_date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
-        tzinfo=timezone.utc
-    )
-    if submission_time - created_at < timedelta(days=60):
-        return format_error("This account is not authorized to submit here.")
-
-    contact_infos = try_load_dataset(
-        CONTACT_DATASET,
-        CONFIG_NAME,
-        download_mode="force_redownload",
-        verification_mode=VerificationMode.NO_CHECKS,
-        trust_remote_code=True,
-    )
-    user_submission_dates = sorted(
-        datetime.fromisoformat(row["submit_time"])
-        for row in contact_infos.get(val_or_test, [])
-        if row["username_auth"] == profile.username
-    )
-    if len(user_submission_dates) > 0 and abs(
-        submission_time - user_submission_dates[-1]
-    ) < timedelta(seconds=24 * 60 * 60):
-        return format_error(
-            "You already submitted once in the last 24h; please try again later."
-        )
-
-    is_validation = val_or_test == "validation"
-
-    # Very basic email parsing
-    _, parsed_mail = parseaddr(mail)
-    if "@" not in parsed_mail:
-        return format_warning("Please provide a valid email adress.")
-
-    # Check duplicate submissions by inspecting the nested "submission" dicts
-    if val_or_test in eval_results and len(eval_results[val_or_test]) > 0:
-        existing = eval_results[val_or_test]
-        subs = existing.to_dict().get("submission", [])
-        names = {item.get("agent_name", "").lower() for item in subs}
-        users = {item.get("username", "").lower() for item in subs}
-        if agent_name.lower() in names and username.lower() in users:
-            return format_warning("This agent has been already submitted.")
-
-    if path_to_file is None:
-        return format_warning("Please attach a file.")
-
-    # sanitize username and agent_name for filesystem
-    safe_username = sanitize_path_component(username)
-    safe_agent_name = sanitize_path_component(agent_name)
-
-    extracted_dir = os.path.join(
-        EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}"
-    )
-
-    if LOCAL_DEBUG:
-        print("mock extracted file", flush=True)
-    else:
-        try:
-            # 1) remove old extraction if present
-            if os.path.exists(extracted_dir):
-                shutil.rmtree(extracted_dir)
-            os.makedirs(extracted_dir, exist_ok=True)
-
-            # 2) securely extract only regular files, flatten structure
-            # Flatten structure to aid finding the manifest agenteval.json file
-            # and because hierarchical structure is not needed
-            with tarfile.open(path_to_file.name, "r:gz") as tar:
-                for member in tar.getmembers():
-                    if not member.isreg():
-                        continue
-                    fname = os.path.basename(member.name)
-                    # skip empty or hidden
-                    if not fname or fname.startswith("."):
-                        continue
-                    fobj = tar.extractfile(member)
-                    if not fobj:
-                        continue
-                    target = os.path.join(extracted_dir, fname)
-                    with open(target, "wb") as out:
-                        out.write(fobj.read())
-
-            # 3) ensure something was extracted
-            if not os.listdir(extracted_dir):
-                return format_error("Submission tarball is empty or invalid.")
-
-        except Exception as e:
-            return format_error(
-                f"Error while extracting the file: {e}. Be sure to upload a valid .tar.gz file."
-            )
-
-    submission_name = (
-        f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d')}"
     )

…
-    except ValueError as e:
-        return format_error(str(e))
-
-    # SAVE CONTACT
-    contact_info = {
-        "agent_name": agent_name,
-        "agent_description": agent_description,
-        "url": agent_url,
-        "username": username,
-        "username_auth": profile.username,
-        "mail": mail,
-        "submit_time": submission_time.isoformat(),
-    }
-    # add or init contact dataset for this split
-    if val_or_test in contact_infos:
-        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
-    else:
-        contact_infos[val_or_test] = Dataset.from_list([contact_info])
-    if LOCAL_DEBUG:
-        print("mock uploaded contact info", flush=True)
-    else:
-        contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
-
     try:
…
-            return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME}")
-        raw = json_path.read_text(encoding="utf-8")
-        eval_result = EvalResult.model_validate_json(raw)
-        if eval_result.suite_config.version != CONFIG_NAME:
-            return format_error(
-                f"Error: submitted suite version {eval_result.suite_config.version} "
-                f"does not match currently accepted version {CONFIG_NAME}"
-            )
-        if eval_result.split != val_or_test:
-            return format_error(
-                f"Error: uploaded split {eval_result.split} does not match selected split {val_or_test}"
-            )
-
-        # NOTE: Trusting user-computed scores, but re-computing the derived results based on the log files
-        eval_result.results = process_eval_logs(extracted_dir)[0]
-        eval_result.save_json(str(json_path))
-
     except Exception as e:
…

-    # # SAVE SCORED SUBMISSION
-    if LOCAL_DEBUG:
-        print("mock uploaded scored submission")
-    else:
-        try:
-            logs_url_private = checked_upload_folder(
-                api=api,
-                folder_path=extracted_dir,
-                repo_id=SUBMISSION_DATASET,
-                config_name=CONFIG_NAME,
-                split=val_or_test,
-                submission_name=f"{submission_name}_scored",
-            )
-        except ValueError as e:
-            return format_error(str(e))
-
-    # Validation submissions are public for public leaderboard
-    if is_validation and not IS_INTERNAL:
-        try:
-            logs_url_public = checked_upload_folder(
-                api=api,
-                folder_path=extracted_dir,
-                repo_id=SUBMISSION_DATASET_PUBLIC,
-                config_name=CONFIG_NAME,
-                split=val_or_test,
-                submission_name=f"{submission_name}_scored",
-            )
-        except ValueError as e:
-            return format_error(str(e))
-    else:
-        logs_url_public = None
-
-    eval_result.submission.agent_name = agent_name
-    eval_result.submission.agent_description = agent_description
-    eval_result.submission.agent_url = agent_url
-    eval_result.submission.username = username
-    eval_result.submission.submit_time = submission_time
-    eval_result.submission.logs_url = logs_url_private
-    eval_result.submission.logs_url_public = logs_url_public

     if LOCAL_DEBUG:
-        print("
     else:
…
-            config_name=CONFIG_NAME,
-            split=val_or_test,
-            submission_name=f"{submission_name}_scored",
-        )
-
-    return format_log(
-        f"Agent {agent_name} submitted by {username} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed."
-    )
-
-
-def refresh():
-    _, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
-    return eval_dataframe_val, eval_dataframe_test
-
-
-# Determine column types dynamically based on dataframe columns
-def compute_column_types(df):
-    col_types = []
-    for col in df.columns:
-        if col == "Agent":
-            col_types.append("markdown")
-        elif col in ["Agent description", "User/organization", "Submission date"]:
-            col_types.append("str")
-        elif col == "Logs":
-            col_types.append("markdown")
-        else:
-            col_types.append("number")
-    return col_types
-
-
-test_col_types = compute_column_types(eval_dataframe_test)
-val_col_types = compute_column_types(eval_dataframe_val)
-
-demo = gr.Blocks()
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                elem_id="citation-button",
-            )  # .style(show_copy_button=True)
-
-    leaderboard_table_test = gr.Dataframe(
-        value=eval_dataframe_test,
-        headers=list(eval_dataframe_test.columns),
-        datatype=test_col_types,
-        interactive=False,
-        column_widths=["20%"],
-        render=False,
-    )
-
-    leaderboard_table_val = gr.Dataframe(
-        value=eval_dataframe_val,
-        headers=list(eval_dataframe_val.columns),
-        datatype=val_col_types,
-        interactive=False,
-        column_widths=["20%"],
-        render=False,
-    )
-
-    # Build tab layout list based on desired order
-    tabs = [
-        ("Results: Test", leaderboard_table_test),
-        ("Results: Validation", leaderboard_table_val),
-    ]
-
-    if IS_INTERNAL:
-        tabs = [tabs[1], tabs[0]]  # Validation first for internal users
-
-    # Render the tabs in desired order
-    for label, component in tabs:
-        with gr.Tab(label):
-            component.render()
-
-    refresh_button = gr.Button("Refresh")
-    refresh_button.click(
-        refresh,
-        inputs=[],
-        outputs=[
-            leaderboard_table_val,
-            leaderboard_table_test,
-        ],
-    )
-    with gr.Accordion("Submit a new agent for evaluation"):
-        with gr.Row():
-            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
-        with gr.Row():
-            with gr.Column():
-                level_of_test = gr.Radio(
-                    ["validation", "test"], value="validation", label="Split"
-                )
-                agent_name_textbox = gr.Textbox(label="Agent name")
-                agent_description_textbox = gr.Textbox(label="Agent description")
-                agent_url_textbox = gr.Textbox(label="Url to agent information")
-            with gr.Column():
-                username = gr.Textbox(
-                    label="Organization or user name (defaults to your HF username)",
-                    placeholder="Leave blank to use your HF username",
-                )
-                mail = gr.Textbox(
-                    label="Contact email (will be stored privately, & used if there is an issue with your submission)"
-                )
-                file_output = gr.File()
-
-        with gr.Row():
-            gr.LoginButton()
-            submit_button = gr.Button("Submit Eval")
-        submission_result = gr.Markdown()
-        submit_button.click(
-            add_new_eval,
-            [
-                level_of_test,
-                agent_name_textbox,
-                agent_description_textbox,
-                agent_url_textbox,
-                file_output,
-                username,
-                mail,
-            ],
-            submission_result,
-        )

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=3600)
-scheduler.start()
-if LOCAL_DEBUG:
-    demo.launch(debug=True)
-else:
-    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
+# app.py
+import gradio as gr
 import os

 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
+import literature_understanding, main_page, c_and_e, data_analysis, e2e

+from content import TITLE, css

+# --- Constants and Configuration ---
 LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
 IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
 OWNER = "allenai"
 PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
 LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
 api = HfApi()
+LOGO_PATH = "Ai2_logo_pink_padding_RGB.png"
+
+
+
+# --- Theme Definition ---
+theme = gr.themes.Base(
+    primary_hue=gr.themes.Color(c100="#CFF5E8", c200="#B7EFDD", c300="#9FEAD1", c400="#87E5C5", c50="#E7FAF3", c500="#6FE0BA", c600="#57DBAF", c700="#3FD5A3", c800="#27D09C", c900="#0FCB8C", c950="#0fcb8c"),
+    secondary_hue=gr.themes.Color(c100="#FCDCEB", c200="#FBCBE1", c300="#F9BAD7", c400="#F7A8CD", c50="#FDEEF5", c500="#F697C4", c600="#F586BA", c700="#F375B0", c800="#F263A6", c900="#F0529C", c950="#F0529C"),
+    neutral_hue=gr.themes.Color(c100="#FDF9F4", c200="#C9C9C3", c300="#B0B5AF", c400="#97A09C", c50="#FAF2E9", c500="#7F8C89", c600="#667876", c700="#344F4F", c800="#1C3A3C", c900="#032629", c950="032629"),
+    font=[gr.themes.GoogleFont('Manrope'), 'ui-sans-serif', 'sans-serif', 'sans-serif'],
+    font_mono=[gr.themes.GoogleFont('Roboto Mono'), 'ui-monospace', 'monospace', 'monospace'],
+).set(
+    body_text_color='*neutral_950',
+    body_text_color_dark='*neutral_50',
+    background_fill_primary='*neutral_50',
+    background_fill_primary_dark='*neutral_900',
+    background_fill_secondary='*neutral_100',
+    background_fill_secondary_dark='*neutral_800',
+    border_color_accent='*secondary_900',
+    border_color_accent_subdued='*neutral_400',
+    border_color_accent_subdued_dark='*neutral_400',
+    color_accent='*primary_900',
+    color_accent_soft='*neutral_200',
+    color_accent_soft_dark='*neutral_800',
+    link_text_color='*secondary_900',
+    link_text_color_dark='*primary_900',
+    link_text_color_active_dark='*primary_600',
+    link_text_color_hover_dark='*primary_700',
+    link_text_color_visited_dark='*primary_600',
+    table_even_background_fill='*neutral_100',
+    table_even_background_fill_dark='*neutral_800',
+    button_primary_background_fill='*secondary_900',
+    button_primary_background_fill_dark='*primary_900',
+    button_primary_background_fill_hover='*secondary_600',
+    button_primary_background_fill_hover_dark='*primary_600',
+    button_primary_text_color='*neutral_900',
+    button_primary_text_color_dark='*neutral_900'
+)
+# --- Gradio App Definition ---
+demo = gr.Blocks(theme=theme, css=css)
+with demo:
+    gr.Image(
+        value=LOGO_PATH,
+        show_label=False,
+        interactive=False,
+        container=False,
+        show_download_button=False,
+        show_fullscreen_button=False,
+        elem_id="logo-image"
     )
+    gr.HTML(TITLE)

+    main_page.demo.render()
+    with demo.route("Literature Understanding"):
+        literature_understanding.demo.render()
+    with demo.route("Code & Execution"):
+        c_and_e.demo.render()
+    with demo.route("Data Analysis"):
+        data_analysis.demo.render()
+    with demo.route("Discovery"):
+        e2e.demo.render()
+
+# --- Scheduler and Launch
+def restart_space_job():
+    print("Scheduler: Attempting to restart space.")
     try:
+        api.restart_space(repo_id=LEADERBOARD_PATH)
+        print("Scheduler: Space restart request sent.")
     except Exception as e:
+        print(f"Scheduler: Error restarting space: {e}")
+scheduler = BackgroundScheduler(timezone="UTC")
+scheduler.add_job(restart_space_job, "interval", hours=1)
+scheduler.start()


+# Launch the Gradio app
+if __name__ == "__main__":
     if LOCAL_DEBUG:
+        print("Launching in LOCAL_DEBUG mode.")
+        def get_initial_global_tag_choices(): return ["Overall", "TagA"]
+        demo.launch(debug=True)
     else:
+        print("Launching in Space mode.")
+        # For Spaces, share=False is typical unless specific tunneling is needed.
+        # debug=True can be set to False for a "production" Space.
+        demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False)

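The new app.py composes the leaderboard as a Gradio multipage app: each category module defines its own gr.Blocks, and the top-level Blocks mounts it under demo.route(...). The sketch below is a minimal, self-contained illustration of that pattern with a toy page (it is not the Space's actual code) and assumes Gradio 5+, where Blocks.route() and Blocks.render() are available.

```python
# Minimal sketch of the multipage pattern used by the new app.py (assumes Gradio 5+).
import gradio as gr

# Stand-in for a category module such as c_and_e.py or data_analysis.py.
with gr.Blocks() as category_page:
    gr.Markdown("## Example Category Leaderboard Results")

with gr.Blocks() as app:
    gr.Markdown("# Main page")           # content shown on the root route
    with app.route("Example Category"):  # adds a page with its own URL and nav entry
        category_page.render()           # renders the imported Blocks inside this page

if __name__ == "__main__":
    app.launch()
```

The committed app.py follows the same shape, rendering main_page at the root and the four category modules under their own routes.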
c_and_e.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import pandas as pd

# Import our UI factories and the data loader
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data

# Define the category for this page
CATEGORY_NAME = "Code Execution"

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")

    # --- This page now has two main sections: Validation and Test ---
    with gr.Tabs():
        with gr.Tab("Results: Validation"):
            # 1. Load all necessary data for the "validation" split ONCE.
            validation_df, validation_tag_map = get_full_leaderboard_data("validation")

            if not validation_df.empty:
                # 2. Render the main category display using the loaded data.
                create_leaderboard_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="validation"
                )

                # 3. Render the detailed breakdown for each benchmark in the category.
                create_benchmark_details_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for validation split.")

        with gr.Tab("Results: Test"):
            # Repeat the process for the "test" split
            test_df, test_tag_map = get_full_leaderboard_data("test")

            if not test_df.empty:
                create_leaderboard_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="test"
                )
                create_benchmark_details_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for test split.")
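c_and_e.py, data_analysis.py, and e2e.py are identical except for CATEGORY_NAME. The sketch below shows how the same page could be produced by a single factory function; this is a possible refactor, not part of the commit, and it calls the ui_components helpers with exactly the keyword arguments the committed pages use.

```python
# Sketch of a shared page factory for the three duplicated category modules (not in the commit).
import gradio as gr
from ui_components import (
    create_leaderboard_display,
    create_benchmark_details_display,
    get_full_leaderboard_data,
)

def build_category_page(category_name: str) -> gr.Blocks:
    """Build a Validation/Test tabbed leaderboard page for one category."""
    with gr.Blocks() as page:
        gr.Markdown(f"## {category_name} Leaderboard Results")
        with gr.Tabs():
            for split in ("validation", "test"):
                with gr.Tab(f"Results: {split.capitalize()}"):
                    df, tag_map = get_full_leaderboard_data(split)
                    if not df.empty:
                        create_leaderboard_display(
                            full_df=df, tag_map=tag_map,
                            category_name=category_name, split_name=split,
                        )
                        create_benchmark_details_display(
                            full_df=df, tag_map=tag_map, category_name=category_name,
                        )
                    else:
                        gr.Markdown(f"No data available for {split} split.")
    return page

# e.g. c_and_e.py would reduce to:
demo = build_category_page("Code Execution")
```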
content.py
CHANGED
@@ -3,6 +3,14 @@ TITLE = """<h1 align="center" id="space-title">AstaBench Leaderboard</h1>"""
 INTRODUCTION_TEXT = """
 ## Introduction
 """
+INTRO_PARAGRAPH = """
+AI agents are on the rise, promising everything from travel planning to scientific discovery. But evaluating them—especially for real-world research tasks—remains a messy, inconsistent process. Metrics vary, cost is often ignored, and scientific use cases are rarely the focus. <br>
+<br>
+Enter AstaBench, a grand challenge benchmark developed by Ai2 to test how well agentic AI systems perform on scientific tasks that actually matter. As part of the Asta initiative, AstaBench spans ten multi-step benchmarks covering literature review, data analysis, code execution, and complex decision-making. It brings standardization and transparency to agent evaluation, with statistical confidence reporting, and a leaderboard that highlights tradeoffs between accuracy and computational cost.
+"""
+SCATTER_DISCLAIMER = """
+Only agents that have cost data available will be shown in the scatter plot. If you don't see your agent, please ensure that you have provided cost data in your submission.
+"""

 SUBMISSION_TEXT = """
 ## Submissions
@@ -32,8 +40,11 @@
     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"


-def hyperlink(
-
+def hyperlink(link_url: str, text: str = "🔗") -> str:
+    if not link_url or not isinstance(link_url, str):
+        return str(text)  # Or simply "" if link_url is bad
+    # Using a simpler style here for broad compatibility, your original style is fine too.
+    return f'<a target="_blank" href="{link_url}">{text}</a>'


 def hf_uri_to_web_url(uri: str) -> str:
@@ -53,3 +64,56 @@

     namespace, repo, path = parts
     return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
+
+css = """
+.submission-accordion {
+  border-style: solid;
+  border-width: 3px !important;
+  border-color: #ec4899;
+}
+.submission-accordion span.svelte-1w6vloh {
+  font-weight: bold !important;
+  font-size: 1.2em !important;
+}
+#logo-image {
+  margin: auto;
+  max-width: 250px;
+  height: auto;
+}
+.table-component{
+  height: auto !important;
+  max-height: none !important;
+}
+
+.table-wrap {
+  max-height: none !important;
+  height: auto !important;
+  overflow-y: visible !important;
+}
+/* --- New Rules for Table Density --- */
+table.gr-table th, table.gr-table td {
+  padding: 4px 4px !important;
+  width: 1%;
+  white-space: nowrap;
+}
+
+table.gr-table {
+  font-size: 14px !important;
+}
+
+/* Example of making the "Agent" column (the 1st column) a bit wider if needed */
+table.gr-table th:nth-child(1),
+table.gr-table td:nth-child(1) {
+  min-width: 150px !important;
+  white-space: normal !important; /* Allow agent names to wrap if long */
+}
+.html-container {
+  padding-top: 0 !important;
+}
+#scatter-disclaimer {
+  color: #f0529c !important;
+}
+thead.svelte-1e98i6s th {
+  background: white !important;
+}
+"""
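A small usage sketch for the helpers touched in this content.py diff: hyperlink() wraps a URL in an anchor tag for table cells, and hf_uri_to_web_url() turns an hf:// dataset URI into a browsable Hub URL. The URI below is illustrative rather than a real submission, and the commented outputs are approximate, based on the return statements visible in the diff.

```python
# Illustrative use of the content.py helpers (example URI, not a real submission).
from content import hyperlink, hf_uri_to_web_url

logs_uri = "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/example_agent"
web_url = hf_uri_to_web_url(logs_uri)
# roughly: "https://huggingface.co/datasets/allenai/asta-bench-submissions/tree/main/1.0.0-dev1/validation/example_agent"
cell = hyperlink(web_url, "🔗")
# -> '<a target="_blank" href="...">🔗</a>', suitable for a markdown table cell
print(cell)
```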
data/1.0.0-dev1/agenteval.json
ADDED
@@ -0,0 +1,332 @@
{
  "suite_config": {
    "name": "asta-bench",
    "version": "1.0.0-dev1",
    "splits": [
      {
        "name": "validation",
        "tasks": [
          {"name": "arxivdigestables_validation", "path": "astabench/arxivdigestables_validation", "primary_metric": "score_tables/mean", "tags": ["lit"]},
          {"name": "sqa_dev", "path": "astabench/sqa_dev", "primary_metric": "global_avg/mean", "tags": ["lit"]},
          {"name": "litqa2_validation", "path": "astabench/litqa2_validation", "primary_metric": "is_correct/accuracy", "tags": ["lit"]},
          {"name": "paper_finder_validation", "path": "astabench/paper_finder_validation", "primary_metric": "score_paper_finder/macro_avg", "tags": ["lit"]},
          {"name": "discoverybench_validation", "path": "astabench/discoverybench_validation", "primary_metric": "score_discoverybench/mean", "tags": ["data"]},
          {"name": "core_bench_validation", "path": "astabench/core_bench_validation", "primary_metric": "evaluate_task_questions/accuracy", "tags": ["code"]},
          {"name": "ds1000_validation", "path": "astabench/ds1000_validation", "primary_metric": "ds1000_scorer/accuracy", "tags": ["code"]},
          {"name": "e2e_discovery_validation", "path": "astabench/e2e_discovery_validation", "primary_metric": "score_rubric/accuracy", "tags": ["discovery"]},
          {"name": "super_validation", "path": "astabench/super_validation", "primary_metric": "check_super_execution/entrypoints", "tags": ["code"]}
        ]
      },
      {
        "name": "test",
        "tasks": [
          {"name": "paper_finder_test", "path": "astabench/paper_finder_test", "primary_metric": "score_paper_finder/macro_avg", "tags": ["lit"]},
          {"name": "sqa_test", "path": "astabench/sqa_test", "primary_metric": "global_avg/mean", "tags": ["lit"]},
          {"name": "arxivdigestables_test", "path": "astabench/arxivdigestables_test", "primary_metric": "score_tables/mean", "tags": ["lit"]},
          {"name": "litqa2_test", "path": "astabench/litqa2_test", "primary_metric": "is_correct/accuracy", "tags": ["lit"]},
          {"name": "discoverybench_test", "path": "astabench/discoverybench_test", "primary_metric": "score_discoverybench/mean", "tags": ["data"]},
          {"name": "core_bench_test", "path": "astabench/core_bench_test", "primary_metric": "evaluate_task_questions/accuracy", "tags": ["code"]},
          {"name": "ds1000_test", "path": "astabench/ds1000_test", "primary_metric": "ds1000_scorer/accuracy", "tags": ["code"]},
          {"name": "e2e_discovery_test", "path": "astabench/e2e_discovery_test", "primary_metric": "score_rubric/accuracy", "tags": ["discovery"]},
          {"name": "super_test", "path": "astabench/super_test", "primary_metric": "check_super_execution/entrypoints", "tags": ["code"]}
        ]
      }
    ]
  },
  "split": "validation",
  "results": [
    {
      "task_name": "sqa_dev",
      "metrics": [
        {"name": "global_avg/mean", "value": 0.6215245045241414},
        {"name": "global_avg/stderr", "value": 0.02088486499225903},
        {"name": "ingredient_recall/mean", "value": 0.6029178145087237},
        {"name": "ingredient_recall/stderr", "value": 0.026215888361291618},
        {"name": "answer_precision/mean", "value": 0.7960436785436785},
        {"name": "answer_precision/stderr", "value": 0.027692773517249983},
        {"name": "citation_precision/mean", "value": 0.697849041353826},
        {"name": "citation_precision/stderr", "value": 0.026784164936602798},
        {"name": "citation_recall/mean", "value": 0.3892874836903378},
        {"name": "citation_recall/stderr", "value": 0.015094770200171756}
      ],
      "model_costs": [
        1.3829150000000001, 0.9759700000000001, 2.2324650000000004, 0.76631, 0.9277900000000001,
        2.6388600000000006, 0.8114100000000002, 2.3263174999999996, 2.5423725, 1.2398675000000001,
        1.7387300000000003, 1.2176599999999997, 0.564655, 0.9726750000000001, 0.7675700000000001,
        1.5198850000000002, 1.4726625000000002, 2.1937650000000004, 0.6907700000000001, 1.39835,
        1.2598175, 2.5373550000000002, 2.19239, 1.2508875000000006, 2.2650550000000007,
        1.6047725, 0.6525125000000003, 1.4262200000000003, 1.0533299999999999, 1.7252375,
        1.407145, 1.5408700000000004, 2.8073224999999993, 1.0448125000000006, 1.7037300000000004,
        0.8650500000000001, 1.0171225000000002, 0.5697925000000001, 2.7851025, 1.0551425,
        2.9213775, 1.7772975000000004, 1.2753225000000001, 0.8108325000000001, 0.6958375000000001,
        0.8840950000000003, 1.2028724999999998, 1.2490475000000003, 2.4272, 1.95026,
        1.5352475, 2.11181, 2.3612249999999997, 1.8619225000000004, 0.7431075000000001,
        1.5189675000000002, 1.089575, 1.6103700000000003, 1.4201450000000002, 2.397835,
        1.469175, 1.0723550000000004, 0.7964050000000003, 3.3733175, 4.197085,
        4.2637675, 1.2982124999999998, 0.66146, 1.1130475000000002, 2.4393974999999997,
        2.582, 1.7381725000000001, 0.415025, 1.6777325, 1.0507825000000002,
        2.4627125000000003, 1.017005, 1.9210250000000002, 1.5009025000000003, 0.8283125000000001,
        2.9854425, 0.4633375000000001, 0.397685, 1.2803425, 3.0388200000000003,
        1.2610875000000004, 1.798365, 3.427287500000001, 0.29307750000000005, 0.37101249999999997,
        2.8046925000000003, 0.35557000000000005, 3.5481700000000007, 1.1073975, 1.5280825,
        1.1714900000000001, 3.1791275000000003, 3.8214725000000005, 1.8440275, 1.730515,
        1.9350675000000002, 1.6592125000000002, 1.9227124999999998, 1.202885, 1.2688150000000002,
        0.8819875000000001, 0.6989325, 1.965635, 1.7467800000000002, 1.6940625000000002
      ]
    }
  ],
  "submission": {
    "submit_time": "2025-06-09T20:55:35.869831Z",
    "username": "miked-ai",
    "agent_name": "Basic ReAct",
    "agent_description": null,
    "agent_url": null,
    "logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35",
    "logs_url_public": null,
    "summary_url": null
  }
}
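The manifest above is what the new viewer code consumes. Below is a minimal sketch of loading and validating it with the agenteval models, mirroring what DataTransformer.__init__ in json_leaderboard.py does; it assumes the EvalResult pydantic model exposes suite_config, split, results (with a task_name attribute on each entry), and submission as attributes, as the committed code uses them.

```python
# Minimal sketch: load and validate the committed manifest the same way json_leaderboard.py does.
import json
from agenteval.models import EvalResult

with open("data/1.0.0-dev1/agenteval.json", encoding="utf-8") as f:
    payload = json.load(f)

eval_result = EvalResult.model_validate(payload)    # pydantic validation of the manifest
print(eval_result.suite_config.version)             # "1.0.0-dev1"
print(eval_result.split)                            # "validation"
print([r.task_name for r in eval_result.results])   # tasks with recorded metrics, e.g. ["sqa_dev"]
print(eval_result.submission.agent_name)            # "Basic ReAct"
```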
data_analysis.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import pandas as pd

# Import our UI factories and the data loader
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data

# Define the category for this page
CATEGORY_NAME = "Data Analysis"

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")

    # --- This page now has two main sections: Validation and Test ---
    with gr.Tabs():
        with gr.Tab("Results: Validation"):
            # 1. Load all necessary data for the "validation" split ONCE.
            validation_df, validation_tag_map = get_full_leaderboard_data("validation")

            if not validation_df.empty:
                # 2. Render the main category display using the loaded data.
                create_leaderboard_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="validation"
                )

                # 3. Render the detailed breakdown for each benchmark in the category.
                create_benchmark_details_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for validation split.")

        with gr.Tab("Results: Test"):
            # Repeat the process for the "test" split
            test_df, test_tag_map = get_full_leaderboard_data("test")

            if not test_df.empty:
                create_leaderboard_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="test"
                )
                create_benchmark_details_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for test split.")
e2e.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import pandas as pd

# Import our UI factories and the data loader
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data

# Define the category for this page
CATEGORY_NAME = "Discovery"

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")

    # --- This page now has two main sections: Validation and Test ---
    with gr.Tabs():
        with gr.Tab("Results: Validation"):
            # 1. Load all necessary data for the "validation" split ONCE.
            validation_df, validation_tag_map = get_full_leaderboard_data("validation")

            if not validation_df.empty:
                # 2. Render the main category display using the loaded data.
                create_leaderboard_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="validation"
                )

                # 3. Render the detailed breakdown for each benchmark in the category.
                create_benchmark_details_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for validation split.")

        with gr.Tab("Results: Test"):
            # Repeat the process for the "test" split
            test_df, test_tag_map = get_full_leaderboard_data("test")

            if not test_df.empty:
                create_leaderboard_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="test"
                )
                create_benchmark_details_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for test split.")
json_leaderboard.py
ADDED
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from typing import Optional, Any, Dict # Added Dict
|
3 |
+
from zoneinfo import ZoneInfo
|
4 |
+
|
5 |
+
# datasets import might not be strictly needed by LeaderboardViewer itself anymore,
|
6 |
+
# but _get_dataframe might still use types from it if EvalResult refers to them.
|
7 |
+
# For now, let's keep it if your EvalResult or SuiteConfig models have dependencies.
|
8 |
+
# If not, it can be removed from here.
|
9 |
+
import datasets # Potentially removable from this file
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import plotly.express as px
|
12 |
+
import plotly.graph_objects as go
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
import seaborn as sns
|
16 |
+
import json # For loading the local JSON file
|
17 |
+
import os # For checking file existence
|
18 |
+
|
19 |
+
from agenteval import compute_summary_statistics
|
20 |
+
from agenteval.config import SuiteConfig
|
21 |
+
from agenteval.models import EvalResult
|
22 |
+
|
23 |
+
logger = logging.getLogger(__name__)
|
24 |
+
|
25 |
+
import logging
|
26 |
+
from typing import Optional, Any, Dict, List # Added List
|
27 |
+
from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
|
28 |
+
import json
|
29 |
+
import os
|
30 |
+
|
31 |
+
# Assuming these are correctly imported from your project
|
32 |
+
from agenteval.config import SuiteConfig
|
33 |
+
from agenteval.models import EvalResult
|
34 |
+
# from agenteval import compute_summary_statistics # Used by _get_dataframe
|
35 |
+
|
36 |
+
|
37 |
+
class DataTransformer:
|
38 |
+
"""
|
39 |
+
Load and visualize leaderboard from a single, local JSON result file.
|
40 |
+
"""
|
41 |
+
_INFORMAL_TO_FORMAL_NAME_MAP = {
|
42 |
+
"lit": "Literature Understanding",
|
43 |
+
"data": "Data Analysis",
|
44 |
+
"code": "Code Execution",
|
45 |
+
"discovery": "Discovery",
|
46 |
+
"arxivdigestables_validation": "Arxivdigestables Validation",
|
47 |
+
"sqa_dev": "Sqa Dev",
|
48 |
+
"litqa2_validation": "Litqa2 Validation",
|
49 |
+
"paper_finder_validation": "Paper Finder Validation",
|
50 |
+
"discoverybench_validation": "Discoverybench Validation",
|
51 |
+
"core_bench_validation": "Core Bench Validation",
|
52 |
+
"ds1000_validation": "DS1000 Validation",
|
53 |
+
"e2e_discovery_validation": "E2E Discovery Validation",
|
54 |
+
"super_validation": "Super Validation",
|
55 |
+
# Add any other raw names that can appear in task.name or task.tags
|
56 |
+
}
|
57 |
+
|
58 |
+
def __init__(
|
59 |
+
self,
|
60 |
+
json_file_path: str, # Mandatory: path to the local JSON file
|
61 |
+
split: str, # Still needed for context within the JSON's suite_config
|
62 |
+
is_internal: bool = False
|
63 |
+
):
|
64 |
+
self._json_file_path = json_file_path
|
65 |
+
self._split = split
|
66 |
+
self._internal = is_internal
|
67 |
+
self._loaded_json_data: Optional[Dict[str, Any]] = None
|
68 |
+
self._cfg: Optional[SuiteConfig] = None
|
69 |
+
|
70 |
+
logger.info(f"Initializing LeaderboardViewer with local JSON file: {self._json_file_path}")
|
71 |
+
|
72 |
+
# --- Load and Validate JSON data ---
|
73 |
+
if not os.path.exists(self._json_file_path):
|
74 |
+
raise FileNotFoundError(f"JSON file not found at path: {self._json_file_path}")
|
75 |
+
try:
|
76 |
+
with open(self._json_file_path, 'r', encoding='utf-8') as f:
|
77 |
+
self._loaded_json_data = json.load(f)
|
78 |
+
except json.JSONDecodeError as e:
|
79 |
+
raise ValueError(f"Failed to parse JSON from local file {self._json_file_path}: {e}")
|
80 |
+
except Exception as e:
|
81 |
+
raise ValueError(f"Error reading local file {self._json_file_path}: {e}")
|
82 |
+
|
83 |
+
if not self._loaded_json_data:
|
84 |
+
raise ValueError(f"No data loaded from JSON file {self._json_file_path}.")
|
85 |
+
|
86 |
+
try:
|
87 |
+
eval_result = EvalResult.model_validate(self._loaded_json_data)
|
88 |
+
except Exception as e:
|
89 |
+
raise ValueError(f"Failed to validate JSON data from file '{self._json_file_path}' against EvalResult model: {e}")
|
90 |
+
|
91 |
+
self._cfg = eval_result.suite_config
|
92 |
+
if not isinstance(self._cfg, SuiteConfig):
|
93 |
+
raise TypeError(f"self._cfg is not a SuiteConfig object after loading from '{self._json_file_path}', got {type(self._cfg)}.")
|
94 |
+
|
95 |
+
# --- Populate Tag Map (Corrected Placement and Helper Function Access) ---
|
96 |
+
self.tag_map: dict[str, list[str]] = {}
|
97 |
+
|
98 |
+
# Access tasks from the loaded config
|
99 |
+
tasks_for_split: List[Any] = self._cfg.get_tasks(self._split) # Assuming get_tasks returns a list of task-like objects
|
100 |
+
|
101 |
+
for task in tasks_for_split:
|
102 |
+
# Ensure task object has 'name' and 'tags' attributes
|
103 |
+
if not hasattr(task, 'name') or not hasattr(task, 'tags'):
|
104 |
+
logger.warning(f"Task object {task} is missing 'name' or 'tags' attribute. Skipping.")
|
105 |
+
continue
|
106 |
+
|
107 |
+
formal_task_display_name = self._get_formal_display_name_static(task.name) # Use the helper method
|
108 |
+
|
109 |
+
if not (task.tags or []):
|
110 |
+
continue
|
111 |
+
|
112 |
+
for raw_tag_name in task.tags:
|
113 |
+
formal_tag_display_name_key = self._get_formal_display_name_static(raw_tag_name)
|
114 |
+
|
115 |
+
self.tag_map.setdefault(formal_tag_display_name_key, []).append(formal_task_display_name)
|
116 |
+
|
117 |
+
for key in self.tag_map:
|
118 |
+
self.tag_map[key] = sorted(list(set(self.tag_map[key])))
|
119 |
+
|
120 |
+
# --- Helper function defined as a static method or regular method ---
|
121 |
+
# Option 1: Static method (doesn't need 'self', uses the class attribute)
|
122 |
+
@staticmethod
|
123 |
+
def _get_formal_display_name_static(raw_name: str) -> str:
|
124 |
+
"""
|
125 |
+
Helper function to get the formal display name for a raw tag or task name.
|
126 |
+
Uses the class's map and provides a fallback.
|
127 |
+
"""
|
128 |
+
return DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP.get(raw_name, raw_name.replace("_", " ").title())
|
129 |
+
|
130 |
+
def _load(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
|
131 |
+
"""
|
132 |
+
Prepares the DataFrame from the loaded JSON data.
|
133 |
+
The JSON data is already loaded and validated in __init__.
|
134 |
+
"""
|
135 |
+
if self._loaded_json_data is None or self._cfg is None:
|
136 |
+
# This should not happen if __init__ completed successfully
|
137 |
+
raise RuntimeError("LeaderboardViewer2 not properly initialized. JSON data or SuiteConfig is missing.")
|
138 |
+
|
139 |
+
# The _get_dataframe function expects a list of records.
|
140 |
+
# Since we have a single JSON file representing one result, wrap it in a list.
|
141 |
+
records_list: list[dict] = [self._loaded_json_data]
|
142 |
+
|
143 |
+
overview_df = _get_dataframe(
|
144 |
+
records_list=records_list,
|
145 |
+
split=self._split,
|
146 |
+
is_internal=self._internal,
|
147 |
+
suite_config=self._cfg, # Pass the SuiteConfig loaded in __init__
|
148 |
+
)
|
149 |
+
return overview_df, self.tag_map
|
150 |
+
|
151 |
+
# --- view method remains the same as your last version ---
|
152 |
+
def view(
|
153 |
+
self,
|
154 |
+
tag: Optional[str] = None,
|
155 |
+
with_plots: bool = False,
|
156 |
+
use_plotly: bool = False,
|
157 |
+
) -> tuple[pd.DataFrame, dict[str, Any]]:
|
158 |
+
data, tag_map = self._load() # tag_map is also returned by _load now
|
159 |
+
print(f"AHAHASHJDBFGASJHDBJAHSDB,AHDB {tag_map}")
|
160 |
+
print(f"THIS IS THE DATA DATA DTAA {data.columns}")
|
161 |
+
if data.empty or (len(data) == 1 and data.iloc[0].get("Agent") == "No data"):
|
162 |
+
logger.warning("No data available to view. Returning empty DataFrame and plots.")
|
163 |
+
return data, {}
|
164 |
+
|
165 |
+
base_cols = ["Agent", "Submitter", "Date", "Logs"]
|
166 |
+
existing_cols = [col for col in base_cols if col in data.columns]
|
167 |
+
|
168 |
+
primary_score_col: str
|
169 |
+
group_metric_names: list[str]
|
170 |
+
|
171 |
+
if tag is None:
|
172 |
+
primary = "Overall"
|
173 |
+
group = list(tag_map.keys())
|
174 |
+
else:
|
175 |
+
primary = tag
|
176 |
+
group = tag_map.get(tag, [])
|
177 |
+
|
178 |
+
if f"{primary} Score" in data.columns:
|
179 |
+
data = data.sort_values(f"{primary} Score", ascending=False)
|
180 |
+
else:
|
181 |
+
logger.warning(f"Primary metric '{primary}' for sorting not found. Data will not be sorted by it.")
|
182 |
+
|
183 |
+
metrics_to_display = []
|
184 |
+
if f"{primary} Cost" in data.columns:
|
185 |
+
metrics_to_display.append(f"{primary} Cost")
|
186 |
+
if f"{primary} Score" in data.columns:
|
187 |
+
metrics_to_display.append(f"{primary} Score")
|
188 |
+
|
189 |
+
for g_item in group:
|
190 |
+
if g_item in data.columns:
|
191 |
+
metrics_to_display.append(g_item)
|
192 |
+
if f"{g_item} Cost" in data.columns:
|
193 |
+
metrics_to_display.append(f"{g_item} Cost")
|
194 |
+
if f"{g_item} Score" in data.columns:
|
195 |
+
metrics_to_display.append(f"{g_item} Score")
|
196 |
+
|
197 |
+
|
198 |
+
final_cols_to_display = existing_cols + [m for m in metrics_to_display if m in data.columns]
|
199 |
+
final_cols_to_display = sorted(list(set(final_cols_to_display)), key=final_cols_to_display.index)
|
200 |
+
|
201 |
+
df_view = data.loc[:, final_cols_to_display].reset_index(drop=True)
|
202 |
+
|
203 |
+
plots: dict[str, Any] = {}
|
204 |
+
if with_plots:
|
205 |
+
plot_metric_names = [primary] + [g_item for g_item in group if g_item in data.columns]
|
206 |
+
for metric_name in plot_metric_names:
|
207 |
+
score_col = f"{metric_name} Score"
|
208 |
+
cost_col = f"{metric_name} Cost"
|
209 |
+
if score_col in df_view.columns and cost_col in df_view.columns:
|
210 |
+
if use_plotly:
|
211 |
+
fig = _plot_scatter_plotly(df_view, x=cost_col, y=score_col, agent_col="Agent")
|
212 |
+
plots[f"scatter_{metric_name}"] = fig
|
213 |
+
else:
|
214 |
+
logger.warning(
|
215 |
+
f"Skipping plot for '{metric_name}': score column '{score_col}' or cost column '{cost_col}' not found."
|
216 |
+
)
|
217 |
+
return df_view, plots
|
218 |
+
|
219 |
+
|
220 |
+
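For orientation, the tag map built in `__init__` keys formal tag names to the formal task names that carry that tag, using `_INFORMAL_TO_FORMAL_NAME_MAP` for display names. A minimal sketch of the kind of structure it produces; the exact keys and values are an assumption and depend entirely on the suite config inside the loaded `agenteval.json`:

```python
# Hypothetical illustration only: actual contents depend on the suite config
# in the local agenteval.json passed to DataTransformer.
example_tag_map = {
    "Literature Understanding": [
        "Litqa2 Validation",
        "Paper Finder Validation",
        "Sqa Dev",
    ],
    "Data Analysis": ["Discoverybench Validation"],
}
# view(tag="Literature Understanding") would then sort by the
# "Literature Understanding Score" column and pull in the per-task
# Score/Cost columns for the tasks listed above.
```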
def _safe_round(value, digits=2):
|
221 |
+
return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
|
222 |
+
|
223 |
+
def _get_dataframe(
|
224 |
+
records_list: list[dict],
|
225 |
+
split: str,
|
226 |
+
is_internal: bool,
|
227 |
+
suite_config: SuiteConfig,
|
228 |
+
timezone: str = "US/Pacific",
|
229 |
+
) -> pd.DataFrame:
|
230 |
+
# This function remains the same as in the previous version you provided.
|
231 |
+
# It takes a list of records (which will be a list containing one item
|
232 |
+
# from the loaded JSON file) and processes it.
|
233 |
+
if not records_list:
|
234 |
+
logger.warning(f"No records provided to _get_dataframe for split '{split}'. Returning empty DataFrame with placeholder.")
|
235 |
+
expected_pretty_cols = ["Agent Name", "Submitter", "Date", "Overall Score", "Logs"]
|
236 |
+
empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
|
237 |
+
return empty_df
|
238 |
+
|
239 |
+
cfg = suite_config
|
240 |
+
|
241 |
+
rows = []
|
242 |
+
for itm_idx, itm in enumerate(records_list):
|
243 |
+
if not isinstance(itm, dict):
|
244 |
+
logger.warning(f"Item {itm_idx} in records_list is not a dict, skipping.")
|
245 |
+
continue
|
246 |
+
try:
|
247 |
+
ev = EvalResult.model_validate(itm)
|
248 |
+
except Exception as e:
|
249 |
+
logger.error(f"Failed to validate item {itm_idx} with EvalResult: {itm}. Error: {e}")
|
250 |
+
continue
|
251 |
+
|
252 |
+
sub = ev.submission
|
253 |
+
date_str = None
|
254 |
+
if sub.submit_time is not None:
|
255 |
+
submit_dt = sub.submit_time
|
256 |
+
if not isinstance(submit_dt, pd.Timestamp):
|
257 |
+
if submit_dt.tzinfo is None:
|
258 |
+
logger.debug(f"Submission time for {sub.agent_name} is timezone-naive, assuming UTC.")
|
259 |
+
submit_dt = submit_dt.replace(tzinfo=ZoneInfo("UTC"))
|
260 |
+
date_str = pd.Timestamp(submit_dt).tz_convert(ZoneInfo(timezone)).strftime("%Y-%m-%d")
|
261 |
+
else:
|
262 |
+
date_str = None
|
263 |
+
|
264 |
+
if not ev.results:
|
265 |
+
logger.warning(
|
266 |
+
f"Skipping submission {sub.agent_name} ({sub.username or 'N/A'}) "
|
267 |
+
f"({sub.submit_time or 'N/A'}) due to no results."
|
268 |
+
)
|
269 |
+
continue
|
270 |
+
stats = compute_summary_statistics(
|
271 |
+
suite_config=cfg, split=split, results=ev.results
|
272 |
+
)
|
273 |
+
flat = {}
|
274 |
+
print(f"STATS STATS ASTATAS SD T S T A A {stats}")
|
275 |
+
for key, s_obj in stats.items():
|
276 |
+
parts = key.split("/")
|
277 |
+
if parts[0] == "overall":
|
278 |
+
flat["overall/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
|
279 |
+
flat["overall/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
|
280 |
+
elif parts[0] == "tag" and len(parts) > 1:
|
281 |
+
tag_name = parts[1]
|
282 |
+
flat[f"tag/{tag_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
|
283 |
+
flat[f"tag/{tag_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
|
284 |
+
elif parts[0] == "task" and len(parts) > 1:
|
285 |
+
task_name = parts[1]
|
286 |
+
score = getattr(s_obj, 'score', np.nan)
|
287 |
+
cost = getattr(s_obj, 'cost', np.nan)
|
288 |
+
score_stderr = getattr(s_obj, 'score_stderr', np.nan)
|
289 |
+
cost_stderr = getattr(s_obj, 'cost_stderr', np.nan)
|
290 |
+
|
291 |
+
flat[f"task/{task_name}/score"] = _safe_round(score)
|
292 |
+
flat[f"task/{task_name}/score_ci"] = _safe_round(score_stderr * 1.96 if pd.notna(score_stderr) else np.nan)
|
293 |
+
flat[f"task/{task_name}/cost"] = _safe_round(cost)
|
294 |
+
flat[f"task/{task_name}/cost_ci"] = _safe_round(cost_stderr * 1.96 if pd.notna(cost_stderr) else np.nan)
|
295 |
+
else:
|
296 |
+
logger.debug(f"Uncommon key structure from compute_summary_statistics: '{key}'. Attempting generic add.")
|
297 |
+
if hasattr(s_obj, 'score'):
|
298 |
+
flat[f"{key}/score"] = _safe_round(s_obj.score)
|
299 |
+
if hasattr(s_obj, 'cost'):
|
300 |
+
flat[f"{key}/cost"] = _safe_round(s_obj.cost)
|
301 |
+
|
302 |
+
current_logs_url = None
|
303 |
+
if is_internal and sub.logs_url:
|
304 |
+
current_logs_url = str(sub.logs_url)
|
305 |
+
elif not is_internal and sub.logs_url_public:
|
306 |
+
current_logs_url = str(sub.logs_url_public)
|
307 |
+
|
308 |
+
rows.append(
|
309 |
+
{
|
310 |
+
"agent_name": sub.agent_name or "N/A",
|
311 |
+
"username": sub.username or "N/A",
|
312 |
+
"submit_time": date_str,
|
313 |
+
**flat,
|
314 |
+
"logs_url": current_logs_url,
|
315 |
+
}
|
316 |
+
)
|
317 |
+
|
318 |
+
if not rows:
|
319 |
+
logger.warning(f"No valid rows generated from records_list for split '{split}'. Returning empty DataFrame with placeholder.")
|
320 |
+
expected_pretty_cols = ["Agent", "Submitter", "Date", "Overall Score", "Overall Cost", "Logs"]
|
321 |
+
empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
|
322 |
+
return empty_df
|
323 |
+
|
324 |
+
df = pd.DataFrame(rows)
|
325 |
+
pretty_cols = {c: _pretty_column_name(c) for c in df.columns if c in df.columns}
|
326 |
+
overview = df.rename(columns=pretty_cols)
|
327 |
+
return overview
|
328 |
+
|
329 |
+
def _pretty_column_name(col: str) -> str:
|
330 |
+
"""Map raw column name to display name."""
|
331 |
+
# --- Step 1: Fixed, direct mappings ---
|
332 |
+
fixed_mappings = {
|
333 |
+
"submit_time": "Date",
|
334 |
+
"agent_name": "Agent",
|
335 |
+
"username": "Submitter",
|
336 |
+
"logs_url": "Logs",
|
337 |
+
"overall/score": "Overall Score",
|
338 |
+
"overall/cost": "Overall Cost",
|
339 |
+
}
|
340 |
+
if col in fixed_mappings:
|
341 |
+
return fixed_mappings[col]
|
342 |
+
|
343 |
+
# --- Step 2: Define your mapping for informal names to descriptive names ---
|
344 |
+
informal_map = DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP
|
345 |
+
|
346 |
+
# --- Step 3: Dynamic mappings for task or tag columns using the informal_to_formal_name_map ---
|
347 |
+
parts = col.split("/")
|
348 |
+
if len(parts) == 3:
|
349 |
+
item_type, informal_name, metric_suffix = parts #
|
350 |
+
|
351 |
+
formal_name = informal_map.get(informal_name)
|
352 |
+
if formal_name is None:
|
353 |
+
formal_name = informal_name.replace("_", " ").title()
|
354 |
+
print(f"[DEBUG _pretty_column_name] Informal name '{informal_name}' not in map, using fallback: '{formal_name}'")
|
355 |
+
|
356 |
+
if metric_suffix == "score":
|
357 |
+
return f"{formal_name} Score"
|
358 |
+
if metric_suffix == "cost":
|
359 |
+
return f"{formal_name} Cost"
|
360 |
+
if metric_suffix == "score_ci":
|
361 |
+
return f"{formal_name} Score 95% CI"
|
362 |
+
if metric_suffix == "cost_ci":
|
363 |
+
return f"{formal_name} Cost 95% CI"
|
364 |
+
|
365 |
+
# --- Step 4: Fallback for columns that don't match the "type/name/metric" pattern ---
|
366 |
+
if "/" not in col:
|
367 |
+
return col.replace("_", " ").title()
|
368 |
+
else:
|
369 |
+
return parts[-1].replace("_", " ").title()
|
370 |
+
|
371 |
+
DEFAULT_Y_COLUMN = "Overall Score"
|
372 |
+
DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 # Value to use if x-axis data (costs) is missing
|
373 |
+
|
374 |
+
def _plot_scatter_plotly(
|
375 |
+
data: pd.DataFrame,
|
376 |
+
x: Optional[str],
|
377 |
+
y: str,
|
378 |
+
agent_col: str = "Agent"
|
379 |
+
) -> go.Figure:
|
380 |
+
|
381 |
+
x_col_to_use = x
|
382 |
+
y_col_to_use = y
|
383 |
+
|
384 |
+
# 1. Check if y-column exists
|
385 |
+
if y_col_to_use not in data.columns:
|
386 |
+
logger.error(
|
387 |
+
f"y-axis column '{y_col_to_use}' MUST exist in DataFrame. "
|
388 |
+
f"Cannot generate plot. Available columns: {data.columns.tolist()}"
|
389 |
+
)
|
390 |
+
return go.Figure()
|
391 |
+
|
392 |
+
# 2. Check if agent_col exists
|
393 |
+
if agent_col not in data.columns:
|
394 |
+
logger.warning(
|
395 |
+
f"Agent column '{agent_col}' not found in DataFrame. "
|
396 |
+
f"Available columns: {data.columns.tolist()}. Returning empty figure."
|
397 |
+
)
|
398 |
+
return go.Figure()
|
399 |
+
|
400 |
+
# 3. Prepare data (make a copy, handle numeric conversion for y)
|
401 |
+
data_plot = data.copy()
|
402 |
+
try:
|
403 |
+
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
404 |
+
except Exception as e:
|
405 |
+
logger.error(f"Error converting y-column '{y_col_to_use}' to numeric: {e}. Returning empty figure.")
|
406 |
+
return go.Figure()
|
407 |
+
|
408 |
+
# 4. Handle x-column (costs)
|
409 |
+
x_axis_label = x_col_to_use if x_col_to_use else "Cost (Data N/A)" # Label for the x-axis
|
410 |
+
x_data_is_valid = False
|
411 |
+
|
412 |
+
if x_col_to_use and x_col_to_use in data_plot.columns:
|
413 |
+
try:
|
414 |
+
data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
|
415 |
+
# Check if there's any non-NaN data after coercion for x
|
416 |
+
if data_plot[x_col_to_use].notna().any():
|
417 |
+
x_data_is_valid = True
|
418 |
+
else:
|
419 |
+
logger.info(f"x-axis column '{x_col_to_use}' exists but contains all NaN/None values after numeric conversion.")
|
420 |
+
except Exception as e:
|
421 |
+
logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}. Will use dummy x-values.")
|
422 |
+
# x_data_is_valid remains False
|
423 |
+
else:
|
424 |
+
if x_col_to_use: # Name was provided but column doesn't exist
|
425 |
+
logger.warning(f"x-axis column '{x_col_to_use}' not found in DataFrame.")
|
426 |
+
else: # x (column name) was None
|
427 |
+
logger.info("x-axis column name was not provided (is None).")
|
428 |
+
|
429 |
+
if not x_data_is_valid:
|
430 |
+
logger.info(f"Using dummy x-value '{DUMMY_X_VALUE_FOR_MISSING_COSTS}' for all data points as x-data is missing or invalid.")
|
431 |
+
# Create a new column with the dummy x-value for all rows
|
432 |
+
# Use a unique name for this dummy column to avoid potential clashes
|
433 |
+
dummy_x_col_name = "__dummy_x_for_plotting__"
|
434 |
+
data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
|
435 |
+
x_col_to_use = dummy_x_col_name # Update x_col_to_use to point to our dummy data
|
436 |
+
x_axis_label = x if x else "Cost (Data N/A)" # Use original x name for label if provided
|
437 |
+
# or a generic label if x was None.
|
438 |
+
# Could also be f"Cost (Fixed at {DUMMY_X_VALUE_FOR_MISSING_COSTS})"
|
439 |
+
|
440 |
+
|
441 |
+
# 5. Drop rows where y is NaN (x is now guaranteed to have values, either real or dummy)
|
442 |
+
data_plot.dropna(subset=[y_col_to_use], inplace=True)
|
443 |
+
|
444 |
+
fig = go.Figure()
|
445 |
+
|
446 |
+
if data_plot.empty:
|
447 |
+
logger.warning(f"No valid data to plot for y='{y_col_to_use}' (and x='{x_col_to_use}') after cleaning NaNs from y.")
|
448 |
+
# Still return a figure object, but it will be empty. Update layout for clarity.
|
449 |
+
fig.update_layout(
|
450 |
+
title=f"{y_col_to_use} vs. {x_axis_label} (No Data)",
|
451 |
+
xaxis=dict(title=x_axis_label, range=[DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1] if not x_data_is_valid else None),
|
452 |
+
yaxis=dict(title=y_col_to_use)
|
453 |
+
)
|
454 |
+
return fig
|
455 |
+
|
456 |
+
|
457 |
+
for agent, group in data_plot.groupby(agent_col):
|
458 |
+
hover_x_display = "%{x:.2f}" if x_data_is_valid else str(DUMMY_X_VALUE_FOR_MISSING_COSTS) + " (fixed)"
|
459 |
+
fig.add_trace(go.Scatter(
|
460 |
+
x=group[x_col_to_use],
|
461 |
+
y=group[y_col_to_use],
|
462 |
+
mode='markers',
|
463 |
+
name=str(agent),
|
464 |
+
hovertemplate=f"{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}<extra>{str(agent)}</extra>",
|
465 |
+
marker=dict(size=10)
|
466 |
+
))
|
467 |
+
|
468 |
+
# Configure layout
|
469 |
+
xaxis_config = dict(title=x_axis_label)
|
470 |
+
if not x_data_is_valid: # If using dummy x, set a tighter, fixed range for x-axis
|
471 |
+
xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
|
472 |
+
xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS] # Show only one tick at the dummy value
|
473 |
+
xaxis_config['ticktext'] = [str(DUMMY_X_VALUE_FOR_MISSING_COSTS)]
|
474 |
+
else: # Real x-data
|
475 |
+
xaxis_config['rangemode'] = "tozero"
|
476 |
+
|
477 |
+
|
478 |
+
fig.update_layout(
|
479 |
+
title=f"{y_col_to_use} vs. {x_axis_label}",
|
480 |
+
xaxis=xaxis_config,
|
481 |
+
yaxis=dict(title=y_col_to_use, rangemode="tozero"),
|
482 |
+
legend_title_text=agent_col
|
483 |
+
)
|
484 |
+
|
485 |
+
return fig
|
leaderboard_transformer.py
ADDED
@@ -0,0 +1,436 @@
1 |
+
import plotly.graph_objects as go
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import logging
|
5 |
+
from typing import Optional, Any, Dict, List # Added List
|
6 |
+
from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
INFORMAL_TO_FORMAL_NAME_MAP = {
|
13 |
+
# Short Names
|
14 |
+
"lit": "Literature Understanding",
|
15 |
+
"data": "Data Analysis",
|
16 |
+
"code": "Code Execution",
|
17 |
+
"discovery": "Discovery",
|
18 |
+
|
19 |
+
# Long Raw Names
|
20 |
+
"arxivdigestables_validation": "Arxivdigestables Validation",
|
21 |
+
"sqa_dev": "Sqa Dev",
|
22 |
+
"litqa2_validation": "Litqa2 Validation",
|
23 |
+
"paper_finder_validation": "Paper Finder Validation",
|
24 |
+
"discoverybench_validation": "Discoverybench Validation",
|
25 |
+
"core_bench_validation": "Core Bench Validation",
|
26 |
+
"ds1000_validation": "DS1000 Validation",
|
27 |
+
"e2e_discovery_validation": "E2E Discovery Validation",
|
28 |
+
"super_validation": "Super Validation",
|
29 |
+
}
|
30 |
+
|
31 |
+
|
32 |
+
# --- Helper functions ---
|
33 |
+
|
34 |
+
def _safe_round(value, digits=2):
|
35 |
+
"""Rounds a number if it's a valid float/int, otherwise returns it as is."""
|
36 |
+
return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
|
37 |
+
|
38 |
+
|
39 |
+
def _pretty_column_name(raw_col: str) -> str:
|
40 |
+
"""
|
41 |
+
Takes a raw column name from the DataFrame and returns a "pretty" version.
|
42 |
+
Handles three cases:
|
43 |
+
1. Fixed names (e.g., 'User/organization' -> 'Submitter').
|
44 |
+
2. Dynamic names (e.g., 'ds1000_validation score' -> 'DS1000 Validation Score').
|
45 |
+
3. Fallback for any other names.
|
46 |
+
"""
|
47 |
+
# Case 1: Handle fixed, special-case mappings first.
|
48 |
+
fixed_mappings = {
|
49 |
+
'Agent': 'Agent',
|
50 |
+
'Agent description': 'Agent Description',
|
51 |
+
'User/organization': 'Submitter',
|
52 |
+
'Submission date': 'Date',
|
53 |
+
'Overall': 'Overall Score',
|
54 |
+
'Overall cost': 'Overall Cost',
|
55 |
+
'Logs': 'Logs'
|
56 |
+
}
|
57 |
+
if raw_col in fixed_mappings:
|
58 |
+
return fixed_mappings[raw_col]
|
59 |
+
|
60 |
+
# Case 2: Handle dynamic names by finding the longest matching base name.
|
61 |
+
# We sort by length (desc) to match 'core_bench_validation' before 'core_bench'.
|
62 |
+
sorted_base_names = sorted(INFORMAL_TO_FORMAL_NAME_MAP.keys(), key=len, reverse=True)
|
63 |
+
|
64 |
+
for base_name in sorted_base_names:
|
65 |
+
if raw_col.startswith(base_name):
|
66 |
+
formal_name = INFORMAL_TO_FORMAL_NAME_MAP[base_name]
|
67 |
+
|
68 |
+
# Get the metric part (e.g., ' score' or ' cost 95% CI')
|
69 |
+
metric_part = raw_col[len(base_name):].strip()
|
70 |
+
|
71 |
+
# Capitalize the metric part correctly (e.g., 'score' -> 'Score')
|
72 |
+
pretty_metric = metric_part.capitalize()
|
73 |
+
|
74 |
+
return f"{formal_name} {pretty_metric}"
|
75 |
+
|
76 |
+
# Case 3: If no specific rule applies, just make it title case.
|
77 |
+
return raw_col.title()
|
78 |
+
|
79 |
+
|
80 |
+
def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
|
81 |
+
"""
|
82 |
+
Converts a tag map with raw names into a tag map with pretty, formal names.
|
83 |
+
|
84 |
+
Args:
|
85 |
+
raw_tag_map: The map with raw keys and values (e.g., {'lit': ['litqa2_validation']}).
|
86 |
+
name_map: The INFORMAL_TO_FORMAL_NAME_MAP used for translation.
|
87 |
+
|
88 |
+
Returns:
|
89 |
+
A new dictionary with pretty names (e.g., {'Literature Understanding': ['Litqa2 Validation']}).
|
90 |
+
"""
|
91 |
+
pretty_map = {}
|
92 |
+
# A reverse map to find raw keys from formal names if needed, though not used here
|
93 |
+
# This is just for understanding; the main logic uses the forward map.
|
94 |
+
|
95 |
+
# Helper to get pretty name with a fallback
|
96 |
+
def get_pretty(raw_name):
|
97 |
+
return name_map.get(raw_name, raw_name.replace("_", " ").title())
|
98 |
+
|
99 |
+
for raw_key, raw_value_list in raw_tag_map.items():
|
100 |
+
pretty_key = get_pretty(raw_key)
|
101 |
+
pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
|
102 |
+
pretty_map[pretty_key] = sorted(list(set(pretty_value_list)))
|
103 |
+
|
104 |
+
return pretty_map
|
105 |
+
|
106 |
+
|
107 |
+
def transform_raw_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
|
108 |
+
"""
|
109 |
+
Transforms a raw leaderboard DataFrame into a presentation-ready format.
|
110 |
+
|
111 |
+
This function performs two main actions:
|
112 |
+
1. Rounds all numeric metric values (columns containing 'score' or 'cost').
|
113 |
+
2. Renames all columns to a "pretty", human-readable format.
|
114 |
+
Args:
|
115 |
+
raw_df (pd.DataFrame): The DataFrame with raw data and column names
|
116 |
+
like 'agent_name', 'overall/score', 'tag/code/cost'.
|
117 |
+
Returns:
|
118 |
+
pd.DataFrame: A new DataFrame ready for display.
|
119 |
+
"""
|
120 |
+
if not isinstance(raw_df, pd.DataFrame):
|
121 |
+
raise TypeError("Input 'raw_df' must be a pandas DataFrame.")
|
122 |
+
|
123 |
+
df = raw_df.copy()
|
124 |
+
|
125 |
+
# Create the mapping for pretty column names
|
126 |
+
pretty_cols_map = {col: _pretty_column_name(col) for col in df.columns}
|
127 |
+
|
128 |
+
# Rename the columns and return the new DataFrame
|
129 |
+
transformed_df = df.rename(columns=pretty_cols_map)
|
130 |
+
# Apply safe rounding to all metric columns
|
131 |
+
for col in transformed_df.columns:
|
132 |
+
if 'Score' in col or 'Cost' in col:
|
133 |
+
transformed_df[col] = transformed_df[col].apply(_safe_round)
|
134 |
+
|
135 |
+
logger.info("Raw DataFrame transformed: numbers rounded and columns renamed.")
|
136 |
+
return transformed_df
|
137 |
+
|
138 |
+
|
139 |
+
class DataTransformer:
|
140 |
+
"""
|
141 |
+
Visualizes a pre-processed leaderboard DataFrame.
|
142 |
+
|
143 |
+
This class takes a "pretty" DataFrame and a tag map, and provides
|
144 |
+
methods to view filtered versions of the data and generate plots.
|
145 |
+
"""
|
146 |
+
def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
|
147 |
+
"""
|
148 |
+
Initializes the viewer.
|
149 |
+
|
150 |
+
Args:
|
151 |
+
dataframe (pd.DataFrame): The presentation-ready leaderboard data.
|
152 |
+
tag_map (dict): A map of formal tag names to formal task names.
|
153 |
+
"""
|
154 |
+
if not isinstance(dataframe, pd.DataFrame):
|
155 |
+
raise TypeError("Input 'dataframe' must be a pandas DataFrame.")
|
156 |
+
if not isinstance(tag_map, dict):
|
157 |
+
raise TypeError("Input 'tag_map' must be a dictionary.")
|
158 |
+
|
159 |
+
self.data = dataframe
|
160 |
+
self.tag_map = tag_map
|
161 |
+
logger.info(f"DataTransformer initialized with a DataFrame of shape {self.data.shape}.")
|
162 |
+
|
163 |
+
|
164 |
+
def view(
|
165 |
+
self,
|
166 |
+
tag: Optional[str] = "Overall", # Default to "Overall" for clarity
|
167 |
+
use_plotly: bool = False,
|
168 |
+
) -> tuple[pd.DataFrame, dict[str, go.Figure]]:
|
169 |
+
"""
|
170 |
+
Generates a filtered view of the DataFrame and a corresponding scatter plot.
|
171 |
+
"""
|
172 |
+
if self.data.empty:
|
173 |
+
logger.warning("No data available to view.")
|
174 |
+
return self.data, {}
|
175 |
+
|
176 |
+
# --- 1. Determine Primary and Group Metrics Based on the Tag ---
|
177 |
+
if tag is None or tag == "Overall":
|
178 |
+
primary_metric = "Overall"
|
179 |
+
group_metrics = list(self.tag_map.keys())
|
180 |
+
else:
|
181 |
+
primary_metric = tag
|
182 |
+
# For a specific tag, the group is its list of sub-tasks.
|
183 |
+
group_metrics = self.tag_map.get(tag, [])
|
184 |
+
|
185 |
+
# --- 2. Sort the DataFrame by the Primary Score ---
|
186 |
+
primary_score_col = f"{primary_metric} Score"
|
187 |
+
df_sorted = self.data
|
188 |
+
if primary_score_col in self.data.columns:
|
189 |
+
df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')
|
190 |
+
|
191 |
+
# --- 3. Build the List of Columns to Display ---
|
192 |
+
base_cols = ["Agent", "Submitter"]
|
193 |
+
new_cols = ["Openness", "Degree of Control"]
|
194 |
+
ending_cols = ["Date", "Logs"]
|
195 |
+
|
196 |
+
# Start with the primary metric score and cost
|
197 |
+
metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
|
198 |
+
|
199 |
+
# Add the score and cost for each item in our group
|
200 |
+
for item in group_metrics:
|
201 |
+
metrics_to_display.append(f"{item} Score")
|
202 |
+
metrics_to_display.append(f"{item} Cost")
|
203 |
+
|
204 |
+
# Combine base columns with metric columns, ensuring uniqueness and order
|
205 |
+
final_cols_ordered = base_cols + list(dict.fromkeys(metrics_to_display))+ new_cols + ending_cols
|
206 |
+
|
207 |
+
# Filter to only include columns that actually exist in our DataFrame
|
208 |
+
df_view = df_sorted.copy()
|
209 |
+
for col in final_cols_ordered:
|
210 |
+
if col not in df_view.columns:
|
211 |
+
df_view[col] = pd.NA
|
212 |
+
|
213 |
+
df_view = df_view[final_cols_ordered].reset_index(drop=True)
|
214 |
+
|
215 |
+
# Calculate and add the "Categories Attempted" column
|
216 |
+
if primary_metric == "Overall":
|
217 |
+
def calculate_attempted(row):
|
218 |
+
main_categories = ['Literature Understanding', 'Data Analysis', 'Code Execution', 'Discovery']
|
219 |
+
count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
|
220 |
+
|
221 |
+
# Return the formatted string with the correct emoji
|
222 |
+
if count == 4:
|
223 |
+
return f"4/4 ✅"
|
224 |
+
if count == 0:
|
225 |
+
return f"0/4 🚫"
|
226 |
+
return f"{count}/4 ⚠️"
|
227 |
+
|
228 |
+
# Apply the function row-wise to create the new column
|
229 |
+
attempted_column = df_view.apply(calculate_attempted, axis=1)
|
230 |
+
# Insert the new column at a nice position (e.g., after "Date")
|
231 |
+
df_view.insert(2, "Categories Attempted", attempted_column)
|
232 |
+
else:
|
233 |
+
total_benchmarks = len(group_metrics)
|
234 |
+
def calculate_benchmarks_attempted(row):
|
235 |
+
# Count how many benchmarks in this category have COST data reported
|
236 |
+
count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
|
237 |
+
if count == total_benchmarks:
|
238 |
+
return f"{count}/{total_benchmarks} ✅"
|
239 |
+
elif count == 0:
|
240 |
+
return f"{count}/{total_benchmarks} 🚫"
|
241 |
+
else:
|
242 |
+
return f"{count}/{total_benchmarks}⚠️"
|
243 |
+
# Insert the new column, for example, after "Date"
|
244 |
+
df_view.insert(2, "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
|
245 |
+
|
246 |
+
|
247 |
+
# --- 4. Generate the Scatter Plot for the Primary Metric ---
|
248 |
+
plots: dict[str, go.Figure] = {}
|
249 |
+
if use_plotly:
|
250 |
+
primary_cost_col = f"{primary_metric} Cost"
|
251 |
+
# Check if the primary score and cost columns exist in the FINAL view
|
252 |
+
if primary_score_col in df_view.columns and primary_cost_col in df_view.columns:
|
253 |
+
fig = _plot_scatter_plotly(
|
254 |
+
data=df_view,
|
255 |
+
x=primary_cost_col,
|
256 |
+
y=primary_score_col,
|
257 |
+
agent_col="Agent"
|
258 |
+
)
|
259 |
+
# Use a consistent key for easy retrieval later
|
260 |
+
plots['scatter_plot'] = fig
|
261 |
+
else:
|
262 |
+
logger.warning(
|
263 |
+
f"Skipping plot for '{primary_metric}': score column '{primary_score_col}' "
|
264 |
+
f"or cost column '{primary_cost_col}' not found."
|
265 |
+
)
|
266 |
+
# Add an empty figure to avoid downstream errors
|
267 |
+
plots['scatter_plot'] = go.Figure()
|
268 |
+
return df_view, plots
|
269 |
+
|
270 |
+
DEFAULT_Y_COLUMN = "Overall Score"
|
271 |
+
DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
|
272 |
+
|
273 |
+
def _plot_scatter_plotly(
|
274 |
+
data: pd.DataFrame,
|
275 |
+
x: Optional[str],
|
276 |
+
y: str,
|
277 |
+
agent_col: str = "Agent"
|
278 |
+
) -> go.Figure:
|
279 |
+
|
280 |
+
# --- Steps 1-4: Data Validation and Preparation ---
|
281 |
+
x_col_to_use = x
|
282 |
+
y_col_to_use = y
|
283 |
+
|
284 |
+
if y_col_to_use not in data.columns:
|
285 |
+
logger.error(f"y-axis column '{y_col_to_use}' not found.")
|
286 |
+
return go.Figure()
|
287 |
+
if agent_col not in data.columns:
|
288 |
+
logger.warning(f"Agent column '{agent_col}' not found.")
|
289 |
+
return go.Figure()
|
290 |
+
|
291 |
+
data_plot = data.copy()
|
292 |
+
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
293 |
+
|
294 |
+
x_axis_label = x if x else "Cost (Data N/A)"
|
295 |
+
x_data_is_valid = False
|
296 |
+
if x and x in data_plot.columns:
|
297 |
+
try:
|
298 |
+
data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
|
299 |
+
if data_plot[x_col_to_use].notna().any():
|
300 |
+
x_data_is_valid = True
|
301 |
+
except Exception as e:
|
302 |
+
logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}")
|
303 |
+
|
304 |
+
if not x_data_is_valid:
|
305 |
+
dummy_x_col_name = "__dummy_x_for_plotting__"
|
306 |
+
data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
|
307 |
+
x_col_to_use = dummy_x_col_name
|
308 |
+
logger.info("Using dummy x-values for plotting.")
|
309 |
+
|
310 |
+
# --- Step 5: Clean Data and Initialize Figure ---
|
311 |
+
data_plot.dropna(subset=[y_col_to_use, x_col_to_use], inplace=True)
|
312 |
+
fig = go.Figure()
|
313 |
+
if data_plot.empty:
|
314 |
+
logger.warning(f"No valid data to plot for y='{y_col_to_use}' and x='{x_col_to_use}'.")
|
315 |
+
return fig
|
316 |
+
|
317 |
+
# Step 6 - Calculate and Draw the Efficiency Frontier Line ---
|
318 |
+
if x_data_is_valid:
|
319 |
+
# Sort by cost (ascending), then by score (descending) to break ties
|
320 |
+
sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
|
321 |
+
|
322 |
+
frontier_points = []
|
323 |
+
max_score_so_far = float('-inf')
|
324 |
+
|
325 |
+
for index, row in sorted_data.iterrows():
|
326 |
+
score = row[y_col_to_use]
|
327 |
+
# If this point offers a better score than any we've seen before,
|
328 |
+
# it's part of the frontier.
|
329 |
+
if score > max_score_so_far:
|
330 |
+
frontier_points.append({'x': row[x_col_to_use], 'y': score})
|
331 |
+
max_score_so_far = score
|
332 |
+
|
333 |
+
# Add the frontier line trace to the plot if we found any points
|
334 |
+
if frontier_points:
|
335 |
+
frontier_df = pd.DataFrame(frontier_points)
|
336 |
+
fig.add_trace(go.Scatter(
|
337 |
+
x=frontier_df['x'],
|
338 |
+
y=frontier_df['y'],
|
339 |
+
mode='lines',
|
340 |
+
name='Efficiency Frontier',
|
341 |
+
line=dict(color='firebrick', width=2, dash='dash'),
|
342 |
+
hoverinfo='skip' # The line doesn't need a hover tooltip
|
343 |
+
))
|
344 |
+
|
345 |
+
# --- Step 7: Plot Individual Agent Markers (No changes here) ---
|
346 |
+
for agent, group in data_plot.groupby(agent_col):
|
347 |
+
hover_x_display = "%{x:.2f}" if x_data_is_valid else "N/A"
|
348 |
+
fig.add_trace(go.Scatter(
|
349 |
+
x=group[x_col_to_use],
|
350 |
+
y=group[y_col_to_use],
|
351 |
+
mode='markers',
|
352 |
+
name=str(agent),
|
353 |
+
hovertemplate=f"<b>{str(agent)}</b><br>{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}""<extra></extra>",
|
354 |
+
marker=dict(size=10, opacity=0.8)
|
355 |
+
))
|
356 |
+
|
357 |
+
# --- Step 8: Configure Layout (No changes here) ---
|
358 |
+
xaxis_config = dict(title=x_axis_label)
|
359 |
+
if not x_data_is_valid:
|
360 |
+
xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
|
361 |
+
xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS]
|
362 |
+
else:
|
363 |
+
xaxis_config['rangemode'] = "tozero"
|
364 |
+
|
365 |
+
fig.update_layout(
|
366 |
+
title=f"{y_col_to_use} vs. {x_axis_label}",
|
367 |
+
xaxis=xaxis_config,
|
368 |
+
yaxis=dict(title=y_col_to_use, rangemode="tozero"),
|
369 |
+
legend_title_text=agent_col
|
370 |
+
)
|
371 |
+
|
372 |
+
return fig
|
373 |
+
|
374 |
+
def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
|
375 |
+
"""
|
376 |
+
Applies custom formatting to a cost column based on its corresponding score column.
|
377 |
+
- If cost is not null, it remains unchanged.
|
378 |
+
- If cost is null but score is not, it becomes "Missing Cost".
|
379 |
+
- If both cost and score are null, it becomes "Not Attempted".
|
380 |
+
Args:
|
381 |
+
df: The DataFrame to modify.
|
382 |
+
cost_col_name: The name of the cost column to format (e.g., "Overall Cost").
|
383 |
+
Returns:
|
384 |
+
The DataFrame with the formatted cost column.
|
385 |
+
"""
|
386 |
+
# Find the corresponding score column by replacing "Cost" with "Score"
|
387 |
+
score_col_name = cost_col_name.replace("Cost", "Score")
|
388 |
+
|
389 |
+
# Ensure the score column actually exists to avoid errors
|
390 |
+
if score_col_name not in df.columns:
|
391 |
+
return df # Return the DataFrame unmodified if there's no matching score
|
392 |
+
|
393 |
+
def apply_formatting_logic(row):
|
394 |
+
cost_value = row[cost_col_name]
|
395 |
+
score_value = row[score_col_name]
|
396 |
+
status_color = "#ec4899"
|
397 |
+
|
398 |
+
if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
|
399 |
+
return f"${cost_value:.2f}"
|
400 |
+
elif pd.notna(score_value):
|
401 |
+
return f'<span style="color: {status_color};">Missing Cost</span>' # Score exists, but cost is missing
|
402 |
+
else:
|
403 |
+
return f'<span style="color: {status_color};">Not Attempted</span>' # Neither score nor cost exists
|
404 |
+
|
405 |
+
# Apply the logic to the specified cost column and update the DataFrame
|
406 |
+
df[cost_col_name] = df.apply(apply_formatting_logic, axis=1)
|
407 |
+
|
408 |
+
return df
|
409 |
+
|
410 |
+
def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
411 |
+
"""
|
412 |
+
Applies custom formatting to a score column for display.
|
413 |
+
- If a score is 0 or NaN, it's displayed as a colored "0".
|
414 |
+
- Other scores are formatted to two decimal places.
|
415 |
+
"""
|
416 |
+
status_color = "#ec4899" # The same color as your other status text
|
417 |
+
|
418 |
+
# First, fill any NaN values with 0 so we only have one case to handle.
|
419 |
+
# We must use reassignment to avoid the SettingWithCopyWarning.
|
420 |
+
df[score_col_name] = df[score_col_name].fillna(0)
|
421 |
+
|
422 |
+
def apply_formatting(score_value):
|
423 |
+
# Now, we just check if the value is 0.
|
424 |
+
if score_value == 0:
|
425 |
+
return f'<span style="color: {status_color};">0.0</span>'
|
426 |
+
|
427 |
+
# For all other numbers, format them for consistency.
|
428 |
+
if isinstance(score_value, (int, float)):
|
429 |
+
return f"{score_value:.2f}"
|
430 |
+
|
431 |
+
# Fallback for any unexpected non-numeric data
|
432 |
+
return score_value
|
433 |
+
|
434 |
+
# Apply the formatting and return the updated DataFrame
|
435 |
+
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
436 |
+
|
leaderboard_viewer.py
ADDED
@@ -0,0 +1,319 @@
1 |
+
"""
|
2 |
+
View and plot leaderboard results.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import logging
|
6 |
+
from typing import Optional
|
7 |
+
from zoneinfo import ZoneInfo
|
8 |
+
|
9 |
+
import datasets
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import numpy as np
|
12 |
+
import pandas as pd
|
13 |
+
import seaborn as sns
|
14 |
+
|
15 |
+
from agenteval import compute_summary_statistics
|
16 |
+
from agenteval.config import SuiteConfig
|
17 |
+
from agenteval.models import EvalResult
|
18 |
+
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class LeaderboardViewer:
|
23 |
+
"""
|
24 |
+
Load and visualize leaderboard for a given HF dataset split.
|
25 |
+
"""
|
26 |
+
|
27 |
+
def __init__(
|
28 |
+
self, repo_id: str, config: str, split: str, is_internal: bool = False
|
29 |
+
):
|
30 |
+
self._repo_id = repo_id
|
31 |
+
self._config = config
|
32 |
+
self._split = split
|
33 |
+
self._internal = is_internal
|
34 |
+
|
35 |
+
# build suite_config and mapping from tags to tasks from the first result
|
36 |
+
# TODO: Verify the sort order
|
37 |
+
ds = datasets.load_dataset(repo_id, name=config).get(split)
|
38 |
+
if not ds:
|
39 |
+
raise ValueError(f"Split '{split}' not found in dataset results")
|
40 |
+
suite = EvalResult.model_validate(ds[0]).suite_config
|
41 |
+
self._cfg = suite
|
42 |
+
self.tag_map: dict[str, list[str]] = {}
|
43 |
+
for task in suite.get_tasks(split):
|
44 |
+
for t in task.tags or []:
|
45 |
+
self.tag_map.setdefault(t, []).append(task.name)
|
46 |
+
|
47 |
+
def _load(self):
|
48 |
+
results = datasets.load_dataset(self._repo_id, name=self._config)
|
49 |
+
overview = _get_dataframe(
|
50 |
+
eval_results=results,
|
51 |
+
split=self._split,
|
52 |
+
is_internal=self._internal,
|
53 |
+
suite_config=self._cfg,
|
54 |
+
)
|
55 |
+
return overview, self.tag_map
|
56 |
+
|
57 |
+
def view(
|
58 |
+
self, tag: Optional[str] = None, with_plots: bool = False
|
59 |
+
) -> tuple[pd.DataFrame, dict[str, plt.Figure]]:
|
60 |
+
"""
|
61 |
+
If tag is None, primary="Overall" and group=all tags.
|
62 |
+
Otherwise primary=tag and group=tasks under that tag.
|
63 |
+
"""
|
64 |
+
data, tag_map = self._load()
|
65 |
+
cols = [
|
66 |
+
"Agent",
|
67 |
+
"Submitter",
|
68 |
+
"Completeness",
|
69 |
+
"LLM Base",
|
70 |
+
"Openness" ,
|
71 |
+
"Date",
|
72 |
+
"Logs",
|
73 |
+
]
|
74 |
+
|
75 |
+
# choose primary metric and its sub‐group
|
76 |
+
if tag is None:
|
77 |
+
primary = "Overall"
|
78 |
+
group = list(tag_map.keys())
|
79 |
+
else:
|
80 |
+
primary = tag
|
81 |
+
group = tag_map.get(tag, [])
|
82 |
+
data = data.sort_values(primary, ascending=False)
|
83 |
+
|
84 |
+
# build full metric list: primary + its cost + each member and its cost
|
85 |
+
metrics = [primary, f"{primary} cost"] + [
|
86 |
+
m for t in group for m in (t, f"{t} cost")
|
87 |
+
]
|
88 |
+
|
89 |
+
# filter to relevant columns
|
90 |
+
ci_cols = [f"{m} 95% CI" for m in metrics if f"{m} 95% CI" in data.columns]
|
91 |
+
df = data.loc[
|
92 |
+
:,
|
93 |
+
cols + [c for c in metrics if c in data.columns] + ci_cols,
|
94 |
+
].reset_index(drop=True)
|
95 |
+
|
96 |
+
plots: dict[str, plt.Figure] = {}
|
97 |
+
if with_plots:
|
98 |
+
avail = [c for c in metrics if c in df.columns]
|
99 |
+
for m in [primary] + group:
|
100 |
+
x, y = f"{m} cost", m
|
101 |
+
if x in df.columns and y in df.columns:
|
102 |
+
plots[f"scatter_{m}"] = _plot_scatter(
|
103 |
+
df, x=x, y=y, agent_col="Agent"
|
104 |
+
)
|
105 |
+
|
106 |
+
return df, plots
|
107 |
+
|
108 |
+
|
109 |
+
def _get_dataframe(
|
110 |
+
eval_results: datasets.DatasetDict,
|
111 |
+
split: str,
|
112 |
+
is_internal: bool,
|
113 |
+
suite_config: SuiteConfig,
|
114 |
+
timezone: str = "US/Pacific",
|
115 |
+
) -> pd.DataFrame:
|
116 |
+
"""
|
117 |
+
Load leaderboard results from the given dataset split and return a DataFrame.
|
118 |
+
"""
|
119 |
+
ds = eval_results.get(split)
|
120 |
+
if not ds:
|
121 |
+
cols = ["agent_name", "agent_description", "username", "submit_time"]
|
122 |
+
pretty = [_pretty_column_name(c) for c in cols]
|
123 |
+
empty = pd.DataFrame({c: ["No data"] for c in pretty})
|
124 |
+
return empty
|
125 |
+
|
126 |
+
cfg = suite_config
|
127 |
+
|
128 |
+
rows = []
|
129 |
+
for itm in ds:
|
130 |
+
ev = EvalResult.model_validate(itm)
|
131 |
+
sub = ev.submission
|
132 |
+
# only format if submit_time present, else leave as None
|
133 |
+
ts = sub.submit_time
|
134 |
+
if ts is not None:
|
135 |
+
date = ts.astimezone(ZoneInfo(timezone)).strftime("%Y-%m-%d")
|
136 |
+
else:
|
137 |
+
date = None
|
138 |
+
|
139 |
+
if not ev.results:
|
140 |
+
logger.warning(
|
141 |
+
f"Skipping submission {sub.agent_name} ({sub.username}) "
|
142 |
+
f"({sub.submit_time}) with no results"
|
143 |
+
)
|
144 |
+
continue
|
145 |
+
stats = compute_summary_statistics(
|
146 |
+
suite_config=cfg, split=split, results=ev.results
|
147 |
+
)
|
148 |
+
flat = {}
|
149 |
+
for key, s in stats.items():
|
150 |
+
parts = key.split("/")
|
151 |
+
if parts[0] == "overall":
|
152 |
+
flat["overall/score"], flat["overall/cost"] = s.score, s.cost
|
153 |
+
elif parts[0] == "tag":
|
154 |
+
flat[f"tag/{parts[1]}/score"], flat[f"tag/{parts[1]}/cost"] = (
|
155 |
+
s.score,
|
156 |
+
s.cost,
|
157 |
+
)
|
158 |
+
else: # task
|
159 |
+
t0 = parts[1]
|
160 |
+
# compute 95% CI half-width from stderr
|
161 |
+
flat.update(
|
162 |
+
{
|
163 |
+
f"task/{t0}/score": s.score,
|
164 |
+
f"task/{t0}/score_ci": (
|
165 |
+
(s.score_stderr * 1.96)
|
166 |
+
if s.score_stderr is not None
|
167 |
+
else np.nan
|
168 |
+
),
|
169 |
+
f"task/{t0}/cost": s.cost,
|
170 |
+
f"task/{t0}/cost_ci": (
|
171 |
+
(s.cost_stderr * 1.96)
|
172 |
+
if s.cost_stderr is not None
|
173 |
+
else np.nan
|
174 |
+
),
|
175 |
+
}
|
176 |
+
)
|
177 |
+
|
178 |
+
rows.append(
|
179 |
+
{
|
180 |
+
"agent_name": sub.agent_name,
|
181 |
+
"username": sub.username or "",
|
182 |
+
"submit_time": date,
|
183 |
+
**flat,
|
184 |
+
"logs_url": sub.logs_url if is_internal else sub.logs_url_public,
|
185 |
+
}
|
186 |
+
)
|
187 |
+
|
188 |
+
df = pd.DataFrame(rows)
|
189 |
+
|
190 |
+
# prepare pretty column mapping
|
191 |
+
pretty_cols = {c: _pretty_column_name(c) for c in df.columns}
|
192 |
+
|
193 |
+
# construct overview table with human-friendly names
|
194 |
+
overview = df.rename(columns=pretty_cols)
|
195 |
+
|
196 |
+
return overview
|
197 |
+
|
198 |
+
|
199 |
+
def _pretty_column_name(col: str) -> str:
|
200 |
+
"""Map raw column name to display name."""
|
201 |
+
# fixed mappings
|
202 |
+
mapping = {
|
203 |
+
"submit_time": "Date",
|
204 |
+
"agent_name": "Agent",
|
205 |
+
"username": "User/organization",
|
206 |
+
"logs_url": "Logs",
|
207 |
+
"overall/score": "Score",
|
208 |
+
"overall/cost": "Cost (USD)",
|
209 |
+
}
|
210 |
+
if col in mapping:
|
211 |
+
return mapping[col]
|
212 |
+
# dynamic: task/{name}/{metric} or tag/{name}/{metric}
|
213 |
+
parts = col.split("/")
|
214 |
+
if len(parts) == 3:
|
215 |
+
_, name, metric = parts
|
216 |
+
if metric == "score":
|
217 |
+
return name
|
218 |
+
if metric == "cost":
|
219 |
+
return f"{name} cost"
|
220 |
+
if metric == "score_ci":
|
221 |
+
return f"{name} 95% CI"
|
222 |
+
if metric == "cost_ci":
|
223 |
+
return f"{name} cost 95% CI"
|
224 |
+
# fallback to last segment
|
225 |
+
return parts[-1]
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
def _plot_scatter(
|
230 |
+
data: pd.DataFrame,
|
231 |
+
x: str, # Cost column name (e.g., "Overall cost")
|
232 |
+
y: str, # Score column name (e.g., "Overall score")
|
233 |
+
agent_col: str,
|
234 |
+
) -> plt.Figure:
|
235 |
+
"""Scatter plot of agent results, showing score vs cost with Pareto frontier."""
|
236 |
+
fig, ax = plt.subplots(figsize=(20,7))
|
237 |
+
|
238 |
+
# Make a copy for manipulation to find frontier without affecting original data
|
239 |
+
plot_data = data.copy()
|
240 |
+
|
241 |
+
# Ensure score (y) and cost (x) are numeric and drop NaNs for frontier calculation
|
242 |
+
plot_data[y] = pd.to_numeric(plot_data[y], errors='coerce')
|
243 |
+
plot_data[x] = pd.to_numeric(plot_data[x], errors='coerce')
|
244 |
+
frontier_data = plot_data.dropna(subset=[y, x])
|
245 |
+
|
246 |
+
if not frontier_data.empty:
|
247 |
+
# Sort by cost (x) ascending, then by score (y) descending for tie-breaking
|
248 |
+
frontier_data = frontier_data.sort_values(by=[x, y], ascending=[True, False])
|
249 |
+
|
250 |
+
pareto_points = []
|
251 |
+
max_score_at_cost = -np.inf # Initialize with negative infinity
|
252 |
+
|
253 |
+
for index, row in frontier_data.iterrows():
|
254 |
+
current_score = row[y]
|
255 |
+
current_cost = row[x]
|
256 |
+
# Only add point if it offers a higher score than any previous point
|
257 |
+
# on the frontier with less or equal cost (implicit by sorting).
|
258 |
+
# More strictly, for a point to be on the frontier here, it must improve the score.
|
259 |
+
if current_score > max_score_at_cost:
|
260 |
+
# Optional: If allowing same score but lower cost (already handled by sort somewhat)
|
261 |
+
# you might need to check if a point with same score but lower cost exists
|
262 |
+
# For this algorithm, we simply take points that strictly increase score.
|
263 |
+
pareto_points.append(row)
|
264 |
+
max_score_at_cost = current_score
|
265 |
+
|
266 |
+
if pareto_points:
|
267 |
+
pareto_df = pd.DataFrame(pareto_points)
|
268 |
+
# Sort pareto_df by cost again just to be sure for plotting line
|
269 |
+
pareto_df = pareto_df.sort_values(by=x)
|
270 |
+
# Plot the Pareto frontier line
|
271 |
+
ax.plot(pareto_df[x], pareto_df[y], marker='o', linestyle='-', color='red', alpha=0.7, linewidth=2, markersize=5, label='Pareto Frontier')
|
272 |
+
|
273 |
+
# Plot all data points
|
274 |
+
sns.scatterplot(data=data, x=x, y=y, hue=agent_col, s=100, ax=ax, legend="auto")
|
275 |
+
|
276 |
+
# Error bars (if they exist)
|
277 |
+
x_ci_col = f"{x} 95% CI"
|
278 |
+
y_ci_col = f"{y} 95% CI"
|
279 |
+
if x_ci_col in data.columns or y_ci_col in data.columns:
|
280 |
+
# Filter data for error bars to only include rows present in the original 'data'
|
281 |
+
# This is important if 'frontier_data' subset was used for some logic but error bars are for all.
|
282 |
+
error_bar_data = data.copy() # Use original data for error bars
|
283 |
+
error_bar_data[x_ci_col] = pd.to_numeric(error_bar_data.get(x_ci_col), errors='coerce')
|
284 |
+
error_bar_data[y_ci_col] = pd.to_numeric(error_bar_data.get(y_ci_col), errors='coerce')
|
285 |
+
|
286 |
+
ax.errorbar(
|
287 |
+
x=error_bar_data[x], # Use original data's x
|
288 |
+
y=error_bar_data[y], # Use original data's y
|
289 |
+
xerr=error_bar_data.get(x_ci_col),
|
290 |
+
yerr=error_bar_data.get(y_ci_col),
|
291 |
+
fmt="none",
|
292 |
+
ecolor="gray",
|
293 |
+
alpha=0.5,
|
294 |
+
capsize=3,
|
295 |
+
zorder=0 # Draw error bars behind scatter points
|
296 |
+
)
|
297 |
+
|
298 |
+
ax.set_xlim(left=0)
|
299 |
+
ax.set_ylim(bottom=0) # Scores and costs are typically non-negative
|
300 |
+
ax.set_xlabel(x) # x is cost
|
301 |
+
ax.set_ylabel(y) # y is score
|
302 |
+
|
303 |
+
# Adjust legend: Get handles and labels from seaborn plot, then add frontier's
|
304 |
+
handles, labels = ax.get_legend_handles_labels()
|
305 |
+
# Check if "Pareto Frontier" was actually plotted and add its handle/label if so
|
306 |
+
if pareto_points and "Pareto Frontier" not in labels: # Avoid duplicate legend items
|
307 |
+
# Find the frontier line object to get its handle
|
308 |
+
frontier_line = next((line for line in ax.get_lines() if line.get_label() == 'Pareto Frontier'), None)
|
309 |
+
if frontier_line:
|
310 |
+
handles.append(frontier_line)
|
311 |
+
labels.append('Pareto Frontier')
|
312 |
+
|
313 |
+
ax.legend(handles=handles, labels=labels, title=agent_col, bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
|
314 |
+
|
315 |
+
plt.tight_layout(rect=[0, 0, 0.85, 1])
|
316 |
+
return fig
|
317 |
+
|
318 |
+
|
319 |
+
__all__ = ["LeaderboardViewer"]
|
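The Pareto-frontier selection used in `_plot_scatter` (and the "Efficiency Frontier" trace in `leaderboard_transformer.py`) is easiest to see on a toy set of (cost, score) points. A standalone sketch of the same sort-then-scan rule, with made-up numbers:

```python
# Sort by cost ascending (score descending to break ties), then keep any point
# whose score beats every cheaper point seen so far. Numbers are invented.
points = [(5.0, 0.70), (1.0, 0.40), (2.0, 0.55), (2.0, 0.50), (4.0, 0.52)]

frontier = []
best_score = float("-inf")
for cost, score in sorted(points, key=lambda p: (p[0], -p[1])):
    if score > best_score:
        frontier.append((cost, score))
        best_score = score

print(frontier)  # [(1.0, 0.40), (2.0, 0.55), (5.0, 0.70)]
```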
literature_understanding.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import pandas as pd

# Import our UI factories and the data loader
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data

# Define the category for this page
CATEGORY_NAME = "Literature Understanding"

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")

    # --- This page now has two main sections: Validation and Test ---
    with gr.Tabs():
        with gr.Tab("Results: Validation"):
            # 1. Load all necessary data for the "validation" split ONCE.
            validation_df, validation_tag_map = get_full_leaderboard_data("validation")

            if not validation_df.empty:
                # 2. Render the main category display using the loaded data.
                create_leaderboard_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="validation"
                )

                # 3. Render the detailed breakdown for each benchmark in the category.
                create_benchmark_details_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for validation split.")

        with gr.Tab("Results: Test"):
            # Repeat the process for the "test" split
            test_df, test_tag_map = get_full_leaderboard_data("test")

            if not test_df.empty:
                create_leaderboard_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME,
                    split_name="test"
                )
                create_benchmark_details_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME
                )
            else:
                gr.Markdown("No data available for test split.")
main_page.py
ADDED
@@ -0,0 +1,375 @@
import matplotlib
matplotlib.use('Agg')

import os
import shutil
import tarfile
import tempfile
from datetime import datetime, timedelta, timezone
from email.utils import parseaddr
from pathlib import Path
# from zoneinfo import ZoneInfo  # LeaderboardViewer uses this, ensure it's available

import gradio as gr
import requests
from agenteval import (
    # compute_summary_statistics,  # This will now be used by LeaderboardViewer
    process_eval_logs,
    upload_folder_to_hf,
    upload_summary_to_hf,
)
from agenteval.models import EvalResult  # Used by submission and LeaderboardViewer (implicitly)
from agenteval.leaderboard.upload import sanitize_path_component
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset  # load_dataset used by LV
from datasets.data_files import EmptyDatasetError
from huggingface_hub import HfApi

from ui_components import create_leaderboard_display, get_full_leaderboard_data

from content import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    SUBMISSION_TEXT,
    INTRO_PARAGRAPH,
    SCATTER_DISCLAIMER,
    format_error,
    format_log,
    format_warning,
)

# --- Constants and Configuration ---
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
CONFIG_NAME = "1.0.0-dev1"  # This corresponds to 'config' in LeaderboardViewer
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"

OWNER = "allenai"
PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results"  # This is the repo_id for LeaderboardViewer
LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"

if LOCAL_DEBUG:
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
else:
    DATA_DIR = "/home/user/data/" + CONFIG_NAME
EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")

api = HfApi()
MAX_UPLOAD_BYTES = 100 * 1024**2
AGENTEVAL_MANIFEST_NAME = "agenteval.json"
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)

# --- Global State for Viewers (simple caching) ---
CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}

# --- Submission Logic (largely unchanged from original, ensure EvalResult and other deps are fine) ---
def try_load_dataset_submission(*args, **kwargs) -> DatasetDict:  # Renamed to avoid conflict if LV has one
    try:
        return load_dataset(*args, **kwargs)
    except EmptyDatasetError:
        return DatasetDict()
    except ValueError:  # Handles cases where dataset is empty or ill-formed
        return DatasetDict()

def checked_upload_folder(
    api_hf: HfApi,  # Renamed to avoid conflict with global api
    folder_path: str,
    repo_id: str,
    config_name_ul: str,  # Renamed
    split_ul: str,  # Renamed
    submission_name_ul: str,  # Renamed
) -> str:
    total = 0
    for root, _, files in os.walk(folder_path):
        for f_ul in files:  # Renamed
            total += os.path.getsize(os.path.join(root, f_ul))
    if total > MAX_UPLOAD_BYTES:
        raise ValueError(
            f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
        )
    return upload_folder_to_hf(
        api=api_hf,  # Use renamed parameter
        folder_path=folder_path,
        repo_id=repo_id,
        config_name=config_name_ul,
        split=split_ul,
        submission_name=submission_name_ul,
    )

def add_new_eval(
    val_or_test: str,
    agent_name: str | None,
    agent_description: str,
    agent_url: str,
    openness: str | None,
    degree_of_control: str | None,
    path_to_file: tempfile._TemporaryFileWrapper | None,
    username: str,
    mail: str,
    profile: gr.OAuthProfile,
    # We need global eval_results for checks; this might need rethinking if it's purely display driven now
    # For now, let's assume we still load it for submission checks
):
    # Load current eval_results for submission checks
    # This is a bit redundant if display part reloads it, but submission needs its own consistent view
    current_eval_results_for_submission = try_load_dataset_submission(
        RESULTS_DATASET,
        CONFIG_NAME,
        download_mode="force_redownload",  # Or a less aggressive mode
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    if not agent_name:
        return format_warning("Please provide an agent name.")

    submission_time = datetime.now(timezone.utc)
    if not username or username.strip() == "":
        username = profile.username  # Default to HF username

    # User account age check
    try:
        user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
        user_data_resp.raise_for_status()
        creation_date_str = user_data_resp.json()["createdAt"]
        created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
        if submission_time - created_at < timedelta(days=60):
            return format_error("This account is not authorized to submit here (account too new).")
    except Exception as e:
        print(f"Error checking user account age: {e}")
        return format_error("Could not verify account age. Please try again later.")

    # Submission frequency check
    contact_infos = try_load_dataset_submission(
        CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
    )
    user_submission_dates = sorted(
        datetime.fromisoformat(row["submit_time"])
        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
    )
    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
        return format_error("You already submitted once in the last 24h for this split; please try again later.")

    # Email validation
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    # Duplicate submission check
    if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
        existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
        for sub_item in existing_submissions:
            if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
                    sub_item.get("username", "").lower() == username.lower()):
                return format_warning("This agent name by this user has already been submitted to this split.")

    if path_to_file is None:
        return format_warning("Please attach a .tar.gz file.")

    safe_username = sanitize_path_component(username)
    safe_agent_name = sanitize_path_component(agent_name)
    extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")

    # File extraction
    if not LOCAL_DEBUG:
        try:
            if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
            os.makedirs(extracted_dir, exist_ok=True)
            with tarfile.open(path_to_file.name, "r:gz") as tar:
                members_extracted = 0
                for member in tar.getmembers():
                    if not member.isreg(): continue
                    fname = os.path.basename(member.name)
                    if not fname or fname.startswith("."): continue
                    fobj = tar.extractfile(member)
                    if not fobj: continue
                    with open(os.path.join(extracted_dir, fname), "wb") as out:
                        out.write(fobj.read())
                    members_extracted += 1
            if members_extracted == 0:
                return format_error("Submission tarball is empty or contains no valid files.")
        except Exception as e:
            return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
    else: print("mock extracted file", flush=True)


    submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"

    # 1. Upload raw (unscored) submission files
    if not LOCAL_DEBUG:
        try:
            checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
        except ValueError as e: return format_error(str(e))
        except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
    else: print("mock uploaded raw submission", flush=True)

    # 2. Save contact information
    contact_info = {
        "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
        "username": username, "username_auth": profile.username, "mail": mail,
        "submit_time": submission_time.isoformat(),
    }
    if val_or_test in contact_infos:
        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    else:
        contact_infos[val_or_test] = Dataset.from_list([contact_info])

    if not LOCAL_DEBUG:
        try:
            contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
        except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
    else: print("mock uploaded contact info", flush=True)


    # 3. Process and score the submission
    eval_result_obj = None  # Define to avoid NameError
    try:
        json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
        if not json_path.exists():
            return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")

        eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
        if eval_result_obj.suite_config.version != CONFIG_NAME:
            return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
        if eval_result_obj.split != val_or_test:
            return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")

        # Re-compute results from logs for integrity
        eval_result_obj.results = process_eval_logs(extracted_dir)[0]  # Assuming process_eval_logs returns a tuple/list
        eval_result_obj.save_json(str(json_path))  # Save the re-processed manifest

    except Exception as e:
        return format_error(f"Error scoring submission: {e}. Check manifest and log files.")

    # 4. Upload scored submission files
    logs_url_private_val, logs_url_public_val = None, None
    scored_submission_name = f"{submission_name}_scored"
    if not LOCAL_DEBUG:
        try:
            logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
            if val_or_test == "validation" and not IS_INTERNAL:  # Public copy for validation
                logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
        except ValueError as e: return format_error(str(e))
        except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
    else: print("mock uploaded scored submission", flush=True)


    # Update EvalResult with submission details
    eval_result_obj.submission.agent_name = agent_name
    eval_result_obj.submission.agent_description = agent_description
    eval_result_obj.submission.agent_url = agent_url
    eval_result_obj.submission.openness = openness
    eval_result_obj.submission.degree_of_control = degree_of_control
    eval_result_obj.submission.username = username
    eval_result_obj.submission.submit_time = submission_time
    eval_result_obj.submission.logs_url = logs_url_private_val
    eval_result_obj.submission.logs_url_public = logs_url_public_val

    # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
    if not LOCAL_DEBUG:
        try:
            upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
        except Exception as e:
            return format_error(f"Failed to upload summary results to leaderboard: {e}")
    else: print("mock uploaded results to lb", flush=True)

    # Invalidate viewer cache for the split that was updated
    if val_or_test in CACHED_VIEWERS:
        del CACHED_VIEWERS[val_or_test]
    if val_or_test in CACHED_TAG_MAPS:
        del CACHED_TAG_MAPS[val_or_test]


    return format_log(
        f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
        "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
    )

with gr.Blocks() as demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")

    # --- Submission Accordion ---
    with gr.Accordion("🚀 Submit a new agent for evaluation", open=False, elem_classes="submission-accordion"):
        gr.Markdown(SUBMISSION_TEXT, elem_id="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
                agent_name_tb = gr.Textbox(label="Agent Name")
                agent_desc_tb = gr.Textbox(label="Agent Description")
                agent_url_tb = gr.Textbox(label="URL to Agent Information")
                openness_radio = gr.Radio(["Open Source", "API", "UI"], value=None, label="Openness of Agent")
                degree_of_control_radio = gr.Radio(["Standard", "Custom"], value=None, label="Degree of Control")
            with gr.Column():
                username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
                mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
                file_upload_comp = gr.File(
                    label="Submission File (.tar.gz ...)",  # Shortened for brevity
                    file_types=[".gz", ".tar.gz"]
                )
        with gr.Row():
            gr.LoginButton()
            submit_eval_button = gr.Button("Submit Evaluation")
        submission_result = gr.Markdown()

        submit_eval_button.click(
            add_new_eval,
            [
                level_of_test_radio,
                agent_name_tb,
                agent_desc_tb,
                agent_url_tb,
                openness_radio,
                degree_of_control_radio,
                file_upload_comp,
                username_tb,
                mail_tb
            ],
            submission_result,
        )

    # --- Leaderboard Display Section ---
    gr.Markdown("---")
    CATEGORY_NAME = "Overall"
    gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")

    with gr.Tabs() as tabs:
        with gr.Tab("Results: Validation"):
            # 1. Load all necessary data for the "validation" split ONCE.
            validation_df, validation_tag_map = get_full_leaderboard_data("validation")

            # Check if data was loaded successfully before trying to display it
            if not validation_df.empty:
                # 2. Render the display by calling the factory with the loaded data.
                create_leaderboard_display(
                    full_df=validation_df,
                    tag_map=validation_tag_map,
                    category_name=CATEGORY_NAME,  # Use our constant
                    split_name="validation"
                )
            else:
                # Display a message if no data is available
                gr.Markdown("No data available for validation split.")

        with gr.Tab("Results: Test"):
            test_df, test_tag_map = get_full_leaderboard_data("test")
            if not test_df.empty:
                create_leaderboard_display(
                    full_df=test_df,
                    tag_map=test_tag_map,
                    category_name=CATEGORY_NAME,  # Use our constant
                    split_name="test"
                )
            else:
                gr.Markdown("No data available for test split.")

    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)


if __name__ == "__main__":
    demo.launch()
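Since add_new_eval() only keeps regular, non-hidden files at the archive root (it drops directory paths via os.path.basename) and requires the agenteval.json manifest, a submission tarball is expected to be flat. Below is a small packaging sketch; it is not part of the commit, and the function and file names are illustrative.

# Minimal sketch of packaging a submission the way add_new_eval() expects it:
# a flat .tar.gz whose members include agenteval.json plus the eval log files.
import tarfile
from pathlib import Path

def build_submission_tarball(results_dir: str, out_path: str = "submission.tar.gz") -> str:
    """Pack every regular, non-hidden file in `results_dir` at the archive root."""
    with tarfile.open(out_path, "w:gz") as tar:
        for f in Path(results_dir).iterdir():
            if f.is_file() and not f.name.startswith("."):
                tar.add(f, arcname=f.name)  # flat layout: directory paths are ignored on the server side
    return out_path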
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 datasets
-gradio[oauth]
+gradio[oauth]==5.30.0
 huggingface-hub
 APScheduler
-agent-eval
+agent-eval==0.1.9
ui_components.py
ADDED
@@ -0,0 +1,293 @@
import gradio as gr
from gradio.events import SelectData
import pandas as pd
import plotly.graph_objects as go
import os

from agenteval.leaderboard.view import LeaderboardViewer
from huggingface_hub import HfApi

from leaderboard_transformer import DataTransformer, transform_raw_dataframe, create_pretty_tag_map, INFORMAL_TO_FORMAL_NAME_MAP, _plot_scatter_plotly, format_cost_column, format_score_column
from content import (
    SCATTER_DISCLAIMER,
    format_error,
    format_log,
    format_warning,
    hf_uri_to_web_url,
    hyperlink,
)

# --- Constants and Configuration ---
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
CONFIG_NAME = "1.0.0-dev1"  # This corresponds to 'config' in LeaderboardViewer
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"

OWNER = "allenai"
PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results"  # This is the repo_id for LeaderboardViewer
LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"

if LOCAL_DEBUG:
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
else:
    DATA_DIR = "/home/user/data/" + CONFIG_NAME
EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")

api = HfApi()
MAX_UPLOAD_BYTES = 100 * 1024**2
AGENTEVAL_MANIFEST_NAME = "agenteval.json"
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)


# --- Global State for Viewers (simple caching) ---
CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}

# --- New Helper Class to Solve the Type Mismatch Bug ---
class DummyViewer:
    """A mock viewer to be cached on error. It has a ._load() method
    to ensure it behaves like the real LeaderboardViewer."""
    def __init__(self, error_df):
        self._error_df = error_df

    def _load(self):
        # The _load method returns the error DataFrame and an empty tag map
        return self._error_df, {}

def get_leaderboard_viewer_instance(split: str):
    """
    Fetches the LeaderboardViewer for a split, using a cache to avoid
    re-downloading data. On error, returns a stable DummyViewer object.
    """
    global CACHED_VIEWERS, CACHED_TAG_MAPS

    if split in CACHED_VIEWERS:
        # Cache hit: return the cached viewer and tag map
        return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})

    # --- Cache miss: try to load data from the source ---
    try:
        print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
        viewer = LeaderboardViewer(
            repo_id=RESULTS_DATASET,
            config=CONFIG_NAME,
            split=split,
            is_internal=IS_INTERNAL
        )

        # Simplify tag map creation
        pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)

        # Cache the results for next time
        CACHED_VIEWERS[split] = viewer
        CACHED_TAG_MAPS[split] = pretty_tag_map  # Cache the pretty map directly

        return viewer, pretty_tag_map

    except Exception as e:
        # On ANY error, create a consistent error message and cache a DummyViewer
        error_message = f"Error loading data for split '{split}': {e}"
        print(format_error(error_message))

        dummy_df = pd.DataFrame({"Message": [error_message]})
        dummy_viewer = DummyViewer(dummy_df)
        dummy_tag_map = {"Overall": []}

        # Cache the dummy objects so we don't try to fetch again on this run
        CACHED_VIEWERS[split] = dummy_viewer
        CACHED_TAG_MAPS[split] = dummy_tag_map

        return dummy_viewer, dummy_tag_map


def create_leaderboard_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str,
    split_name: str
):
    """
    This UI factory takes pre-loaded data and renders the main DataFrame and Plot
    for a given category (e.g., "Overall" or "Literature Understanding").
    """
    # 1. Instantiate the transformer and get the specific view for this category.
    # The function no longer loads data itself; it filters the data it receives.
    transformer = DataTransformer(full_df, tag_map)
    df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
    # Format cost columns
    for col in df_view.columns:
        if "Cost" in col:
            df_view = format_cost_column(df_view, col)

    # 2. Fill NaN scores with 0
    for col in df_view.columns:
        if "Score" in col:
            df_view = format_score_column(df_view, col)
    scatter_plot = plots_dict.get('scatter_plot', go.Figure())

    # 3. Define the UI components with the filtered data.
    df_headers = df_view.columns.tolist()
    df_datatypes = ["markdown" if col == "Logs" or "Cost" in col or "Score" in col else "str" for col in df_headers]

    dataframe_component = gr.DataFrame(
        headers=df_headers,
        value=df_view,
        datatype=df_datatypes,
        interactive=False,
        wrap=True,
        column_widths=[100, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 75, 75, 50, 50]
    )

    plot_component = gr.Plot(
        value=scatter_plot,
        label=f"Score vs. Cost ({category_name})"
    )
    gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")

    # Return the components so they can be referenced elsewhere.
    return dataframe_component, plot_component

def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
    """
    Loads and transforms the complete dataset for a given split.
    This function handles caching and returns the final "pretty" DataFrame and tag map.
    """
    # This reuses the existing caching logic
    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)

    if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
        raw_df, _ = viewer_or_data._load()
        if raw_df.empty:
            return pd.DataFrame(), {}

        pretty_df = transform_raw_dataframe(raw_df)
        pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
        if "Logs" in pretty_df.columns:
            def format_log_entry_to_html(raw_uri):
                if pd.isna(raw_uri) or raw_uri == "": return ""
                web_url = hf_uri_to_web_url(str(raw_uri))
                return hyperlink(web_url, "🔗") if web_url else ""

            # Apply the function to the "Logs" column
            pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)

        return pretty_df, pretty_tag_map

    # Fallback for unexpected types
    return pd.DataFrame(), {}

# --- Detailed Benchmark Display ---
def create_benchmark_details_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str
):
    """
    Generates a detailed breakdown for each benchmark within a given category.
    For each benchmark, it creates a title, a filtered table, and a scatter plot.
    Args:
        full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
        tag_map (dict): The "pretty" tag map to find the list of benchmarks.
        category_name (str): The main category to display details for (e.g., "Literature Understanding").
    """
    # 1. Get the list of benchmarks for the selected category
    benchmark_names = tag_map.get(category_name, [])

    if not benchmark_names:
        gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
        return

    gr.Markdown("---")
    gr.Markdown("## Detailed Benchmark Results")

    # 2. Loop through each benchmark and create its UI components
    for benchmark_name in benchmark_names:
        with gr.Blocks():
            gr.Markdown(f"### {benchmark_name}")

            # 3. Prepare the data for this specific benchmark's table and plot
            benchmark_score_col = f"{benchmark_name} Score"
            benchmark_cost_col = f"{benchmark_name} Cost"

            # Define the columns needed for the detailed table
            table_cols = ['Agent', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col, 'Logs']

            # Filter to only columns that actually exist in the full dataframe
            existing_table_cols = [col for col in table_cols if col in full_df.columns]

            if benchmark_score_col not in existing_table_cols:
                gr.Markdown(f"Score data for {benchmark_name} not available.")
                continue  # Skip to the next benchmark if score is missing

            # Create a specific DataFrame for the table view
            benchmark_table_df = full_df[existing_table_cols].copy()
            # Calculate and add the "Attempted Benchmark" column
            def check_benchmark_status(row):
                has_score = pd.notna(row.get(benchmark_score_col))
                has_cost = pd.notna(row.get(benchmark_cost_col))

                if has_score and has_cost:
                    return "✅"
                if has_score or has_cost:
                    return "⚠️"
                return "🚫"

            # Apply the function to create the new column
            benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
            # Sort the DataFrame
            if benchmark_score_col in benchmark_table_df.columns:
                benchmark_table_df = benchmark_table_df.sort_values(
                    by=benchmark_score_col, ascending=False, na_position='last'
                )
            # Format the cost and score columns
            benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
            benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
            desired_cols_in_order = [
                'Agent',
                'Submitter',
                'Attempted Benchmark',
                benchmark_score_col,
                benchmark_cost_col,
                'Openness',
                'Degree of Control',
                'Date',
                'Logs'
            ]
            for col in desired_cols_in_order:
                if col not in benchmark_table_df.columns:
                    benchmark_table_df[col] = pd.NA  # Add as an empty column
            benchmark_table_df = benchmark_table_df[desired_cols_in_order]
            # Rename columns for a cleaner table display, as requested
            benchmark_table_df.rename(columns={
                benchmark_score_col: 'Score',
                benchmark_cost_col: 'Cost'
            }, inplace=True)
            # Ensure the 'Logs', 'Cost', and 'Score' columns render as markdown
            table_headers = benchmark_table_df.columns.tolist()
            df_datatypes = [
                "markdown" if col in ["Logs", "Cost", "Score"] else "str"
                for col in table_headers
            ]

            # Create the Gradio component, now with the correct datatypes
            gr.DataFrame(
                value=benchmark_table_df,
                datatype=df_datatypes,
                interactive=False,
                wrap=True,
            )

            # Create the scatter plot using the full data for context, but plotting benchmark metrics.
            # This shows all agents on the same axes for better comparison.
            benchmark_plot = _plot_scatter_plotly(
                data=full_df,
                x=benchmark_cost_col,
                y=benchmark_score_col,
                agent_col="Agent"
            )
            gr.Plot(value=benchmark_plot)
            gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
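Because get_full_leaderboard_data() returns a plain pandas DataFrame and tag map, it can also be reused outside the Gradio UI. A small usage sketch (an assumption: run in the same environment as the Space, with the results dataset reachable or LOCAL_DEBUG data present; the output filename is illustrative):

# Export the "pretty" validation table for offline analysis.
from ui_components import get_full_leaderboard_data

if __name__ == "__main__":
    df, tag_map = get_full_leaderboard_data("validation")
    if df.empty:
        print("No leaderboard data available (or the results dataset could not be loaded).")
    else:
        print(f"Categories: {list(tag_map)}")
        df.to_csv("validation_leaderboard.csv", index=False)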