xhluca
committed on
Commit
·
99e2870
1
Parent(s):
331ed33
Simplify table
Browse files
demo.py
CHANGED
|
@@ -484,95 +484,94 @@ base_screenshot_dir = Path(base_screenshot_dir)
|
|
| 484 |
|
| 485 |
hl_action_parser = _build_highlevel_action_parser()
|
| 486 |
|
| 487 |
-
with gr.Blocks(title="AgentRewardBench Demo") as demo
|
| 488 |
gr.Markdown(
|
| 489 |
"""
|
| 490 |
-
# AgentRewardBench Demo
|
| 491 |
-
| [**🤗Dataset**](https://huggingface.co/datasets/McGill-NLP/agent-reward-bench) | **📄Paper (TBA)** | [**🌐Website**](https://agent-reward-bench.github.io) | [**🏆Leaderboard**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-leaderboard) | [**💻Demo**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-demo)
|
| 492 |
-
| :--: | :--: | :--: | :--: | :--: |
|
| 493 |
"""
|
| 494 |
)
|
| 495 |
-
with gr.
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
agents = list_agents(base_traj_dir, benchmark_default)
|
| 502 |
-
model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
| 503 |
-
|
| 504 |
-
task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
|
| 505 |
-
task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
|
| 506 |
-
|
| 507 |
-
@benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
|
| 508 |
-
def update_agents(benchmark):
|
| 509 |
-
agents = list_agents(base_traj_dir, benchmark)
|
| 510 |
-
return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
| 511 |
-
|
| 512 |
-
@model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
|
| 513 |
-
def update_task_ids(benchmark, agent):
|
| 514 |
-
task_ids = list_task_ids(base_traj_dir, benchmark, agent)
|
| 515 |
-
|
| 516 |
-
return gr.Dropdown(choices=task_ids, value=task_ids[0])
|
| 517 |
-
|
| 518 |
-
with gr.Column(scale=8):
|
| 519 |
-
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
| 520 |
-
def render_trajectory(benchmark, agent, task_id):
|
| 521 |
-
traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
|
| 522 |
-
with open(traj_path, "rb") as f:
|
| 523 |
-
traj = orjson.loads(f.read())
|
| 524 |
-
|
| 525 |
-
goal = replace_string_content(traj["goal"])
|
| 526 |
|
| 527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
|
|
|
| 534 |
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
im = Image.open(screenshot_path)
|
| 538 |
-
im = apply_overlay_to_image(
|
| 539 |
-
im, step, highlevel_action_parser=hl_action_parser
|
| 540 |
)
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
if reasoning is not None:
|
| 544 |
-
gr.Textbox(reasoning, label="Reasoning", lines=4)
|
| 545 |
-
if action is not None:
|
| 546 |
-
gr.Textbox(action, label="Action", lines=2)
|
| 547 |
-
|
| 548 |
-
# multi-choices dropdown for judges
|
| 549 |
-
judge_dd = gr.Dropdown(
|
| 550 |
-
label="Judges",
|
| 551 |
-
choices=list(judges_dict.values()),
|
| 552 |
-
multiselect=True,
|
| 553 |
-
value=default_judges,
|
| 554 |
-
)
|
| 555 |
-
|
| 556 |
-
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
| 557 |
-
def render_judge(benchmark, agent, task_id, judge_choices):
|
| 558 |
-
# load judgments
|
| 559 |
-
for judge in judges_dict.values():
|
| 560 |
-
if judge not in judge_choices:
|
| 561 |
-
continue
|
| 562 |
-
|
| 563 |
-
judgment_path = get_judgment_path(
|
| 564 |
-
base_judgments_dir, benchmark, agent, judge, task_id
|
| 565 |
-
)
|
| 566 |
-
if not judgment_path.exists():
|
| 567 |
-
continue
|
| 568 |
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
|
| 576 |
-
|
| 577 |
|
| 578 |
demo.launch()
|
|
|
|
| 484 |
|
| 485 |
hl_action_parser = _build_highlevel_action_parser()
|
| 486 |
|
| 487 |
+
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
| 488 |
gr.Markdown(
|
| 489 |
"""
|
| 490 |
+
# AgentRewardBench Demo ([Website](https://agent-reward-bench.github.io))
|
|
|
|
|
|
|
| 491 |
"""
|
| 492 |
)
|
| 493 |
+
with gr.Row():
|
| 494 |
+
with gr.Column(scale=4):
|
| 495 |
+
benchmark_default = "WebArena"
|
| 496 |
+
benchmark_dd = gr.Dropdown(
|
| 497 |
+
label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
|
| 498 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
+
agents = list_agents(base_traj_dir, benchmark_default)
|
| 501 |
+
model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
| 502 |
+
|
| 503 |
+
task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
|
| 504 |
+
task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
|
| 505 |
+
|
| 506 |
+
@benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
|
| 507 |
+
def update_agents(benchmark):
|
| 508 |
+
agents = list_agents(base_traj_dir, benchmark)
|
| 509 |
+
return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
| 510 |
+
|
| 511 |
+
@model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
|
| 512 |
+
def update_task_ids(benchmark, agent):
|
| 513 |
+
task_ids = list_task_ids(base_traj_dir, benchmark, agent)
|
| 514 |
+
|
| 515 |
+
return gr.Dropdown(choices=task_ids, value=task_ids[0])
|
| 516 |
+
|
| 517 |
+
with gr.Column(scale=8):
|
| 518 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
| 519 |
+
def render_trajectory(benchmark, agent, task_id):
|
| 520 |
+
traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
|
| 521 |
+
with open(traj_path, "rb") as f:
|
| 522 |
+
traj = orjson.loads(f.read())
|
| 523 |
+
|
| 524 |
+
goal = replace_string_content(traj["goal"])
|
| 525 |
+
|
| 526 |
+
gr.Textbox(label="Goal", value=goal, visible=True)
|
| 527 |
+
|
| 528 |
+
for step in traj["steps"]:
|
| 529 |
+
num = step["num"]
|
| 530 |
+
action = step["action"]
|
| 531 |
+
reasoning = step["reasoning"]
|
| 532 |
+
screenshot_path = step["screenshot_path"]
|
| 533 |
+
|
| 534 |
+
gr.Markdown(f"# Step {num}")
|
| 535 |
+
with gr.Group():
|
| 536 |
+
im = Image.open(screenshot_path)
|
| 537 |
+
im = apply_overlay_to_image(
|
| 538 |
+
im, step, highlevel_action_parser=hl_action_parser
|
| 539 |
+
)
|
| 540 |
+
format_ = "webp" if im.format is None else im.format
|
| 541 |
+
gr.Image(im, label="Screenshot", format=format_)
|
| 542 |
+
if reasoning is not None:
|
| 543 |
+
gr.Textbox(reasoning, label="Reasoning", lines=4)
|
| 544 |
+
if action is not None:
|
| 545 |
+
gr.Textbox(action, label="Action", lines=2)
|
| 546 |
+
|
| 547 |
+
# multi-choices dropdown for judges
|
| 548 |
+
judge_dd = gr.Dropdown(
|
| 549 |
+
label="Judges",
|
| 550 |
+
choices=list(judges_dict.values()),
|
| 551 |
+
multiselect=True,
|
| 552 |
+
value=default_judges,
|
| 553 |
+
)
|
| 554 |
|
| 555 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
| 556 |
+
def render_judge(benchmark, agent, task_id, judge_choices):
|
| 557 |
+
# load judgments
|
| 558 |
+
for judge in judges_dict.values():
|
| 559 |
+
if judge not in judge_choices:
|
| 560 |
+
continue
|
| 561 |
|
| 562 |
+
judgment_path = get_judgment_path(
|
| 563 |
+
base_judgments_dir, benchmark, agent, judge, task_id
|
|
|
|
|
|
|
|
|
|
| 564 |
)
|
| 565 |
+
if not judgment_path.exists():
|
| 566 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
+
with open(judgment_path, "rb") as f:
|
| 569 |
+
judgment = orjson.loads(f.read())
|
| 570 |
+
if judge == "Rule-based":
|
| 571 |
+
msg = get_message_from_rule_based(judgment)
|
| 572 |
+
else:
|
| 573 |
+
msg = get_message_from_judgment(judgment)
|
| 574 |
|
| 575 |
+
gr.Textbox(label=judge, value=msg, lines=4)
|
| 576 |
|
| 577 |
demo.launch()
|